闲来没事,找了个av网站,看了几个视频,然后想下载一下。结果发现多年不上这些av网站,现在的av网站播放的源文件已经不是avi或者mp4了,而是m3u8的播放列表。在firefox中可以使用Video DownloadHelper 来获取相应的下载地址,但是有的时候如果m3u8中包含的是播放列表,会无法获取下载链接。
于是就想着怎么直接下载文件,其实通过ffmpeg可以很方便的获取下载链接:
只需要下面的一行命令:
ffmpeg -protocol_whitelist "file,http,crypto,tcp,https,tls" -i https://videox11.ynkcq.com:8081/20200109/8Pr79HKk/600kb/hls/index.m3u8 -c copy out.mp4
事实证明,这个命令在mac os下是可以直接使用的,我也是后来才发现,下面的代码最开始是在windows下写的。但是在windows下执行这条命令会出现各种问题,网上看了一下貌似是和加密有关,对应的m3u8文件内容:
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:3
#EXT-X-MEDIA-SEQUENCE:0
#EXT-X-KEY:METHOD=AES-128,URI="key.key"
#EXTINF:2.085000,
NT6PfA2975000.ts
#EXTINF:2.085000,
NT6PfA2975001.ts
#EXTINF:2.085000,
NT6PfA2975002.ts
#EXTINF:2.085000,
NT6PfA2975003.ts
#EXTINF:2.085000,
NT6PfA2975004.ts
#EXTINF:2.085000,
NT6PfA2975005.ts
#EXTINF:2.085000,
NT6PfA2975006.ts
#EXTINF:2.085000,
NT6PfA2975007.ts
#EXTINF:2.085000,
NT6PfA2975008.ts
#EXTINF:2.085000,
NT6PfA2975009.ts
#EXTINF:2.085000,
NT6PfA2975010.ts
#EXTINF:2.085000,
NT6PfA2975011.ts
#EXTINF:2.085000,
NT6PfA2975012.ts
#EXTINF:2.085000,
...以下省略...
在windows下由于无法解密会导致输入错误,看了下网上的解决方案,主要有两种
- 可以将m3u8文件下载到本地,然后替换文件中ts对应的url, 以及key对应的url。然后通过ffmpeg 使用修改后的m3u8文件进行下载,下面的代码就是基于这个逻辑
- 直接使用代码拼接ts文件,下载完成之后通过openssl进行解密,然后进行拼接。
# -*- coding: UTF-8 -*-
import getopt
import json
import os
import re
import subprocess
import sys
from urllib.parse import urljoin

import m3u8
import requests
from bs4 import BeautifulSoup
"""
https://pypi.org/project/m3u8/
https://blog.csdn.net/u014484783/article/details/79350392
"""
# HTTP headers attached to the requests this script sends: they present the
# client as a desktop Chrome browser issuing an XMLHttpRequest.
HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    ),
}
# Platform-dependent locations:
#   DIR_PATH    - directory where downloaded m3u8 playlists are stored
#   OUT_PATH    - directory where the resulting mp4 files are written
#   FFMPEG_PATH - ffmpeg executable used for the download
if not sys.platform.startswith('win32'):
    # Every non-Windows platform: playlists and videos share one directory.
    DIR_PATH = '/Volumes/DATA/python_projects/m3u8_downloader/download'
    OUT_PATH = '/Volumes/DATA/python_projects/m3u8_downloader/download'
    FFMPEG_PATH = '/usr/local/bin/ffmpeg'
else:
    # Windows: separate directories for playlists and finished videos.
    DIR_PATH = r"F:\PyCharmProjects\m3u8_downloader\download\m3u8_files"
    OUT_PATH = r"F:\PyCharmProjects\m3u8_downloader\download\mp4"
    FFMPEG_PATH = 'G:\\ffmpeg-20200126-5e62100-win64-static\\bin\\ffmpeg.exe'
def get_play_list(m3u8_uri):
    """Return the playlist URLs that have to be downloaded for ``m3u8_uri``.

    The playlist at ``m3u8_uri`` is loaded and inspected. When it is a variant
    (master) playlist, the URI of every playlist it lists is resolved against
    ``m3u8_uri`` and collected. Otherwise ``m3u8_uri`` itself is returned as
    the only element.

    :param m3u8_uri: absolute URL of an m3u8 playlist.
    :return: list of absolute playlist URLs.
    """
    # Use the library's public loader instead of the private _load_from_uri
    # helper, which is an implementation detail of the m3u8 package.
    variant_m3u8 = m3u8.load(m3u8_uri)
    print('[A] 解析播放列表.......')
    print('[A] 是否存在播放列表:' + str(variant_m3u8.is_variant))
    if not variant_m3u8.is_variant:
        return [m3u8_uri]

    play_list = []
    for variant in variant_m3u8.playlists:
        # urljoin resolves relative ("600kb/index.m3u8"), root-relative
        # ("/hls/index.m3u8") and already-absolute ("https://...") URIs;
        # plain string concatenation corrupted the absolute case.
        pl_url = urljoin(str(m3u8_uri), str(variant.uri))
        play_list.append(pl_url)
        print('[A] 播放列表地址:' + pl_url)
    return play_list
def format_lines(base_url, file_path):
    """Rewrite a local m3u8 file in place so every URI in it is absolute.

    Each line of the playlist is handled according to its kind:

    * tag lines (starting with ``#EXT``): every ``URI="..."`` attribute, such
      as the one on ``#EXT-X-KEY``, is resolved against ``base_url``;
    * other lines starting with ``#`` and blank lines are kept unchanged;
    * every remaining line is a media URI and is resolved against
      ``base_url`` regardless of its file extension.

    URIs that are already absolute stay untouched, so running the function
    twice on the same file is harmless.

    :param base_url: URL of the directory the playlist was downloaded from;
        it should end with ``/`` so relative names are appended to it.
    :param file_path: path of the local m3u8 file to rewrite.
    :return: the original lines of the file (before rewriting) as a list.
    """
    print('[F] 处理文件路径......')
    # Matches a URI="..." attribute but not a longer attribute name that
    # merely ends in "URI".
    uri_attr = re.compile(r'(?<![A-Za-z0-9-])URI="([^"]*)"')

    def absolutize_attr(match):
        uri = match.group(1)
        if not uri:
            # An empty attribute value carries nothing to resolve.
            return match.group(0)
        return 'URI="' + urljoin(base_url, uri) + '"'

    # m3u8 playlists are UTF-8; do not depend on the platform default codec.
    with open(file_path, 'r+', encoding='utf-8') as f:
        content = list(f)
        new_content = []
        for line in content:
            # Keep the line terminator aside: urljoin must not see it.
            body = line.rstrip('\r\n')
            eol = line[len(body):]
            text = body.strip()
            if text.startswith('#EXT'):
                new_body = uri_attr.sub(absolutize_attr, body)
                if text.startswith('#EXT-X-KEY') and new_body != body:
                    print('[F] Key Url:' + uri_attr.search(new_body).group(1))
            elif text and not text.startswith('#'):
                new_body = urljoin(base_url, text)
            else:
                new_body = body
            new_content.append(new_body + eol)
        # Overwrite the file from the start with the rewritten lines.
        f.seek(0)
        f.truncate()
        f.writelines(new_content)
    return content
def get_url_source_code(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    :param url: address of the page to fetch.
    :return: the page source as ``str``.
    """
    print('[G] 获取源码')
    # NOTE: verify=False switches off TLS certificate verification here.
    response = requests.get(url=url, headers=HEADERS, timeout=10, verify=False)
    raw_body = response.content
    return raw_body.decode('utf-8')
def save_m3u8_file(url, path, file_name):
    """Download the playlist at ``url`` to ``path/file_name``.

    A file that already exists at the target location is reused and nothing
    is downloaded.

    :param url: URL of the m3u8 playlist.
    :param path: directory the file is stored in.
    :param file_name: name of the local file.
    :return: path of the local file, or ``None`` when the download or the
        write failed.
    """
    file_path = os.path.join(path, file_name)
    print('[S] 下载m3u8路径:' + path + ' \r\n[S] 链接:' + url)
    print('[S] 文件名:' + file_name)
    if os.path.isfile(file_path):
        print('[F] 文件已经存在,跳过保存')
        return file_path
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        # Without this check an HTTP error page would be stored as the
        # playlist and then reused as a "cached" file on later runs.
        response.raise_for_status()
        # 'wb' rather than 'ab': the file must hold exactly this response.
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as e:
        print('[S] 下载m3u8失败: ' + str(e))
        return None
    print('[S] 下载m3u8成功')
    return file_path
def get_m3u8_link(url):
    """Extract the m3u8 address and the page title from a player page.

    The page is expected to contain a ``<script>`` with a statement of the
    form ``var cms_player = {...};`` whose JSON object has a ``url`` entry.

    :param url: address of the player page.
    :return: tuple ``(m3u8_link, title)``.
    :raises ValueError: when no ``var cms_player = {...}`` script is found or
        the object after it is not valid JSON.
    :raises KeyError: when the player object has no ``url`` entry.
    """
    print('_' * 70)
    print('[A] 解析播放地址......')
    html_doc = get_url_source_code(url)
    bs = BeautifulSoup(html_doc, "html.parser")
    pattern = re.compile(r"var cms_player = {(.*?);$", re.MULTILINE | re.DOTALL)
    script = bs.find('script', text=pattern)
    if script is None:
        raise ValueError('no "var cms_player = {...}" script found in ' + url)

    script_text = str(script.text)
    marker = 'var cms_player = '
    start = script_text.index(marker) + len(marker)
    # Decode exactly one JSON object starting at the "{". Unlike deleting
    # every ';' from the script, this keeps semicolons inside JSON values
    # intact and ignores any statements that follow the assignment.
    json_data, _ = json.JSONDecoder().raw_decode(script_text, start)
    m3u8_link = json_data['url']

    # A page without a usable <title> must not break the string handling.
    title_tag = bs.title
    if title_tag is not None and title_tag.string:
        title = str(title_tag.string)
    else:
        title = 'untitled'
    print('[A] 标题:' + title)
    print('[A] 播放地址:' + m3u8_link)
    print('_' * 70)
    return m3u8_link, title
def mark_dir(flot_name):
    """Make sure the sub-directory ``flot_name`` of DIR_PATH exists and enter it.

    The directory is created when it is missing and left alone otherwise.
    The process working directory is then changed to it.

    :param flot_name: name of the directory below DIR_PATH.
    :return: the full path of the directory.
    """
    print('[C] 创建目录: ' + flot_name)
    target_dir = os.path.join(DIR_PATH, flot_name)
    already_present = os.path.exists(target_dir)
    if not already_present:
        os.makedirs(target_dir)
    os.chdir(target_dir)
    return target_dir
def download_ts_with_ffmpeg(m3u8_file_path, out_file_path):
    """Run ffmpeg to download the streams of a playlist into one file.

    :param m3u8_file_path: local path or URL of the m3u8 playlist passed to
        ffmpeg as input.
    :param out_file_path: path of the output file; streams are copied
        without re-encoding (``-c copy``).
    :return: ``None``.
    """
    print('[D] 下载文件......')
    print('[D] 文件路径:' + out_file_path)
    # Pass the arguments as a list so no shell parses them: paths with
    # spaces work, and shell metacharacters in a title taken from a web
    # page cannot inject commands.
    cmd = [
        FFMPEG_PATH,
        '-protocol_whitelist', 'file,http,crypto,tcp,https,tls',
        '-i', m3u8_file_path,
        '-c', 'copy',
        out_file_path,
    ]
    print(' '.join(cmd))
    try:
        return_code = subprocess.call(cmd)
    except OSError as e:
        # E.g. the ffmpeg executable does not exist at FFMPEG_PATH.
        print('[D] 无法启动ffmpeg: ' + str(e))
        print('*' * 100)
        return
    if return_code != 0:
        print('[D] ffmpeg 退出码: ' + str(return_code))
    print('[D] 下载完成.')
    print('*' * 100)
def print_usage():
    """Print the command-line usage banner to standard output."""
    print('*' * 100)
    print('m3u8 downloader by obaby')
    # The placeholders name the values expected after each option; the
    # previous text showed the bare options only.
    print('downloader.py -i <link> -o <outputfile>')
    print('http://www.h4ck.org.cn')
    print('*' * 100)
def main(argv):
    """Command-line entry point.

    Options:
        -h               print the usage text and exit
        -i, --ilink      an m3u8 URL, or the URL of a page to take it from
        -o, --ofile      name of the output file (without ``.mp4``)

    On Windows every playlist is first saved locally, its URIs are made
    absolute, and ffmpeg is run on the local file. On other platforms ffmpeg
    is run on the m3u8 URL directly.

    :param argv: command-line arguments without the program name.
    """
    inputfile = ''
    outputfile = ''
    try:
        opts, _ = getopt.getopt(argv, "hi:o:", ["ilink=", "ofile="])
    except getopt.GetoptError:
        print_usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print_usage()
            sys.exit()
        elif opt in ("-i", "--ilink"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg
    if inputfile == '':
        print_usage()
        sys.exit(2)

    # Characters that are not allowed in Windows file names; URLs routinely
    # contain some of them (':' before a port, '?' before a query string).
    illegal_chars = re.compile(r'[\\/:*?"<>|]')

    if '.m3u8' in inputfile:
        m3u8_link = inputfile
        derived_title = inputfile.replace('https:', '').replace('http:', '').replace('/', '')
    else:
        m3u8_link, derived_title = get_m3u8_link(inputfile)

    # An explicit -o always wins, for page URLs as well as m3u8 URLs.
    if outputfile != '':
        title = outputfile
    else:
        title = illegal_chars.sub('', derived_title)
    title = title.replace('\r', '').replace('\n', '').replace(' ', '')
    out_put_file_path = os.path.join(OUT_PATH, title + '.mp4')

    if not sys.platform.startswith('win32'):
        download_ts_with_ffmpeg(m3u8_link, out_put_file_path)
        return

    play_lists = get_play_list(m3u8_link)
    local_stem = illegal_chars.sub('', str(m3u8_link).split('/')[-2])
    single = len(play_lists) == 1
    for index, pl in enumerate(play_lists):
        # With several variants each one needs its own playlist file and its
        # own output file; otherwise the first cached playlist would be
        # reused for all of them and the outputs would collide.
        suffix = '' if single else '_' + str(index)
        mf = save_m3u8_file(pl, DIR_PATH, local_stem + suffix + '.m3u8')
        if mf is None:
            # The download failed and was already reported; nothing to do.
            continue
        base_url = str(pl).rsplit('/', 1)[0] + '/'
        format_lines(base_url, mf)
        if single:
            target = out_put_file_path
        else:
            target = os.path.join(OUT_PATH, title + suffix + '.mp4')
        download_ts_with_ffmpeg(mf, target)
# Run main() only when the file is executed as a script; the program name in
# sys.argv[0] is dropped before the arguments are handed over.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    main(cli_args)
如果在mac下代码要简单的多,直接将m3u8文件扔进去然后进行下载即可,也无需解析playlist(建议在osx 或者linux下运行)。
3 comments
你好,能解释一下 加上这个: -protocol_whitelist "file,http,crypto,tcp,https,tls"
的作用是什么吗?
还看到一些加 -allowed_extensions ALL,也不明白.
protocol_whitelist可以简单的理解为协议白名单,如果不在这个白名单内下载可能会导致: Protocol not on whitelist 。
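这个是官方解释:protocol_whitelist 用于设置以逗号分隔的允许使用的协议列表;"ALL" 表示匹配所有协议,以 "-" 为前缀的协议会被禁用。默认允许所有协议,但被其他协议嵌套使用的协议会被限制在各协议自己的子集内。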
更多信息可以参考:https://ffmpeg.org/ffmpeg-all.html
看看怎么用的