diff --git a/bilibili.py b/bilibili.py
index 78dd376..b9db72b 100644
--- a/bilibili.py
+++ b/bilibili.py
@@ -20,6 +20,8 @@ from youtube_dl.utils import DownloadError, UnavailableVideoError, MaxDownloadsR
 DEFAULT_VALUES = {
     'merge': True,
     'delete_flv': False,
+    'retries': 10,
+    'ratelimit': 513 * 2 ** 10,
     'max_retry': 10,
     'output': "%(title)s-|||||||-%(id)s.%(ext)s",
     'output_dir': os.getcwd(),
@@ -50,6 +52,8 @@ class Bilibili:
         self.url = params['url']
         self.merge = params.get('merge', DEFAULT_VALUES['merge'])
         self.delete_flv = params.get('delete_flv', DEFAULT_VALUES['delete_flv'])
+        self.retries = params.get('retries', DEFAULT_VALUES['retries'])
+        self.ratelimit = params.get('ratelimit', DEFAULT_VALUES['ratelimit'])
         self.max_retry = params.get('max_retry', DEFAULT_VALUES['max_retry'])
         self.output_format = params.get('output', DEFAULT_VALUES['output'])
         self.output_dir = params.get('output_dir', DEFAULT_VALUES['output_dir'])
@@ -62,6 +66,9 @@ class Bilibili:
             'format': 'bestvideo+bestaudio/best',
             'logger': MyLogger(self.debug),
             'outtmpl': self.output_format,
+            'retries': self.retries,
+            'ratelimit': self.ratelimit,
+            # 'verbose': True
         }

         # Deal with playlist
@@ -99,11 +106,14 @@ class Bilibili:

             # Rename tmp folder
             if ret == 0:
-                target_dir = os.path.join(self.output_dir, self.prepare_output_filename(res, p_num))
-                if os.path.exists(target_dir):
-                    shutil.rmtree(target_dir)
-                if not self.delete_flv:
-                    shutil.copytree(output_dir, target_dir)
+                target_path = os.path.join(self.output_dir, self.prepare_output_filename(res, p_num))
+                if not res.get("file_exist", False):
+                    if os.path.exists(target_path):
+                        shutil.rmtree(target_path)
+                    if not self.delete_flv:
+                        shutil.copytree(output_dir, target_path)
+                else:
+                    print(target_path, " already exists, skipping.")
             else:
                 ret_code = -1
             shutil.rmtree(output_dir)
@@ -111,8 +121,11 @@ class Bilibili:
         return ret_code, self

     def process_single(self, url, output_dir, p_num):
-        ret, res = self.download(url)
-        if ret != 0:
+        ret, res = self.download(url, p_num)
+        if ret == -2:
+            res["file_exist"] = True
+            return res  # Skip existing file
+        elif ret != 0:
             raise RuntimeError("Download unsuccessful")
         file_list = self.get_file_list(output_dir, res)
         if self.merge:
@@ -121,10 +134,29 @@ class Bilibili:
                 raise RuntimeError("Convert/concat unsuccessful")
         return res

-    def download(self, url):
+    def download(self, url, p_num):
         ret = 0
         with youtube_dl.YoutubeDL(self.options) as ydl:
             try:
+                res = ydl.extract_info(
+                    url, download=False,
+                    force_generic_extractor=ydl.params.get('force_generic_extractor', False))
+                output_filename = ydl.prepare_filename(res)
+                output_filename, output_ext = os.path.splitext(output_filename)
+                if output_ext == '.m4s':
+                    # Deal with mkv
+                    output_ext = '.mkv'
+                else:
+                    output_ext = '.mp4'
+
+                output_filename = output_filename + output_ext
+
+                res['output_filename'] = output_filename
+                output_file_abs_paths = [os.path.abspath(os.path.join(
+                    '..', self.prepare_output_filename(res, p_num) + ext)) for ext in ('.mp4', '.mkv')]
+                for abs_path in output_file_abs_paths:
+                    if os.path.exists(abs_path):
+                        return -2, res
                 res = ydl.extract_info(url, force_generic_extractor=ydl.params.get('force_generic_extractor', False))
             except UnavailableVideoError:
                 print("Failed video URL: " + url)
@@ -142,13 +174,15 @@ class Bilibili:
             else:
                 if ydl.params.get('dump_single_json', False):
                     ydl.to_stdout(json.dumps(res))
-                res['output_filename'] = ydl.prepare_filename(res)
+                res['output_filename'] = output_filename
                 ret = ydl._download_retcode
         return ret, res

     def get_file_list(self, output_dir, res):
         ext = res.get('ext', 'flv')
+        if ext == 'm4s':
+            ext = 'mkv'
         file_list = [f for f in os.listdir(output_dir)
                      if os.path.isfile(os.path.join(output_dir, f))
                      and os.path.splitext(f)[-1] == os.path.extsep + ext]
         # and os.path.splitext(f)[0].startswith(title)]
@@ -172,26 +206,34 @@ class Bilibili:

     def concat_videos(self, file_list, output_dir, res, p_num):
         # For ffmpeg concat demuxer, mp4 output
         ret = 0
-        tmp_file = os.path.join(output_dir, 'temp_filelist.txt')
-        with open(tmp_file, 'w') as f:
-            for file in file_list:
-                # Deal with space in cmdline
-                f.write('file ' + "'" + file + "'" + '\n')
-
-        if self.debug:
-            with open(tmp_file, 'r') as f:
-                print(''.join(f.readlines()))
-
-        stream = ffmpeg.input(tmp_file, format='concat', safe=0)
-        stream = ffmpeg.output(stream,
-                               os.path.join('..', self.prepare_output_filename(res, p_num) + os.path.extsep + 'mp4'),
-                               c='copy')
-        if self.debug:
-            print(ffmpeg.compile(stream, overwrite_output=True))
-
-        ret = ffmpeg.run(stream, overwrite_output=True)
-        os.remove(tmp_file)
-        return ret
+        if res.get('ext') == 'm4s':
+            # Copy mkv
+            for f in file_list:
+                new_dst = os.path.join('..', self.prepare_output_filename(res, p_num) + os.path.extsep + 'mkv')
+                if os.path.exists(new_dst):
+                    os.remove(new_dst)
+                dst = shutil.move(f, new_dst)
+            return ret, None
+        else:
+            tmp_file = os.path.join(output_dir, 'temp_filelist.txt')
+            with open(tmp_file, 'w') as f:
+                for file in file_list:
+                    # Deal with space in cmdline
+                    f.write('file ' + "'" + file + "'" + '\n')
+
+            if self.debug:
+                with open(tmp_file, 'r') as f:
+                    print(''.join(f.readlines()))
+
+            stream = ffmpeg.input(tmp_file, format='concat', safe=0)
+            stream = ffmpeg.output(stream, os.path.join(
+                '..', self.prepare_output_filename(res, p_num) + os.path.extsep + 'mp4'), c='copy')
+            if self.debug:
+                print(ffmpeg.compile(stream, overwrite_output=True))
+
+            ret = ffmpeg.run(stream, overwrite_output=True)
+            os.remove(tmp_file)
+            return ret

     def prepare_output_filename(self, res, count):
         # title = utils.slugify(res['title'], allow_unicode=True, simple=True)
diff --git a/multifile_wrapper.py b/multifile_wrapper.py
index 97209a7..f290b9c 100644
--- a/multifile_wrapper.py
+++ b/multifile_wrapper.py
@@ -3,6 +3,10 @@ import csv
 import functools
 import os
 import multiprocessing as mp
+import signal
+import sys
+import threading
+import time
 import traceback

 from collections import OrderedDict
@@ -25,41 +29,46 @@ def report_status(url_dict):
         print('"' + k + '": ' + v['status'])


-def get_urls(input_dir, mode):
-    input_file = os.path.join(input_dir, 'input_list_' + mode + '.csv')
+def get_urls(input_file):
     if not os.path.exists(input_file):
         raise FileNotFoundError("input url file not exist")

-    url_list = None
     with open(input_file, 'r') as f:
         reader = csv.DictReader(f)
         url_list = [dict(d) for d in reader]

     url_dict = OrderedDict()
+    url_dict_out = OrderedDict()
     url_keys = sorted(list(url_list[0].keys()))
+    url_keys_out = list(url_list[0].keys())
     assert url_keys[1] == 'link'
+    assert url_keys_out[1] == 'link'

     for url_set in url_list:
         key = url_set['link']
         value = {k: url_set[k] for k in url_keys[1:]}
+        value_out = {k: url_set[k] for k in url_keys_out}
         value['status'] = NOT_STARTED
         url_dict[key] = value
+        url_dict_out[key] = value_out

-    return url_dict
+    return url_dict, url_dict_out


-def success_handler(result, url_dict=None):
+def success_handler(result, url_dict=None, url_dict_out=None):
     ret_code = result[0]
     url = result[1].url
     if ret_code == 0:
         url_dict[url]['status'] = SUCCESS
+        del url_dict_out[url]
     report_status(url_dict)
-    # print('success')


-def error_handler(e, url_dict=None):
+def error_handler(e, url_dict=None, url_dict_out=None):
     # print('error')
     # print(dir(e), "\n")
-    print("-->{}<--".format(e.__cause__))
+    sys.stdout.write("-->{}<--\n".format(e.__cause__))
     traceback.print_exception(type(e), e, e.__traceback__)
+    sys.stdout.flush()
+    sys.stderr.flush()


 def single_url_wrapper(mode, url, params, this_url_info):
@@ -67,13 +76,49 @@
     params = copy.deepcopy(params)
     params["url"] = url
     acc = processor[mode](params, copy.deepcopy(this_url_info))
-    ret = acc.process_all()
-    return ret
+    ret, res = acc.process_all()
+    if ret < 0:
+        raise RuntimeError("%s download failed" % url)
+    return ret, res
+
+
+def close_pool(pool=None):
+    pool.close()
+    pool.terminate()
+    pool.join()
+
+
+def write_new_csv(input_file_new, url_dict_out):
+    sys.stdout.write("Writing new csv file\n")
+    url_list_out = list(url_dict_out.values())
+    if len(url_list_out) == 0:
+        return
+
+    header_list = list(url_list_out[0].keys())
+    with open(input_file_new, 'w') as f:
+        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
+        writer.writeheader()
+        writer.writerows(url_list_out)
+
+
+def signal_handler(*args, mp_pool=None, url_dict=None, url_dict_out=None, input_file_new=""):
+    sys.stderr.write('\nStopping...')
+
+    if mp_pool is not None:
+        stoppool = threading.Thread(target=functools.partial(close_pool, pool=mp_pool))
+        stoppool.daemon = True
+        stoppool.start()
+        sys.stdout.write("Multiprocessing pool terminated\n")
+        sys.stdout.flush()
+
+    write_new_csv(input_file_new, url_dict_out)
+    print("Signal handler finished")
+    print("Received signal: ", signal.Signals(args[0]).name)


 if __name__ == "__main__":
     mode = MODE_BILIBILI
-    num_of_processes = 2
+    num_of_processes = 1

     input_dir = 'Collections'
     output_dir = 'Collections/temp_output'
@@ -84,21 +129,35 @@ if __name__ == "__main__":
         # 'output': "%(title)s.%(ext)s",
     }

-    url_dict = get_urls(input_dir, mode)
-    url_list = list(url_dict.keys())
+    input_file = os.path.join(input_dir, 'input_list_' + mode + '.csv')
+    input_file_new = os.path.join(input_dir, 'input_list_' + mode + '_new.csv')
+
+    url_dict, url_dict_out = get_urls(input_file)
+    # url_list = list(url_dict.keys())
+
+    on_error = functools.partial(error_handler, url_dict=url_dict, url_dict_out=url_dict_out)
+    on_success = functools.partial(success_handler, url_dict=url_dict, url_dict_out=url_dict_out)

-    on_error = functools.partial(error_handler, url_dict=url_dict)
-    on_success = functools.partial(success_handler, url_dict=url_dict)
+    pool = mp.Pool(processes=num_of_processes)
+
+    signal_func = functools.partial(signal_handler, mp_pool=pool,
+                                    url_dict=url_dict, url_dict_out=url_dict_out,
+                                    input_file_new=input_file_new)
+
+    for sig in (signal.SIGABRT, signal.SIGINT, signal.SIGTERM):
+        signal.signal(sig, signal_func)

     # with open(os.path.join(output_dir, 'logout.log'), 'w') as sys.stdout:
     # with open(os.path.join(output_dir, 'error.log'), 'w') as sys.stderr:
-    pool = mp.Pool(processes=num_of_processes)
+
     for k in url_dict.keys():
         # single_url_wrapper(mode, k, params, url_dict[k])
         url_dict[k]['status'] = IN_PROCESS
         pool.apply_async(single_url_wrapper, args=(mode, k, params, url_dict[k]),
                          error_callback=on_error, callback=on_success)
+        time.sleep(1)

     pool.close()
     pool.join()
+    write_new_csv(input_file_new, url_dict_out)

     report_status(url_dict)
diff --git a/utils.py b/utils.py
index 273dc3e..0646f97 100644
--- a/utils.py
+++ b/utils.py
@@ -6,6 +6,9 @@ import string
 import time
 import unicodedata

+from urllib.error import URLError
+from youtube_dl import DownloadError
+

 def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
     return ''.join(random.SystemRandom().choice(chars) for _ in range(size))
@@ -19,8 +22,14 @@ def retry_wrapper(func, max_retry, timeout=5):
         try:
             result = func()
             break
+        except DownloadError as e:
+            print(e)
+            time.sleep(5)
+            if e.exc_info[0] is not URLError:
+                n_retry += 1
         except Exception as e:
             print(e)
+            time.sleep(1)
             n_retry += 1
         if n_retry == max_retry:
             ret_code = -1
@@ -41,7 +50,7 @@ def slugify(value, allow_unicode=True, simple=True):
     else:
         value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
     if simple:
-        value = re.sub(r'[/.]', '_', value)
+        value = re.sub(r'[/.:]', '_', value)
         return value
     else:
         value = re.sub(r'[^\w\s-]', '_', value).strip().lower()
diff --git a/youtube-dl_mod/bilibili.py.modified b/youtube-dl_mod/bilibili.py.modified
index 95dd63c..97085e6 100644
--- a/youtube-dl_mod/bilibili.py.modified
+++ b/youtube-dl_mod/bilibili.py.modified
@@ -15,6 +15,7 @@ from ..utils import (
     float_or_none,
     parse_iso8601,
     smuggle_url,
+    str_or_none,
     strip_jsonp,
     unified_timestamp,
     unsmuggle_url,
@@ -124,6 +125,7 @@ class BiliBiliIE(InfoExtractor):

         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id') or mobj.group('id_bv')
+        print("video_id: ", video_id)
         anime_id = mobj.group('anime_id')
         page = mobj.group('page') or 1
         webpage = self._download_webpage(url, video_id)
@@ -172,49 +174,154 @@ class BiliBiliIE(InfoExtractor):

         entries = []

-        RENDITIONS = ('qn=80&quality=80&type=', 
'quality=2&type=mp4') + for num, rendition in enumerate(RENDITIONS, start=1): + payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() + + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + + if not video_info: continue - self._report_error(video_info) - for idx, durl in enumerate(video_info['durl']): - formats = [{ - 'url': durl['url'], - 'filesize': int_or_none(durl['size']), - }] - for backup_url in durl.get('backup_url', []): + if 'durl' not in video_info: + if num < len(RENDITIONS): + continue + self._report_error(video_info) + + for idx, durl in enumerate(video_info['durl']): + formats = [{ + 'url': durl['url'], + 'filesize': int_or_none(durl['size']), + }] + for backup_url in durl.get('backup_url', []): + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'preference': -2 if 'hd.mp4' in backup_url else -3, + }) + + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': "https://www.bilibili.com/", + 'Origin': "https://www.bilibili.com", + }) + + self._sort_formats(formats) + + entries.append({ + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, + }) + break + print("video_info_old: ", video_info) + elif 'dash' in video_info: + qualities = sorted(video_info['accept_quality'], reverse=True) + print("qualities: ", qualities) + + # video_segment_num = find_num_segments(video_info['dash']['video'], qualities) + # audio_segment_num = find_num_segments(video_info['dash']['audio'], qualities) + + # videos, audios = sort_segments(video_info, qualities) + formats = [] + + # Video + for v in video_info['dash']['video']: + formats.append({ + 'url': v['baseUrl'], + 'vcodec': v['codecs'], + 'acodec': 'none', + 'width': v['width'], + 'height': v['height'], + 'quality': v['id'] + 1 if 'hev' in v['codecs'] else v['id'] + }) + if v.get('backupUrl', None) is not None: formats.append({ - 'url': backup_url, + 'url': v['backupUrl'], # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url else -3, + 'acodec': 'none', + 'quality': v['id'] + 1 if 'hev' in v['codecs'] else v['id'], + 'preference': -3, }) - for a_format in formats: - a_format.setdefault('http_headers', {}).update({ - 'Referer': url, + # Audio + for a in video_info['dash']['audio']: + formats.append({ + 'url': a['baseUrl'], + 'vcodec': 'none', + 'acodec': a['codecs'], + 'quality': a['id'] - 30200 + }) + if a.get('backupUrl', None) is not None: + formats.append({ + 'url': a['backupUrl'], + # backup URLs have lower priorities + 'vcodec': 'none', + 'quality': a['id'] - 30200, + 'preference': -3, }) - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (video_id, idx), - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': "https://www.bilibili.com/", + 'Origin': "https://www.bilibili.com", + 'Sec-Fetch-Site': "cross-site", + 'Sec-Fetch-Mode': "cors", + 'Sec-Fetch-Dest': "empty", }) - break + + self._sort_formats(formats) + + entries.append({ + 'id': '%s' % (video_id), + 'duration': float_or_none(video_info.get('timelength'), 1000), + 'formats': formats, + }) + + else: + self._report_error(video_info) title = 
self._html_search_regex( (']+\btitle=(["\'])(?P(?:(?!\1).)+)\1', @@ -249,8 +356,10 @@ class BiliBiliIE(InfoExtractor): info['uploader'] = self._html_search_meta( 'author', webpage, 'uploader', default=None) + print("entries: \n") for entry in entries: entry.update(info) + print(entry) if len(entries) == 1: return entries[0] diff --git a/youtube-dl_mod/bilibili.py.patch b/youtube-dl_mod/bilibili.py.patch new file mode 100644 index 0000000..4ad52f0 --- /dev/null +++ b/youtube-dl_mod/bilibili.py.patch @@ -0,0 +1,249 @@ +--- bilibili.py.orig 2020-09-06 12:43:02.935724487 -0400 ++++ bilibili.py.modified 2020-09-05 23:30:38.708500829 -0400 +@@ -34,7 +34,7 @@ + anime/(?P<anime_id>\d+)/play\# + )(?P<id_bv>\d+)| + video/[bB][vV](?P<id>[^/?#&]+) +- ) ++ )(?:/?\?p=(?P<page>\d+))? + ''' + + _TESTS = [{ +@@ -125,18 +125,30 @@ + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_bv') ++ print("video_id: ", video_id) + anime_id = mobj.group('anime_id') ++ page = mobj.group('page') or 1 + webpage = self._download_webpage(url, video_id) + + if 'anime/' not in url: +- cid = self._search_regex( +- r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', +- default=None +- ) or compat_parse_qs(self._search_regex( +- [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', +- r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', +- r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], +- webpage, 'player parameters'))['cid'][0] ++ mobj = re.findall(r'cid(?:["\']:|=)(\d+)', webpage) ++ seen_mobj = set() ++ mobj = [c for c in mobj if int(c) > 12 and not (c in seen_mobj or seen_mobj.add(c))] ++ print(mobj) ++ cid = mobj[int(page) - 1] or \ ++ compat_parse_qs(self._search_regex([ ++ r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', ++ r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', ++ r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], ++ webpage, 'player parameters'))['cid'][0] ++ # cid = self._search_regex( ++ # r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', ++ # default=None ++ # ) or compat_parse_qs(self._search_regex( ++ # [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', ++ # r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', ++ # r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], ++ # webpage, 'player parameters'))['cid'][0] + else: + if 'no_bangumi_tip' not in smuggled_data: + self.to_screen('Downloading episode %s. 
To download all videos in anime %s, re-run youtube-dl with %s' % ( +@@ -162,49 +174,154 @@ + + entries = [] + +- RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') +- for num, rendition in enumerate(RENDITIONS, start=1): +- payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) +- sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() +- +- video_info = self._download_json( +- 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), +- video_id, note='Downloading video info page', +- headers=headers, fatal=num == len(RENDITIONS)) ++ payload = 'bvid=%s&cid=%s' % (video_id, cid) ++ video_info = self._download_json( ++ 'https://api.bilibili.com/x/player/playurl?%s&qn=116&type=&otype=json&fnver=0&fnval=16&fourk=1' % payload, ++ video_id, note='Downloading video info page', ++ headers=headers, fatal=True) ++ ++ print("video_info: ", video_info) ++ ++ video_info = video_info['data'] ++ ++ def find_num_segments(video_list, qualities): ++ max_segment_num = 0 ++ for quality in qualities: ++ segment_num = sum(v.get('id') == quality for v in video_list) ++ if segment_num > max_segment_num: ++ max_segment_num = segment_num ++ return max_segment_num ++ ++ def sort_segments(video_info, qualities): ++ videos = {v['codecid']: [] for v in video_info['dash']['video']} ++ audios = {a['codecid']: [] for a in video_info['dash']['audio']} ++ ++ for v in video_info['dash']['video']: ++ videos[v['codecid']].append(v) ++ for a in video_info['dash']['audio']: ++ audios[a['codecid']].append(a) ++ for qualities in videos.values(): ++ qualities.sort(key=lambda x: x['id'], reverse=True) ++ for qualities in audios.values(): ++ qualities.sort(key=lambda x: x['id'], reverse=True) ++ videos = list(i[1] for i in sorted(videos.items())) ++ audios = list(i[1] for i in sorted(audios.items())) ++ ++ return videos, audios ++ ++ if 'durl' in video_info: ++ # Use old API ++ print("durl") ++ RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') ++ for num, rendition in enumerate(RENDITIONS, start=1): ++ payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) ++ sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() ++ ++ video_info = self._download_json( ++ 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), ++ video_id, note='Downloading video info page', ++ headers=headers, fatal=num == len(RENDITIONS)) + +- if not video_info: +- continue +- +- if 'durl' not in video_info: +- if num < len(RENDITIONS): ++ if not video_info: + continue +- self._report_error(video_info) + +- for idx, durl in enumerate(video_info['durl']): +- formats = [{ +- 'url': durl['url'], +- 'filesize': int_or_none(durl['size']), +- }] +- for backup_url in durl.get('backup_url', []): ++ if 'durl' not in video_info: ++ if num < len(RENDITIONS): ++ continue ++ self._report_error(video_info) ++ ++ for idx, durl in enumerate(video_info['durl']): ++ formats = [{ ++ 'url': durl['url'], ++ 'filesize': int_or_none(durl['size']), ++ }] ++ for backup_url in durl.get('backup_url', []): ++ formats.append({ ++ 'url': backup_url, ++ # backup URLs have lower priorities ++ 'preference': -2 if 'hd.mp4' in backup_url else -3, ++ }) ++ ++ for a_format in formats: ++ a_format.setdefault('http_headers', {}).update({ ++ 'Referer': "https://www.bilibili.com/", ++ 'Origin': "https://www.bilibili.com", ++ }) ++ ++ self._sort_formats(formats) ++ ++ entries.append({ ++ 'id': '%s_part%s' % (video_id, idx), ++ 'duration': 
float_or_none(durl.get('length'), 1000), ++ 'formats': formats, ++ }) ++ break ++ print("video_info_old: ", video_info) ++ elif 'dash' in video_info: ++ qualities = sorted(video_info['accept_quality'], reverse=True) ++ print("qualities: ", qualities) ++ ++ # video_segment_num = find_num_segments(video_info['dash']['video'], qualities) ++ # audio_segment_num = find_num_segments(video_info['dash']['audio'], qualities) ++ ++ # videos, audios = sort_segments(video_info, qualities) ++ formats = [] ++ ++ # Video ++ for v in video_info['dash']['video']: ++ formats.append({ ++ 'url': v['baseUrl'], ++ 'vcodec': v['codecs'], ++ 'acodec': 'none', ++ 'width': v['width'], ++ 'height': v['height'], ++ 'quality': v['id'] + 1 if 'hev' in v['codecs'] else v['id'] ++ }) ++ if v.get('backupUrl', None) is not None: + formats.append({ +- 'url': backup_url, ++ 'url': v['backupUrl'], + # backup URLs have lower priorities +- 'preference': -2 if 'hd.mp4' in backup_url else -3, ++ 'acodec': 'none', ++ 'quality': v['id'] + 1 if 'hev' in v['codecs'] else v['id'], ++ 'preference': -3, + }) + +- for a_format in formats: +- a_format.setdefault('http_headers', {}).update({ +- 'Referer': url, ++ # Audio ++ for a in video_info['dash']['audio']: ++ formats.append({ ++ 'url': a['baseUrl'], ++ 'vcodec': 'none', ++ 'acodec': a['codecs'], ++ 'quality': a['id'] - 30200 ++ }) ++ if a.get('backupUrl', None) is not None: ++ formats.append({ ++ 'url': a['backupUrl'], ++ # backup URLs have lower priorities ++ 'vcodec': 'none', ++ 'quality': a['id'] - 30200, ++ 'preference': -3, + }) + +- self._sort_formats(formats) +- +- entries.append({ +- 'id': '%s_part%s' % (video_id, idx), +- 'duration': float_or_none(durl.get('length'), 1000), +- 'formats': formats, ++ for a_format in formats: ++ a_format.setdefault('http_headers', {}).update({ ++ 'Referer': "https://www.bilibili.com/", ++ 'Origin': "https://www.bilibili.com", ++ 'Sec-Fetch-Site': "cross-site", ++ 'Sec-Fetch-Mode': "cors", ++ 'Sec-Fetch-Dest': "empty", + }) +- break ++ ++ self._sort_formats(formats) ++ ++ entries.append({ ++ 'id': '%s' % (video_id), ++ 'duration': float_or_none(video_info.get('timelength'), 1000), ++ 'formats': formats, ++ }) ++ ++ else: ++ self._report_error(video_info) + + title = self._html_search_regex( + ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', +@@ -239,8 +356,10 @@ + info['uploader'] = self._html_search_meta( + 'author', webpage, 'uploader', default=None) + ++ print("entries: \n") + for entry in entries: + entry.update(info) ++ print(entry) + + if len(entries) == 1: + return entries[0] diff --git a/youtube-dl_mod/http.py.modified b/youtube-dl_mod/http.py.modified new file mode 100644 index 0000000..d0e8018 --- /dev/null +++ b/youtube-dl_mod/http.py.modified @@ -0,0 +1,373 @@ +from __future__ import unicode_literals + +import errno +import os +import socket +import time +import random +import re + +from .common import FileDownloader +from ..compat import ( + compat_str, + compat_urllib_error, +) +from ..utils import ( + ContentTooShortError, + encodeFilename, + int_or_none, + sanitize_open, + sanitized_Request, + write_xattr, + XAttrMetadataError, + XAttrUnavailableError, +) + + +class HttpFD(FileDownloader): + def real_download(self, filename, info_dict): + url = info_dict['url'] + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None + + # Do not include the 
Accept-Encoding header + headers = {} + add_headers = info_dict.get('http_headers') + print("add_headers: ", add_headers) + + if add_headers: + headers.update(add_headers) + + print("headers: ", headers) + print("url: ", url) + is_test = self.params.get('test', False) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + info_dict.get('downloader_options', {}).get('http_chunk_size') + or self.params.get('http_chunk_size') or 0) + + ctx.open_mode = 'wb' + ctx.resume_len = 0 + ctx.data_len = None + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() + ctx.chunk_size = None + + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize( + encodeFilename(ctx.tmpfilename)) + + ctx.is_resume = ctx.resume_len > 0 + + count = 0 + retries = self.params.get('retries', 0) + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + class NextFragment(Exception): + pass + + def set_range(req, start, end): + range_header = 'bytes=%d-' % start + if end: + range_header += compat_str(end) + req.add_header('Range', range_header) + + def establish_connection(): + ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) + if not is_test and chunk_size else chunk_size) + if ctx.resume_len > 0: + range_start = ctx.resume_len + if ctx.is_resume: + self.report_resuming_byte(ctx.resume_len) + ctx.open_mode = 'ab' + elif ctx.chunk_size > 0: + range_start = 0 + else: + range_start = None + ctx.is_resume = False + range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None + if range_end and ctx.data_len is not None and range_end >= ctx.data_len: + range_end = ctx.data_len - 1 + has_range = range_start is not None + ctx.has_range = has_range + request = sanitized_Request(url, None, headers) + if has_range: + set_range(request, range_start, range_end) + # Establish connection + try: + ctx.data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. This is due to a webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite of requested Range (see + # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) + if has_range: + content_range = ctx.data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) + # Content-Range is present and matches requested Range, resume is possible + if content_range_m: + if range_start == int(content_range_m.group(1)): + content_range_end = int_or_none(content_range_m.group(2)) + content_len = int_or_none(content_range_m.group(3)) + accept_content_len = ( + # Non-chunked download + not ctx.chunk_size + # Chunked download and requested piece or + # its part is promised to be served + or content_range_end == range_end + or content_len < range_end) + if accept_content_len: + ctx.data_len = content_len + return + # Content-Range is either not present or invalid. 
Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) + return + except (compat_urllib_error.HTTPError, ) as err: + if err.code == 416: + # Unable to resume (requested range not satisfiable) + try: + # Open the connection again without the range header + ctx.data = self.ydl.urlopen( + sanitized_Request(url, None, headers)) + content_length = ctx.data.info()['Content-Length'] + except (compat_urllib_error.HTTPError, ) as err: + if err.code < 500 or err.code >= 600: + raise + else: + # Examine the reported length + if (content_length is not None + and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): + # The file had already been fully downloaded. + # Explanation to the above condition: in issue #175 it was revealed that + # YouTube sometimes adds or removes a few bytes from the end of the file, + # changing the file size slightly and causing problems for some users. So + # I decided to implement a suggested change and consider the file + # completely downloaded if the file size differs less than 100 bytes from + # the one in the hard drive. + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) + self._hook_progress({ + 'filename': ctx.filename, + 'status': 'finished', + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, + }) + raise SucceedDownload() + else: + # The length does not match, we start the download over + self.report_unable_to_resume() + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + elif err.code < 500 or err.code >= 600: + # Unexpected HTTP error + raise + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: + # Connection reset is no problem, just retry + raise + raise RetryDownload(err) + + def download(): + data_len = ctx.data.info().get('Content-length', None) + print("data_len: ", data_len) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) + return False + + byte_counter = 0 + ctx.resume_len + block_size = ctx.block_size + start = time.time() + + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring + + def retry(e): + to_stdout = ctx.tmpfilename == '-' + if not to_stdout and ctx.stream is not None: + ctx.stream.close() + ctx.stream = None + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) + + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if data_len is None else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + print("byte_counter: ", byte_counter) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: + try: + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False + + if self.params.get('xattr_set_filesize', False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) + + try: + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) + return False + + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if ctx.data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': ctx.data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - ctx.start_time, + }) + + if data_len is not None and byte_counter == data_len: + break + + if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + ctx.resume_len = byte_counter + # ctx.block_size = block_size + raise NextFragment() + + # Deal with bilibili last fragment shorter than expected length + if info_dict.get('extractor') == 'BiliBili' and info_dict.get('n_entries', 1) > 1 \ + and info_dict.get('n_entries', 0) == info_dict.get('playlist_index', -1): + if ctx.stream is None: + self.to_stderr('\nDid not get any data blocks') + elif ctx.tmpfilename != '-': + ctx.stream.close() + + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= 
retries: + retry(err) + else: + if ctx.stream is None: + self.to_stderr('\n') + self.report_error('Did not get any data blocks') + return False + if ctx.tmpfilename != '-': + ctx.stream.close() + + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err + + self.try_rename(ctx.tmpfilename, ctx.filename) + + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - ctx.start_time, + }) + + return True + + while count <= retries: + try: + establish_connection() + return download() + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except NextFragment: + continue + except SucceedDownload: + return True + + self.report_error('giving up after %s retries' % retries) + return False diff --git a/youtube-dl_mod/http.py.patch b/youtube-dl_mod/http.py.patch new file mode 100644 index 0000000..4f395d5 --- /dev/null +++ b/youtube-dl_mod/http.py.patch @@ -0,0 +1,88 @@ +--- http.py 2020-09-06 12:40:29.065733507 -0400 ++++ http.py.modified 2020-09-06 12:41:50.335728741 -0400 +@@ -39,11 +39,15 @@ + ctx.stream = None + + # Do not include the Accept-Encoding header +- headers = {'Youtubedl-no-compression': 'True'} ++ headers = {} + add_headers = info_dict.get('http_headers') ++ print("add_headers: ", add_headers) ++ + if add_headers: + headers.update(add_headers) + ++ print("headers: ", headers) ++ print("url: ", url) + is_test = self.params.get('test', False) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + info_dict.get('downloader_options', {}).get('http_chunk_size') +@@ -188,6 +192,7 @@ + + def download(): + data_len = ctx.data.info().get('Content-length', None) ++ print("data_len: ", data_len) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). 
+@@ -218,7 +223,7 @@ + + def retry(e): + to_stdout = ctx.tmpfilename == '-' +- if not to_stdout: ++ if not to_stdout and ctx.stream is not None: + ctx.stream.close() + ctx.stream = None + ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) +@@ -238,6 +243,7 @@ + retry(e) + + byte_counter += len(data_block) ++ print("byte_counter: ", byte_counter) + + # exit loop when download is finished + if len(data_block) == 0: +@@ -307,18 +313,31 @@ + # ctx.block_size = block_size + raise NextFragment() + +- if ctx.stream is None: +- self.to_stderr('\n') +- self.report_error('Did not get any data blocks') +- return False +- if ctx.tmpfilename != '-': +- ctx.stream.close() +- +- if data_len is not None and byte_counter != data_len: +- err = ContentTooShortError(byte_counter, int(data_len)) +- if count <= retries: +- retry(err) +- raise err ++ # Deal with bilibili last fragment shorter than expected length ++ if info_dict.get('extractor') == 'BiliBili' and info_dict.get('n_entries', 1) > 1 \ ++ and info_dict.get('n_entries', 0) == info_dict.get('playlist_index', -1): ++ if ctx.stream is None: ++ self.to_stderr('\nDid not get any data blocks') ++ elif ctx.tmpfilename != '-': ++ ctx.stream.close() ++ ++ if data_len is not None and byte_counter != data_len: ++ err = ContentTooShortError(byte_counter, int(data_len)) ++ if count <= retries: ++ retry(err) ++ else: ++ if ctx.stream is None: ++ self.to_stderr('\n') ++ self.report_error('Did not get any data blocks') ++ return False ++ if ctx.tmpfilename != '-': ++ ctx.stream.close() ++ ++ if data_len is not None and byte_counter != data_len: ++ err = ContentTooShortError(byte_counter, int(data_len)) ++ if count <= retries: ++ retry(err) ++ raise err + + self.try_rename(ctx.tmpfilename, ctx.filename) +