diff --git a/bilibili.py b/bilibili.py
index 78dd376..b9db72b 100644
--- a/bilibili.py
+++ b/bilibili.py
@@ -20,6 +20,8 @@ from youtube_dl.utils import DownloadError, UnavailableVideoError, MaxDownloadsR
 DEFAULT_VALUES = {
     'merge': True,
     'delete_flv': False,
+    'retries': 10,
+    'ratelimit': 513 * 2 ** 10,
     'max_retry': 10,
     'output': "%(title)s-|||||||-%(id)s.%(ext)s",
     'output_dir': os.getcwd(),
@@ -50,6 +52,8 @@ class Bilibili:
         self.url = params['url']
         self.merge = params.get('merge', DEFAULT_VALUES['merge'])
         self.delete_flv = params.get('delete_flv', DEFAULT_VALUES['delete_flv'])
+        self.retries = params.get('retries', DEFAULT_VALUES['retries'])
+        self.ratelimit = params.get('ratelimit', DEFAULT_VALUES['ratelimit'])
         self.max_retry = params.get('max_retry', DEFAULT_VALUES['max_retry'])
         self.output_format = params.get('output', DEFAULT_VALUES['output'])
         self.output_dir = params.get('output_dir', DEFAULT_VALUES['output_dir'])
@@ -62,6 +66,9 @@ class Bilibili:
             'format': 'bestvideo+bestaudio/best',
             'logger': MyLogger(self.debug),
             'outtmpl': self.output_format,
+            'retries': self.retries,
+            'ratelimit': self.ratelimit,
+            # 'verbose': True
         }
 
         # Deal with playlist
@@ -99,11 +106,14 @@
 
         # Rename tmp folder
         if ret == 0:
-            target_dir = os.path.join(self.output_dir, self.prepare_output_filename(res, p_num))
-            if os.path.exists(target_dir):
-                shutil.rmtree(target_dir)
-            if not self.delete_flv:
-                shutil.copytree(output_dir, target_dir)
+            target_path = os.path.join(self.output_dir, self.prepare_output_filename(res, p_num))
+            if not res.get("file_exist", False):
+                if os.path.exists(target_path):
+                    shutil.rmtree(target_path)
+                if not self.delete_flv:
+                    shutil.copytree(output_dir, target_path)
+            else:
+                print(target_path, " already exists, skipping.")
         else:
             ret_code = -1
         shutil.rmtree(output_dir)
@@ -111,8 +121,11 @@
         return ret_code, self
 
     def process_single(self, url, output_dir, p_num):
-        ret, res = self.download(url)
-        if ret != 0:
+        ret, res = self.download(url, p_num)
+        if ret == -2:
+            res["file_exist"] = True
+            return res  # Skip existing file
+        elif ret != 0:
             raise RuntimeError("Download unsuccessful")
         file_list = self.get_file_list(output_dir, res)
         if self.merge:
@@ -121,10 +134,29 @@
                 raise RuntimeError("Convert/concat unsuccessful")
         return res
 
-    def download(self, url):
+    def download(self, url, p_num):
         ret = 0
         with youtube_dl.YoutubeDL(self.options) as ydl:
             try:
+                res = ydl.extract_info(
+                    url, download=False,
+                    force_generic_extractor=ydl.params.get('force_generic_extractor', False))
+                output_filename = ydl.prepare_filename(res)
+                output_filename, output_ext = os.path.splitext(output_filename)
+                if output_ext == '.m4s':
+                    # Deal with mkv
+                    output_ext = '.mkv'
+                else:
+                    output_ext = '.mp4'
+
+                output_filename = output_filename + output_ext
+
+                res['output_filename'] = output_filename
+                output_file_abs_paths = [os.path.abspath(os.path.join(
+                    '..', self.prepare_output_filename(res, p_num) + ext)) for ext in ('.mp4', '.mkv')]
+                for abs_path in output_file_abs_paths:
+                    if os.path.exists(abs_path):
+                        return -2, res
                 res = ydl.extract_info(url, force_generic_extractor=ydl.params.get('force_generic_extractor', False))
             except UnavailableVideoError:
                 print("Failed video URL: " + url)
@@ -142,13 +174,15 @@
             else:
                 if ydl.params.get('dump_single_json', False):
                     ydl.to_stdout(json.dumps(res))
-                res['output_filename'] = ydl.prepare_filename(res)
+                res['output_filename'] = output_filename
                 ret = ydl._download_retcode
         return ret, res
 
     def get_file_list(self, output_dir, res):
         ext = res.get('ext', 'flv')
+        if ext == 'm4s':
+            ext = 'mkv'
         file_list = [f for f in os.listdir(output_dir)
                      if os.path.isfile(os.path.join(output_dir, f))
                      and os.path.splitext(f)[-1] == os.path.extsep + ext]
         # and os.path.splitext(f)[0].startswith(title)]
@@ -172,26 +206,34 @@ class Bilibili:
     def concat_videos(self, file_list, output_dir, res, p_num):
         # For ffmpeg concat demuxer, mp4 output
         ret = 0
-        tmp_file = os.path.join(output_dir, 'temp_filelist.txt')
-        with open(tmp_file, 'w') as f:
-            for file in file_list:
-                # Deal with space in cmdline
-                f.write('file ' + "'" + file + "'" + '\n')
-
-        if self.debug:
-            with open(tmp_file, 'r') as f:
-                print(''.join(f.readlines()))
-
-        stream = ffmpeg.input(tmp_file, format='concat', safe=0)
-        stream = ffmpeg.output(stream,
-                               os.path.join('..', self.prepare_output_filename(res, p_num) + os.path.extsep + 'mp4'),
-                               c='copy')
-        if self.debug:
-            print(ffmpeg.compile(stream, overwrite_output=True))
-
-        ret = ffmpeg.run(stream, overwrite_output=True)
-        os.remove(tmp_file)
-        return ret
+        if res.get('ext') == 'm4s':
+            # Copy mkv
+            for f in file_list:
+                new_dst = os.path.join('..', self.prepare_output_filename(res, p_num) + os.path.extsep + 'mkv')
+                if os.path.exists(new_dst):
+                    os.remove(new_dst)
+                dst = shutil.move(f, new_dst)
+            return ret, None
+        else:
+            tmp_file = os.path.join(output_dir, 'temp_filelist.txt')
+            with open(tmp_file, 'w') as f:
+                for file in file_list:
+                    # Deal with space in cmdline
+                    f.write('file ' + "'" + file + "'" + '\n')
+
+            if self.debug:
+                with open(tmp_file, 'r') as f:
+                    print(''.join(f.readlines()))
+
+            stream = ffmpeg.input(tmp_file, format='concat', safe=0)
+            stream = ffmpeg.output(stream, os.path.join(
+                '..', self.prepare_output_filename(res, p_num) + os.path.extsep + 'mp4'), c='copy')
+            if self.debug:
+                print(ffmpeg.compile(stream, overwrite_output=True))
+
+            ret = ffmpeg.run(stream, overwrite_output=True)
+            os.remove(tmp_file)
+            return ret
 
     def prepare_output_filename(self, res, count):
         # title = utils.slugify(res['title'], allow_unicode=True, simple=True)
diff --git a/multifile_wrapper.py b/multifile_wrapper.py
index 97209a7..f290b9c 100644
--- a/multifile_wrapper.py
+++ b/multifile_wrapper.py
@@ -3,6 +3,10 @@ import csv
 import functools
 import os
 import multiprocessing as mp
+import signal
+import sys
+import threading
+import time
 import traceback
 
 from collections import OrderedDict
@@ -25,41 +29,46 @@ def report_status(url_dict):
         print('"' + k + '": ' + v['status'])
 
 
-def get_urls(input_dir, mode):
-    input_file = os.path.join(input_dir, 'input_list_' + mode + '.csv')
+def get_urls(input_file):
     if not os.path.exists(input_file):
         raise FileNotFoundError("input url file not exist")
-    url_list = None
     with open(input_file, 'r') as f:
         reader = csv.DictReader(f)
         url_list = [dict(d) for d in reader]
 
     url_dict = OrderedDict()
+    url_dict_out = OrderedDict()
     url_keys = sorted(list(url_list[0].keys()))
+    url_keys_out = list(url_list[0].keys())
     assert url_keys[1] == 'link'
+    assert url_keys_out[1] == 'link'
     for url_set in url_list:
         key = url_set['link']
         value = {k: url_set[k] for k in url_keys[1:]}
+        value_out = {k: url_set[k] for k in url_keys_out}
         value['status'] = NOT_STARTED
         url_dict[key] = value
+        url_dict_out[key] = value_out
 
-    return url_dict
+    return url_dict, url_dict_out
 
 
-def success_handler(result, url_dict=None):
+def success_handler(result, url_dict=None, url_dict_out=None):
     ret_code = result[0]
     url = result[1].url
     if ret_code == 0:
         url_dict[url]['status'] = SUCCESS
+        del url_dict_out[url]
     report_status(url_dict)
-    # print('success')
 
 
-def error_handler(e, url_dict=None):
+def error_handler(e, url_dict=None, url_dict_out=None):
     # print('error')
     # print(dir(e), "\n")
-    print("-->{}<--".format(e.__cause__))
+    sys.stdout.write("-->{}<--".format(e.__cause__))
     traceback.print_exception(type(e), e, e.__traceback__)
+    sys.stdout.flush()
+    sys.stderr.flush()
 
 
 def single_url_wrapper(mode, url, params, this_url_info):
@@ -67,13 +76,49 @@
     params = copy.deepcopy(params)
     params["url"] = url
     acc = processor[mode](params, copy.deepcopy(this_url_info))
-    ret = acc.process_all()
-    return ret
+    ret, res = acc.process_all()
+    if ret < 0:
+        raise RuntimeError("%s download failed" % url)
+    return ret, res
+
+
+def close_pool(pool=None):
+    pool.close()
+    pool.terminate()
+    pool.join()
+
+
+def write_new_csv(input_file_new, url_dict_out):
+    sys.stdout.write("Writing new csv file\n")
+    url_list_out = list(url_dict_out.values())
+    if len(url_list_out) == 0:
+        return
+
+    header_list = list(url_list_out[0].keys())
+    with open(input_file_new, 'w') as f:
+        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
+        writer.writeheader()
+        writer.writerows(url_list_out)
+
+
+def signal_handler(*args, mp_pool=None, url_dict=None, url_dict_out=None, input_file_new=""):
+    sys.stderr.write('\nStopping...')
+
+    if mp_pool is not None:
+        stoppool = threading.Thread(target=functools.partial(close_pool, pool=mp_pool))
+        stoppool.daemon = True
+        stoppool.start()
+        sys.stdout.write("Multiprocessing pool terminated\n")
+        sys.stdout.flush()
+
+    write_new_csv(input_file_new, url_dict_out)
+    print("Signal handler finished")
+    print("Received signal: ", signal.Signals(args[0]).name)
 
 
 if __name__ == "__main__":
     mode = MODE_BILIBILI
-    num_of_processes = 2
+    num_of_processes = 1
 
     input_dir = 'Collections'
     output_dir = 'Collections/temp_output'
@@ -84,21 +129,35 @@
         # 'output': "%(title)s.%(ext)s",
     }
 
-    url_dict = get_urls(input_dir, mode)
-    url_list = list(url_dict.keys())
+    input_file = os.path.join(input_dir, 'input_list_' + mode + '.csv')
+    input_file_new = os.path.join(input_dir, 'input_list_' + mode + '_new.csv')
+
+    url_dict, url_dict_out = get_urls(input_file)
+    # url_list = list(url_dict.keys())
+
+    on_error = functools.partial(error_handler, url_dict=url_dict, url_dict_out=url_dict_out)
+    on_success = functools.partial(success_handler, url_dict=url_dict, url_dict_out=url_dict_out)
 
-    on_error = functools.partial(error_handler, url_dict=url_dict)
-    on_success = functools.partial(success_handler, url_dict=url_dict)
+    pool = mp.Pool(processes=num_of_processes)
+
+    signal_func = functools.partial(signal_handler, mp_pool=pool,
+                                    url_dict=url_dict, url_dict_out=url_dict_out,
+                                    input_file_new=input_file_new)
+
+    for sig in (signal.SIGABRT, signal.SIGINT, signal.SIGTERM):
+        signal.signal(sig, signal_func)
 
     # with open(os.path.join(output_dir, 'logout.log'), 'w') as sys.stdout:
     # with open(os.path.join(output_dir, 'error.log'), 'w') as sys.stderr:
-    pool = mp.Pool(processes=num_of_processes)
+
     for k in url_dict.keys():
         # single_url_wrapper(mode, k, params, url_dict[k])
         url_dict[k]['status'] = IN_PROCESS
         pool.apply_async(single_url_wrapper, args=(mode, k, params, url_dict[k]),
                          error_callback=on_error, callback=on_success)
+        time.sleep(1)
     pool.close()
     pool.join()
 
+    write_new_csv(input_file_new, url_dict_out)
     report_status(url_dict)
diff --git a/utils.py b/utils.py
index 273dc3e..0646f97 100644
--- a/utils.py
+++ b/utils.py
@@ -6,6 +6,9 @@ import string
 import time
 import unicodedata
 
+from urllib.error import URLError
+from youtube_dl import DownloadError
+
 
 def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
     return ''.join(random.SystemRandom().choice(chars) for _ in range(size))
@@ -19,8 +22,14 @@ def retry_wrapper(func, max_retry, timeout=5):
         try:
             result = func()
             break
+        except DownloadError as e:
+            print(e)
+            time.sleep(5)
+            if not e.exc_info[0] is URLError:
+                n_retry += 1
         except Exception as e:
             print(e)
+            time.sleep(1)
             n_retry += 1
             if n_retry == max_retry:
                 ret_code = -1
@@ -41,7 +50,7 @@ def slugify(value, allow_unicode=True, simple=True):
     else:
         value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
     if simple:
-        value = re.sub(r'[/.]', '_', value)
+        value = re.sub(r'[/.:]', '_', value)
         return value
     else:
         value = re.sub(r'[^\w\s-]', '_', value).strip().lower()
diff --git a/youtube-dl_mod/bilibili.py.modified b/youtube-dl_mod/bilibili.py.modified
index 95dd63c..97085e6 100644
--- a/youtube-dl_mod/bilibili.py.modified
+++ b/youtube-dl_mod/bilibili.py.modified
@@ -15,6 +15,7 @@ from ..utils import (
     float_or_none,
     parse_iso8601,
     smuggle_url,
+    str_or_none,
     strip_jsonp,
     unified_timestamp,
     unsmuggle_url,
@@ -124,6 +125,7 @@ class BiliBiliIE(InfoExtractor):
 
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id') or mobj.group('id_bv')
+        print("video_id: ", video_id)
         anime_id = mobj.group('anime_id')
         page = mobj.group('page') or 1
         webpage = self._download_webpage(url, video_id)
@@ -172,49 +174,154 @@ class BiliBiliIE(InfoExtractor):
 
         entries = []
 
-        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
-        for num, rendition in enumerate(RENDITIONS, start=1):
-            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
-            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
-
-            video_info = self._download_json(
-                'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
-                video_id, note='Downloading video info page',
-                headers=headers, fatal=num == len(RENDITIONS))
-
-            if not video_info:
-                continue
-
-            if 'durl' not in video_info:
-                if num < len(RENDITIONS):
+        payload = 'bvid=%s&cid=%s' % (video_id, cid)
+        video_info = self._download_json(
+            'https://api.bilibili.com/x/player/playurl?%s&qn=116&type=&otype=json&fnver=0&fnval=16&fourk=1' % payload,
+            video_id, note='Downloading video info page',
+            headers=headers, fatal=True)
+
+        print("video_info: ", video_info)
+
+        video_info = video_info['data']
+
+        def find_num_segments(video_list, qualities):
+            max_segment_num = 0
+            for quality in qualities:
+                segment_num = sum(v.get('id') == quality for v in video_list)
+                if segment_num > max_segment_num:
+                    max_segment_num = segment_num
+            return max_segment_num
+
+        def sort_segments(video_info, qualities):
+            videos = {v['codecid']: [] for v in video_info['dash']['video']}
+            audios = {a['codecid']: [] for a in video_info['dash']['audio']}
+
+            for v in video_info['dash']['video']:
+                videos[v['codecid']].append(v)
+            for a in video_info['dash']['audio']:
+                audios[a['codecid']].append(a)
+            for qualities in videos.values():
+                qualities.sort(key=lambda x: x['id'], reverse=True)
+            for qualities in audios.values():
+                qualities.sort(key=lambda x: x['id'], reverse=True)
+            videos = list(i[1] for i in sorted(videos.items()))
+            audios = list(i[1] for i in sorted(audios.items()))
+
+            return videos, audios
+
+        if 'durl' in video_info:
+            # Use old API
+            print("durl")
+            RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
+            for num, rendition in enumerate(RENDITIONS, start=1):
+                payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
+                sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()
+
+                video_info = self._download_json(
+                    'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
+                    video_id, note='Downloading video info page',
+                    headers=headers, fatal=num == len(RENDITIONS))
+
+                if not video_info:
                     continue
-                self._report_error(video_info)
 
-            for idx, durl in enumerate(video_info['durl']):
-                formats = [{
-                    'url': durl['url'],
-                    'filesize': int_or_none(durl['size']),
-                }]
-                for backup_url in durl.get('backup_url', []):
+                if 'durl' not in video_info:
+                    if num < len(RENDITIONS):
+                        continue
+                    self._report_error(video_info)
+
+                for idx, durl in enumerate(video_info['durl']):
+                    formats = [{
+                        'url': durl['url'],
+                        'filesize': int_or_none(durl['size']),
+                    }]
+                    for backup_url in durl.get('backup_url', []):
+                        formats.append({
+                            'url': backup_url,
+                            # backup URLs have lower priorities
+                            'preference': -2 if 'hd.mp4' in backup_url else -3,
+                        })
+
+                    for a_format in formats:
+                        a_format.setdefault('http_headers', {}).update({
+                            'Referer': "https://www.bilibili.com/",
+                            'Origin': "https://www.bilibili.com",
+                        })
+
+                    self._sort_formats(formats)
+
+                    entries.append({
+                        'id': '%s_part%s' % (video_id, idx),
+                        'duration': float_or_none(durl.get('length'), 1000),
+                        'formats': formats,
+                    })
+                break
+            print("video_info_old: ", video_info)
+        elif 'dash' in video_info:
+            qualities = sorted(video_info['accept_quality'], reverse=True)
+            print("qualities: ", qualities)
+
+            # video_segment_num = find_num_segments(video_info['dash']['video'], qualities)
+            # audio_segment_num = find_num_segments(video_info['dash']['audio'], qualities)
+
+            # videos, audios = sort_segments(video_info, qualities)
+            formats = []
+
+            # Video
+            for v in video_info['dash']['video']:
+                formats.append({
+                    'url': v['baseUrl'],
+                    'vcodec': v['codecs'],
+                    'acodec': 'none',
+                    'width': v['width'],
+                    'height': v['height'],
+                    'quality': v['id'] + 1 if 'hev' in v['codecs'] else v['id']
+                })
+                if v.get('backupUrl', None) is not None:
                     formats.append({
-                        'url': backup_url,
+                        'url': v['backupUrl'],
                         # backup URLs have lower priorities
-                        'preference': -2 if 'hd.mp4' in backup_url else -3,
+                        'acodec': 'none',
+                        'quality': v['id'] + 1 if 'hev' in v['codecs'] else v['id'],
+                        'preference': -3,
                     })
 
-                for a_format in formats:
-                    a_format.setdefault('http_headers', {}).update({
-                        'Referer': url,
+            # Audio
+            for a in video_info['dash']['audio']:
+                formats.append({
+                    'url': a['baseUrl'],
+                    'vcodec': 'none',
+                    'acodec': a['codecs'],
+                    'quality': a['id'] - 30200
+                })
+                if a.get('backupUrl', None) is not None:
+                    formats.append({
+                        'url': a['backupUrl'],
+                        # backup URLs have lower priorities
+                        'vcodec': 'none',
+                        'quality': a['id'] - 30200,
+                        'preference': -3,
                     })
 
-                self._sort_formats(formats)
-
-                entries.append({
-                    'id': '%s_part%s' % (video_id, idx),
-                    'duration': float_or_none(durl.get('length'), 1000),
-                    'formats': formats,
+            for a_format in formats:
+                a_format.setdefault('http_headers', {}).update({
+                    'Referer': "https://www.bilibili.com/",
+                    'Origin': "https://www.bilibili.com",
+                    'Sec-Fetch-Site': "cross-site",
+                    'Sec-Fetch-Mode': "cors",
+                    'Sec-Fetch-Dest': "empty",
                 })
-            break
+
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': '%s' % (video_id),
+                'duration': float_or_none(video_info.get('timelength'), 1000),
+                'formats': formats,
+            })
+
+        else:
+            self._report_error(video_info)
 
         title = self._html_search_regex(
             ('