
Initial commit

Branch: master
Committed by wchen342, 3 years ago
Commit: 4ded652f5b
Signed by: wchen342 (GPG Key ID: 720B70365E800508)
  1. bilibili.py (264 lines)
  2. bilibili_search.py (209 lines)
  3. checksum.py (90 lines)
  4. main.py (0 lines)
  5. multifile_wrapper.py (104 lines)
  6. utils.py (53 lines)
  7. youtube-dl_mod/bilibili.py.modified (335 lines)

bilibili.py (264 lines added)

@@ -0,0 +1,264 @@
import functools
import json
import logging
import os
import random
import re
import shutil
import sys
import time
import argparse

import ffmpeg
import natsort
import youtube_dl

import utils
from youtube_dl.utils import DownloadError, UnavailableVideoError, MaxDownloadsReached

DEFAULT_VALUES = {
    'merge': True,
    'delete_flv': False,
    'max_retry': 5,
    'output': "%(title)s-|||||||-%(id)s.%(ext)s",
    'output_dir': os.getcwd(),
    'debug': False,
    "debug_tmp_dir_name": "",  # Not Implemented
}


class MyLogger(object):
    def __init__(self, debug=False):
        self._debug = debug

    def debug(self, msg):
        if self._debug:
            print(msg)

    def warning(self, msg):
        logging.warning(msg)

    def error(self, msg):
        print(msg, file=sys.stderr)


# TODO string replace \r
class Bilibili:
    def __init__(self, params, extra_params=None):
        self.url = params['url']
        self.merge = params.get('merge', DEFAULT_VALUES['merge'])
        self.delete_flv = params.get('delete_flv', DEFAULT_VALUES['delete_flv'])
        self.max_retry = params.get('max_retry', DEFAULT_VALUES['max_retry'])
        self.output_format = params.get('output', DEFAULT_VALUES['output'])
        self.output_dir = params.get('output_dir', DEFAULT_VALUES['output_dir'])
        self.debug = params.get('debug', DEFAULT_VALUES['debug'])
        self.p_start = None
        self.p_end = None
        if not os.path.exists(self.output_dir) or not os.path.isdir(self.output_dir):
            raise FileNotFoundError('Output path does not exist!')
        self.options = {
            'format': 'bestvideo+bestaudio/best',
            'logger': MyLogger(self.debug),
            'outtmpl': self.output_format,
        }
        # Deal with playlist
        if extra_params is not None and extra_params.get('p_start', '') and extra_params.get('p_end', ''):
            self.p_start = extra_params['p_start']
            self.p_end = extra_params['p_end']

    def process_all(self):
        cwd = os.getcwd()
        ret_code = 0
        # Expand playlist
        this_url_list = self.expand_playlist_urls(self.url)
        for i in range(len(this_url_list)):
            url = this_url_list[i]
            if self.p_start is not None:
                p_num = i + int(self.p_start)
                print("Playlist Num: " + str(p_num))
            else:
                p_num = 0
            # Get into temp dir
            output_dir = os.path.join(self.output_dir, 'tmp_' + utils.id_generator(8))
            if os.path.exists(output_dir):
                raise FileExistsError('tmp path already exists!')
            os.mkdir(output_dir)
            os.chdir(output_dir)
            # Download, merge
            func = functools.partial(self.process_single, url, output_dir, p_num)
            ret, res = utils.retry_wrapper(func, max_retry=self.max_retry)
            # Get out of tmp folder
            os.chdir(cwd)
            # Rename tmp folder
            if ret == 0:
                target_dir = os.path.join(self.output_dir, self.prepare_output_filename(res, p_num))
                if os.path.exists(target_dir):
                    shutil.rmtree(target_dir)
                if not self.delete_flv:
                    shutil.copytree(output_dir, target_dir)
            else:
                ret_code = -1
            shutil.rmtree(output_dir)
        return ret_code, self

    def process_single(self, url, output_dir, p_num):
        ret, res = self.download(url)
        if ret != 0:
            raise RuntimeError("Download unsuccessful")
        file_list = self.get_file_list(output_dir, res)
        if self.merge:
            ret = self.concat_videos(file_list, output_dir, res, p_num)
            if ret[1] is not None:
                raise RuntimeError("Convert/concat unsuccessful")
        return res

    def download(self, url):
        ret = 0
        with youtube_dl.YoutubeDL(self.options) as ydl:
            try:
                res = ydl.extract_info(url, force_generic_extractor=ydl.params.get('force_generic_extractor', False))
            except UnavailableVideoError:
                print("Failed video URL: " + url)
                ydl.report_error('unable to download video')
            except MaxDownloadsReached:
                print("Failed video URL: " + url)
                ydl.to_screen('[info] Maximum number of downloaded files reached.')
                raise
            except DownloadError as e:
                print("Failed video URL: " + url)
                raise e
            except Exception as e:
                print("Failed video URL: " + url)
                raise e
            else:
                if ydl.params.get('dump_single_json', False):
                    ydl.to_stdout(json.dumps(res))
                res['output_filename'] = ydl.prepare_filename(res)
            ret = ydl._download_retcode
        return ret, res

    def get_file_list(self, output_dir, res):
        ext = res.get('ext', 'flv')
        file_list = [f for f in os.listdir(output_dir) if
                     os.path.isfile(os.path.join(output_dir, f)) and os.path.splitext(f)[-1] == os.path.extsep + ext]
        # and os.path.splitext(f)[0].startswith(title)]
        # Eliminate ' in filenames
        for i in range(len(file_list)):
            if "'" in file_list[i]:
                old_file_path = os.path.join(output_dir, file_list[i])
                file_list[i] = file_list[i].replace("'", '_')
                new_file_path = os.path.join(output_dir, file_list[i])
                shutil.move(old_file_path, new_file_path)
        file_list = natsort.natsorted(file_list)
        if len(file_list) == 0:
            raise FileNotFoundError("Empty file list")
        if self.debug:
            print('\n'.join(file_list))
        return file_list

    def concat_videos(self, file_list, output_dir, res, p_num):
        # For ffmpeg concat demuxer, mp4 output
        ret = 0
        tmp_file = os.path.join(output_dir, 'temp_filelist.txt')
        with open(tmp_file, 'w') as f:
            for file in file_list:
                # Deal with space in cmdline
                f.write('file ' + "'" + file + "'" + '\n')
        if self.debug:
            with open(tmp_file, 'r') as f:
                print(''.join(f.readlines()))
        stream = ffmpeg.input(tmp_file, format='concat', safe=0)
        stream = ffmpeg.output(stream, os.path.join(self.output_dir,
                                                    self.prepare_output_filename(res, p_num) + os.path.extsep + 'mp4'),
                               c='copy')
        if self.debug:
            print(ffmpeg.compile(stream, overwrite_output=True))
        ret = ffmpeg.run(stream, overwrite_output=True)
        os.remove(tmp_file)
        return ret

    def prepare_output_filename(self, res, count):
        # title = utils.slugify(res['title'], allow_unicode=True, simple=True)
        # title = res['title']
        title = os.path.splitext(res['output_filename'])[0]
        title = title.split('-|||||||-')[0]  # Arbitrary split, since _part_n is in %id
        title = utils.slugify(title, allow_unicode=True, simple=True)
        if self.p_start is not None and self.p_end is not None:
            if count < int(self.p_start) or count > int(self.p_end):
                raise RuntimeError("Count number outside playlist range!")
            filename = title + '_' + str(count)
        else:
            filename = title
        return filename

    def expand_playlist_urls(self, url):
        if self.p_start is None or self.p_end is None:
            return [url]
        else:
            _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)(?:/?\?p=(?P<page>\d+))?'
            mobj = re.match(_VALID_URL, url)
            video_id = mobj.group('id')
            anime_id = mobj.group('anime_id')
            page = mobj.group('page') or 1
            url_list = []
            pos = mobj.regs[3]
            # Get last position of video id, matching group 2
            v_pos = mobj.regs[2]
            base_url = url[:v_pos[1] + 1]
            if not base_url:
                raise RuntimeError("Regex matching failed")
            for i in range(int(self.p_start), int(self.p_end) + 1):
                # We know 'page' is matching group 3
                if pos[0] == -1 or pos[1] == -1:  # No p= part
                    new_url = base_url + '/?p=' + str(i)
                else:  # Has p= part
                    url_part1 = url[:pos[0]]
                    url_part2 = url[pos[1]:]
                    new_url = url_part1 + str(i) + url_part2
                url_list.append(new_url)
            print()
            return url_list


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Download and merge single video from bilibili')
    parser.add_argument('url', nargs='?', type=str, default="", help="url of the webpage")
    parser.add_argument('-m', '--merge', action='store_true', help="Whether merge the resulting flv files")
    parser.add_argument('-c', '--delete-flv', action='store_true', help="Whether delete intermediate flv files")
    parser.add_argument('-o', '--output', type=str, default=DEFAULT_VALUES['output'], help="Output format")
    parser.add_argument('-d', '--output-dir', type=str, default=DEFAULT_VALUES['output_dir'], help="Output directory")
    parser.add_argument('--max-retry', type=int, default=DEFAULT_VALUES['max_retry'], help="How many times to retry if fails")
    parser.add_argument('--debug', action='store_true', help="Debug Mode")
    parser.add_argument('--debug_tmp_dir_name', type=str, default=DEFAULT_VALUES['debug_tmp_dir_name'], help="Fixed tmp dir name for debugging")
    args = parser.parse_args()

    params = {"url": args.url,
              "merge": args.merge,
              "output": args.output,
              "output_dir": args.output_dir,
              "delete_flv": args.delete_flv,
              "max_retry": args.max_retry,
              "debug": args.debug,
              "debug_tmp_dir_name": args.debug_tmp_dir_name,
              }
    acc = Bilibili(params)
    acc.process_all()
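
Note: for reference, a minimal sketch of driving the Bilibili class directly with the playlist parameters that multifile_wrapper.py normally supplies as extra_params. The URL, output directory, and the p_start/p_end values below are made-up examples, not something the commit ships.

# Minimal sketch (assumed values): download parts 1-3 of a multi-part video.
# p_start/p_end go in extra_params, mirroring how multifile_wrapper.py calls Bilibili.
from bilibili import Bilibili

params = {
    "url": "https://www.bilibili.com/video/av19608864",  # hypothetical example URL
    "merge": True,
    "delete_flv": True,
    "output_dir": "Collections/temp_output",  # assumed to exist already
}
extra_params = {"p_start": "1", "p_end": "3"}  # expand_playlist_urls appends ?p=1 .. ?p=3

downloader = Bilibili(params, extra_params=extra_params)
ret_code, _ = downloader.process_all()
print("Done" if ret_code == 0 else "Some parts failed")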

bilibili_search.py (209 lines added)

@@ -0,0 +1,209 @@
import copy
import csv
import functools
import os
import re
import time
import urllib.parse
from collections import OrderedDict
from logging import warning

import natsort
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys

# assert "Python" in driver.title
# elem = driver.find_element_by_name("q")
# elem.clear()
# elem.send_keys("pycon")
# elem.send_keys(Keys.RETURN)
# assert "No results found." not in driver.page_source
# # driver.close()
from utils import retry_wrapper, slugify


class BrowserSearch(object):
    _video_base_url = "https://www.bilibili.com/video/"

    def __init__(self, query, search_mode='video', mode="chrome", headless=False):
        self.driver = None
        self.set_driver(mode, headless)
        if not isinstance(query, str):
            raise TypeError("Query is not string.")
        if not query:
            warning("Query string is empty!")
        self.query = query
        self.search_base_url = 'https://search.bilibili.com/' + search_mode + '?keyword='

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.driver is not None:
            self.driver.close()

    def set_driver(self, mode, headless):
        if mode == "chrome":
            options = webdriver.ChromeOptions()
            options.add_argument('--js-flags=--noexpose_wasm')
            if headless:
                options.add_argument("--headless")
            self.driver = webdriver.Chrome(options=options)
        elif mode == "firefox":
            raise NotImplementedError
        else:
            raise NotImplementedError

    def init_search(self):
        url = self.search_base_url + urllib.parse.quote(self.query)
        self.driver.get(url)
        if "搜索结果" not in self.driver.title:
            raise ConnectionError("Error loading webpage")
        # Wait until AJAX finish loading
        wait = WebDriverWait(self.driver, timeout=20)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.video.matrix')))

    def analyze_cur_page(self, secondary_keywords=None, exclude_keywords=None, prev_result=None):
        if prev_result is None:
            links_out = OrderedDict()
        elif not isinstance(prev_result, dict):
            raise TypeError("Result from previous run is not Dictionary.")
        else:
            links_out = copy.deepcopy(prev_result)
        video_links = self.driver.find_elements_by_css_selector(".video.matrix")
        for video in video_links:
            a_elem = video.find_elements_by_class_name("title")
            if len(a_elem) != 1:
                warning("Incorrect number of <a> elements!")
            title = a_elem[0].get_attribute("title")
            if secondary_keywords is not None:
                if all(keyword not in title for keyword in secondary_keywords):
                    continue
            if exclude_keywords is not None:
                if any(keyword in title for keyword in exclude_keywords):
                    continue
            screenshot = video.screenshot_as_png
            # innerText, innerHTML, textContent
            id = video.find_elements_by_css_selector(".type.avid")[0].get_attribute("innerText")
            assert re.search(r'^av[0-9]{5,8}\Z', id) is not None
            v_link = BrowserSearch._video_base_url + id
            description = video.find_elements_by_css_selector(".des.hide")[0].get_attribute("innerText")
            duration = video.find_elements_by_css_selector(".so-imgTag_rb")[0].get_attribute("innerText")
            upload_time = video.find_elements_by_css_selector(".so-icon.time")[0].get_attribute("innerText")
            uploader = video.find_elements_by_class_name("up-name")[0].get_attribute("innerText")
            links_out[id] = {
                "id": id,
                "title": title,
                "link": v_link,
                "description": description,
                "duration": duration,
                "upload_time": upload_time,
                "uploader": uploader,
                "screenshot": screenshot,
            }
        return links_out

    def get_page_count(self):
        last_page_link = self.driver.find_elements_by_css_selector(".page-item.last")
        if len(last_page_link) > 0:
            # Pages in the middle are hidden
            page_count = int(last_page_link[0].get_attribute("innerText"))
        else:
            # Not hidden
            page_links = self.driver.find_elements_by_css_selector(".page-item:not(.next):not(.prev)")
            page_count = len(page_links)
        return page_count

    def nav_retry_wrapper(self, func, mode='refresh'):
        try:
            func()
        except Exception as e:
            if mode == 'refresh':
                self.driver.refresh()
            elif mode == 'back':
                self.driver.back()
            print(e)
            raise e

    def next_page(self):
        next_page_link = self.driver.find_elements_by_css_selector(".page-item.next")
        if len(next_page_link) > 0:
            next_page_link[0].click()
        else:
            warning("No next page found!")
            return
        # Wait until AJAX finish loading
        wait = WebDriverWait(self.driver, timeout=20)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.page-item')))

    def prev_page(self):
        prev_page_link = self.driver.find_elements_by_css_selector(".page-item.prev")
        if len(prev_page_link) > 0:
            prev_page_link[0].click()
        else:
            warning("No previous page found!")
            return
        # Wait until AJAX finish loading
        wait = WebDriverWait(self.driver, timeout=20)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.page-item')))

    def goto_page(self, page_num):
        num_pages = self.get_page_count()
        assert page_num <= num_pages
        url = "https://search.bilibili.com/video?keyword=" + urllib.parse.quote(self.query)
        url += "&page=" + str(page_num)
        self.driver.get(url)
        # Wait until AJAX finish loading
        wait = WebDriverWait(self.driver, timeout=20)
        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.page-item')))


if __name__ == "__main__":
    output_dir = 'Collections/crawl_out/Pyxis'
    query_str = "Pyxis的闪亮大作战!"
    secondary_keywords = None
    exclude_keywords = None
    # Need to repeat switching between all and video, since each search will not return all results
    # Didn't work
    dict_out = None
    # search_mode = {True: 'all', False: 'video'}
    # all_mode = False
    # for i in range(5):
    with BrowserSearch(query_str, headless=True) as driver:
        retry_wrapper(driver.init_search, max_retry=5, timeout=5)
        num_pages = max(1, driver.get_page_count())
        for j in range(num_pages):
            dict_out = driver.analyze_cur_page(secondary_keywords=secondary_keywords, exclude_keywords=exclude_keywords,
                                               prev_result=dict_out)
            retry_wrapper(functools.partial(driver.nav_retry_wrapper, driver.next_page), max_retry=5, timeout=3)
            # all_mode = not all_mode
            # time.sleep(5)

    list_out = natsort.natsorted(dict_out.values(), key=lambda v: v['title'])  # Natural sort by title

    # Write images
    img_out_path = os.path.join(output_dir, 'img')
    if not os.path.exists(img_out_path):
        os.makedirs(img_out_path, exist_ok=True)
    for v in list_out:
        title = slugify(v["title"])
        path = os.path.join(img_out_path, title + os.path.extsep + 'png')
        with open(path, 'wb') as f:
            f.write(v["screenshot"])
        del v["screenshot"]

    # Write csv
    csv_path = os.path.join(output_dir, 'links.csv')
    header_list = list(list_out[0].keys())
    header_list[:] = [x for x in header_list if x != "description" and x != "id"]
    with open(csv_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(list_out)
    print()

checksum.py (90 lines added)

@@ -0,0 +1,90 @@
import argparse
import csv
import hashlib
import os

import natsort
from natsort.ns_enum import ns

BUF_SIZE = 65536


def scantree(path):
    """Recursively yield DirEntry objects for given directory. Files only."""
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            yield from scantree(entry.path)
        else:
            yield entry


def hashing(file, alg='blake2b'):
    hash_func = getattr(hashlib, alg, hashlib.blake2b)()
    if not os.path.exists(file):
        raise FileNotFoundError
    with open(file, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            hash_func.update(data)
    return hash_func.hexdigest()


def hash_dir(input_dir, output_dir, alg='blake2b'):
    # List files
    output = []
    # for path, dirs, files in os.walk(input_dir):
    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scantree(input_dir))
    total_file_num = sum(1 for _ in scantree(input_dir))
    processed_file_size = 0
    processed_file_num = 0
    for dir_entry in scantree(input_dir):
        full_path = dir_entry.path
        file_name = dir_entry.name
        relative_path = os.path.relpath(full_path, input_dir)
        file_hash = hashing(full_path, alg=alg)
        output.append({
            "filename": file_name,
            # "full_path": full_path,
            "rel_path": relative_path,
            "checksum": file_hash,
        })
        processed_file_size += os.path.getsize(full_path)
        processed_file_num += 1
        # Print progress every 100 files
        if processed_file_num % 100 == 0:
            print("Now at %f%%. %d/%d bytes of files processed. " % (
                processed_file_size / total_file_size * 100, processed_file_size, total_file_size))
    list_out = natsort.natsorted(output, key=lambda v: v['rel_path'], alg=ns.IC)
    csv_path = os.path.join(output_dir, 'checksum.csv')
    header_list = list(output[0].keys())
    with open(csv_path, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(list_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Output checksum of each file under a directory')
    parser.add_argument('-i', '--input-dir', type=str, help="Input directory")
    parser.add_argument('-o', '--output-dir', type=str, help="Output directory")
    parser.add_argument('-m', '--alg', type=str, default='blake2b', help="Hashing algorithm")
    args = parser.parse_args()
    hash_dir(args.input_dir, args.output_dir, alg=args.alg)

    # # DEBUG
    # input_dir = 'Collections/crawl_out'
    # output_dir = 'Collections'
    # hash_dir(input_dir, output_dir, alg='blake2b')
    # print("{0}: {1}".format(alg, hashing(file, alg=alg)))

main.py (0 lines, empty file)

multifile_wrapper.py (104 lines added)

@@ -0,0 +1,104 @@
import copy
import csv
import functools
import os
import multiprocessing as mp
import traceback
from collections import OrderedDict

from bilibili import Bilibili
import utils

NOT_STARTED = 'Not Started'
IN_PROCESS = 'In Process'
SUCCESS = 'Successful'

MODE_BILIBILI = 'bilibili'

processor = {
    MODE_BILIBILI: Bilibili,
}


def report_status(url_dict):
    print("Status Report:")
    for k, v in url_dict.items():
        print('"' + k + '": ' + v['status'])


def get_urls(input_dir, mode):
    input_file = os.path.join(input_dir, 'input_list_' + mode + '.csv')
    if not os.path.exists(input_file):
        raise FileNotFoundError("input url file not exist")
    url_list = None
    with open(input_file, 'r') as f:
        reader = csv.DictReader(f)
        url_list = [dict(d) for d in reader]
    url_dict = OrderedDict()
    url_keys = list(url_list[0].keys())
    assert url_keys[0] == 'URL'
    for url_set in url_list:
        key = url_set['URL']
        value = {k: url_set[k] for k in url_keys[1:]}
        value['status'] = NOT_STARTED
        url_dict[key] = value
    return url_dict


def success_handler(result, url_dict=None):
    ret_code = result[0]
    url = result[1].url
    if ret_code == 0:
        url_dict[url]['status'] = SUCCESS
    report_status(url_dict)
    # print('success')


def error_handler(e, url_dict=None):
    # print('error')
    # print(dir(e), "\n")
    print("-->{}<--".format(e.__cause__))
    traceback.print_exception(type(e), e, e.__traceback__)


def single_url_wrapper(mode, url, params, this_url_info):
    # One url per process
    params = copy.deepcopy(params)
    params["url"] = url
    acc = processor[mode](params, copy.deepcopy(this_url_info))
    ret = acc.process_all()
    return ret


if __name__ == "__main__":
    mode = MODE_BILIBILI
    num_of_processes = 5
    input_dir = 'Collections'
    output_dir = 'Collections/temp_output'
    params = {"merge": True,
              "debug": True,
              "delete_flv": True,
              "output_dir": output_dir,
              # 'output': "%(title)s.%(ext)s",
              }
    url_dict = get_urls(input_dir, mode)
    url_list = list(url_dict.keys())
    on_error = functools.partial(error_handler, url_dict=url_dict)
    on_success = functools.partial(success_handler, url_dict=url_dict)
    # with open(os.path.join(output_dir, 'logout.log'), 'w') as sys.stdout:
    # with open(os.path.join(output_dir, 'error.log'), 'w') as sys.stderr:
    pool = mp.Pool(processes=num_of_processes)
    for k in url_dict.keys():
        # single_url_wrapper(mode, k, params, url_dict[k])
        url_dict[k]['status'] = IN_PROCESS
        pool.apply_async(single_url_wrapper, args=(mode, k, params, url_dict[k]),
                         error_callback=on_error, callback=on_success)
    pool.close()
    pool.join()
    report_status(url_dict)
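
Note: the CSV consumed by get_urls is not part of the commit; the sketch below writes one in the shape the code expects (first column named URL, any further columns forwarded to the processor as extra_params). The p_start/p_end column names are the only extra keys Bilibili actually reads; the URLs and values here are assumed examples.

# Hypothetical helper: produce Collections/input_list_bilibili.csv for get_urls().
import csv

rows = [
    {"URL": "https://www.bilibili.com/video/av1074402", "p_start": "", "p_end": ""},     # single video
    {"URL": "https://www.bilibili.com/video/av19608864", "p_start": "1", "p_end": "3"},  # parts 1-3
]

with open("Collections/input_list_bilibili.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["URL", "p_start", "p_end"])
    writer.writeheader()
    writer.writerows(rows)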

utils.py (53 lines added)

@@ -0,0 +1,53 @@
import calendar
import datetime
import random
import re
import string
import time
import unicodedata


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.SystemRandom().choice(chars) for _ in range(size))


def retry_wrapper(func, max_retry, timeout=5):
    result = None
    ret_code = 0
    n_retry = 0
    while n_retry < max_retry:
        try:
            result = func()
            break
        except Exception as e:
            print(e)
            n_retry += 1
            if n_retry == max_retry:
                ret_code = -1
            time.sleep(timeout)
    return ret_code, result


def slugify(value, allow_unicode=True, simple=True):
    """
    Modified from Django
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    if simple:
        value = re.sub(r'[/.]', '_', value)
        return value
    else:
        value = re.sub(r'[^\w\s-]', '_', value).strip().lower()
        return re.sub(r'[-\s]+', '-', value)


def get_day_of_week(year, month, day):
    date = datetime.date(year, month, day)
    return calendar.day_name[date.weekday()]

youtube-dl_mod/bilibili.py.modified (335 lines added)

@@ -0,0 +1,335 @@
# coding: utf-8
from __future__ import unicode_literals

import hashlib
import re

from .common import InfoExtractor
from ..compat import (
    compat_parse_qs,
    compat_urlparse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    float_or_none,
    parse_iso8601,
    smuggle_url,
    strip_jsonp,
    unified_timestamp,
    unsmuggle_url,
    urlencode_postdata,
)


class BiliBiliIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)(?:/?\?p=(?P<page>\d+))?'

    _TESTS = [{
        'url': 'http://www.bilibili.tv/video/av1074402/',
        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
        'info_dict': {
            'id': '1074402',
            'ext': 'flv',
            'title': '【金坷垃】金泡沫',
            'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
            'duration': 308.067,
            'timestamp': 1398012678,
            'upload_date': '20140420',
            'thumbnail': r're:^https?://.+\.jpg',
            'uploader': '菊子桑',
            'uploader_id': '156160',
        },
    }, {
        # Tested in BiliBiliBangumiIE
        'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
        'only_matching': True,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
        'md5': '3f721ad1e75030cc06faf73587cfec57',
        'info_dict': {
            'id': '100643',
            'ext': 'mp4',
            'title': 'CHAOS;CHILD',
            'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
        },
        'skip': 'Geo-restricted to China',
    }, {
        # Title with double quotes
        'url': 'http://www.bilibili.com/video/av8903802/',
        'info_dict': {
            'id': '8903802',
            'title': '阿滴英文|英文歌分享#6 "Closer',
            'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文',
        },
        'playlist': [{
            'info_dict': {
                'id': '8903802_part1',
                'ext': 'flv',
                'title': '阿滴英文|英文歌分享#6 "Closer',
                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
                'uploader': '阿滴英文',
                'uploader_id': '65880958',
                'timestamp': 1488382634,
                'upload_date': '20170301',
            },
            'params': {
                'skip_download': True,  # Test metadata only
            },
        }, {
            'info_dict': {
                'id': '8903802_part2',
                'ext': 'flv',
                'title': '阿滴英文|英文歌分享#6 "Closer',
                'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a',
                'uploader': '阿滴英文',
                'uploader_id': '65880958',
                'timestamp': 1488382634,
                'upload_date': '20170301',
            },
            'params': {
                'skip_download': True,  # Test metadata only
            },
        }]
    }, {
        # Test pages
        'url': 'http://www.bilibili.com/video/av19608864/?p=3',
        'md5': 'e0c3368a3a1d53ac5892da1abe57887d',
        'info_dict': {
            'id': '19608864',
            'ext': 'flv',
            'title': '【iu】【penta kill】第七届Gaon Chart K-POP Awards~各种屏录~',
            'description': 'md5:d9f59c5658f4f2eafc04a191e1ab0828',
            'duration': 100.264,
            'timestamp': 1518671057,
            'upload_date': '20180215',
            'thumbnail': r're:^https?://.+\.jpg',
            'uploader': '普通的柒月初叁',
            'uploader_id': '260251749',
        },
    }]

    _APP_KEY = 'iVGUTjsxvpLeuDCf'
    _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'

    def _report_error(self, result):
        if 'message' in result:
            raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
        elif 'code' in result:
            raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
        else:
            raise ExtractorError('Can\'t extract Bangumi episode ID')

    def _real_extract(self, url):
        url, smuggled_data = unsmuggle_url(url, {})

        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        anime_id = mobj.group('anime_id')
        page = mobj.group('page') or 1
        webpage = self._download_webpage(url, video_id)

        if 'anime/' not in url:
            mobj = re.findall(r'cid(?:["\']:|=)(\d+)', webpage)
            seen_mobj = set()
            mobj = [c for c in mobj if int(c) > 10 and not (c in seen_mobj or seen_mobj.add(c))]
            print(mobj)
            cid = mobj[int(page) - 1] or \
                compat_parse_qs(self._search_regex([
                    r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
                    r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
                    r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
                    webpage, 'player parameters'))['cid'][0]
            # cid = self._search_regex(
            #     r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid',
            #     default=None
            # ) or compat_parse_qs(self._search_regex(
            #     [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
            #      r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)',
            #      r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
            #     webpage, 'player parameters'))['cid'][0]
        else:
            if 'no_bangumi_tip' not in smuggled_data:
                self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % (
                    video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
            headers = {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': url
            }
            headers.update(self.geo_verification_headers())

            js = self._download_json(
                'http://bangumi.bilibili.com/web_api/get_source', video_id,
                data=urlencode_postdata({'episode_id': video_id}),
                headers=headers)
            if 'result' not in js:
                self._report_error(js)
            cid = js['result']['cid']

        headers = {
            'Referer': url
        }
        headers.update(self.geo_verification_headers())

        entries = []

        RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4')
        for num, rendition in enumerate(RENDITIONS, start=1):
            payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition)
            sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest()

            video_info = self._download_json(
                'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign),
                video_id, note='Downloading video info page',
                headers=headers, fatal=num == len(RENDITIONS))

            if not video_info:
                continue

            if 'durl' not in video_info:
                if num < len(RENDITIONS):
                    continue
                self._report_error(video_info)

            for idx, durl in enumerate(video_info['durl']):
                formats = [{
                    'url': durl['url'],
                    'filesize': int_or_none(durl['size']),
                }]
                for backup_url in durl.get('backup_url', []):
                    formats.append({
                        'url': backup_url,
                        # backup URLs have lower priorities
                        'preference': -2 if 'hd.mp4' in backup_url else -3,
                    })

                for a_format in formats:
                    a_format.setdefault('http_headers', {}).update({
                        'Referer': url,
                    })

                self._sort_formats(formats)

                entries.append({
                    'id': '%s_part%s' % (video_id, idx),
                    'duration': float_or_none(durl.get('length'), 1000),
                    'formats': formats,
                })
            break

        title = self._html_search_regex(
            ('<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
             '(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
            group='title')
        description = self._html_search_meta('description', webpage)
        timestamp = unified_timestamp(self._html_search_regex(
            r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time',
            default=None) or self._html_search_meta(
            'uploadDate', webpage, 'timestamp', default=None))
        thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)

        # TODO 'view_count' requires deobfuscating Javascript

        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'thumbnail': thumbnail,
            'duration': float_or_none(video_info.get('timelength'), scale=1000),
        }

        uploader_mobj = re.search(
            r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>(?P<name>[^<]+)',
            webpage)
        if uploader_mobj:
            info.update({
                'uploader': uploader_mobj.group('name'),
                'uploader_id': uploader_mobj.group('id'),
            })
        if not info.get('uploader'):
            info['uploader'] = self._html_search_meta(
                'author', webpage, 'uploader', default=None)

        for entry in entries:
            entry.update(info)

        if len(entries) == 1:
            return entries[0]
        else:
            for idx, entry in enumerate(entries):
                entry['id'] = '%s_part%d' % (video_id, (idx + 1))

            return {
                '_type': 'multi_video',
                'id': video_id,
                'title': title,
                'description': description,
                'entries': entries,
            }


class BiliBiliBangumiIE(InfoExtractor):
    _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'

    IE_NAME = 'bangumi.bilibili.com'
    IE_DESC = 'BiliBili番剧'

    _TESTS = [{
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist_count': 26,
    }, {
        'url': 'http://bangumi.bilibili.com/anime/1869',
        'info_dict': {
            'id': '1869',
            'title': '混沌武士',
            'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
        },
        'playlist': [{
            'md5': '91da8621454dd58316851c27c68b0c13',
            'info_dict': {
                'id': '40062',
                'ext': 'mp4',
                'title': '混沌武士',
                'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
                'timestamp': 1414538739,
                'upload_date': '20141028',
                'episode': '疾风怒涛 Tempestuous Temperaments',
                'episode_number': 1,
            },
        }],
        'params': {
            'playlist_items': '1',
        },
    }]

    @classmethod
    def suitable(cls, url):
        return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)

    def _real_extract(self, url):
        bangumi_id = self._match_id(url)

        # Sometimes this API returns a JSONP response
        season_info = self._download_json(
            'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
            bangumi_id, transform_source=strip_jsonp)['result']

        entries = [{
            '_type': 'url_transparent',
            'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
            'ie_key': BiliBiliIE.ie_key(),
            'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
            'episode': episode.get('index_title'),
            'episode_number': int_or_none(episode.get('index')),
        } for episode in season_info['episodes']]

        entries = sorted(entries, key=lambda entry: entry.get('episode_number'))

        return self.playlist_result(
            entries, bangumi_id,
            season_info.get('bangumi_title'), season_info.get('evaluate'))
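
Note: the .modified suffix suggests this file is meant to replace the stock bilibili extractor inside an installed youtube-dl. The snippet below is only a guess at the intended workflow under that assumption; the commit itself does not document an install step.

# Hypothetical install step: overwrite the bundled extractor with the modified copy.
import shutil
import youtube_dl.extractor.bilibili as stock_extractor

dst = stock_extractor.__file__  # path of the installed youtube_dl/extractor/bilibili.py
shutil.copyfile("youtube-dl_mod/bilibili.py.modified", dst)
print("Replaced", dst)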