
Search nP support

master
wchen342 committed 3 years ago
parent commit 2e13fedf71
Signed by: wchen342 (GPG Key ID: 720B70365E800508)
bilibili_search.py: 199 changed lines

@@ -4,9 +4,12 @@ import functools
 import os
 import re
 import time
+import traceback
 import urllib
+import multiprocessing as mp
 from collections import OrderedDict
 from logging import warning
+from queue import Empty

 import natsort
 from selenium import webdriver
@@ -15,20 +18,13 @@ from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.common.keys import Keys

-# assert "Python" in driver.title
-# elem = driver.find_element_by_name("q")
-# elem.clear()
-# elem.send_keys("pycon")
-# elem.send_keys(Keys.RETURN)
-# assert "No results found." not in driver.page_source
-# # driver.close()
 from utils import retry_wrapper, slugify


 class BrowserSearch(object):
     _video_base_url = "https://www.bilibili.com/video/"

-    def __init__(self, query, search_mode='video', mode="chrome", headless=False):
+    def __init__(self, query, search_mode='video', mode="chrome", headless=False, queue=None, lock=None):
         self.driver = None
         self.set_driver(mode, headless)
         if not isinstance(query, str):
@@ -37,6 +33,12 @@ class BrowserSearch(object):
             warning("Query string is empty!")
         self.query = query
         self.search_base_url = 'https://search.bilibili.com/' + search_mode + '?keyword='
+        if lock is None:
+            raise RuntimeError("Lock object does not exist")
+        self.lock = lock
+        if queue is None:
+            raise RuntimeError("Queue object does not exist")
+        self.queue = queue

     def __enter__(self):
         return self
@@ -66,14 +68,7 @@ class BrowserSearch(object):
         wait = WebDriverWait(self.driver, timeout=20)
         wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.video.matrix')))

-    def ayalyze_cur_page(self, secondary_keywords=None, exclude_keywords=None, prev_result=None):
-        if prev_result is None:
-            links_out = OrderedDict()
-        elif not isinstance(prev_result, dict):
-            raise TypeError("Result from previous run is not Dictionary.")
-        else:
-            links_out = copy.deepcopy(prev_result)
-
+    def ayalyze_cur_page(self, secondary_keywords=None, exclude_keywords=None):
         video_links = self.driver.find_elements_by_css_selector(".video.matrix")
         for video in video_links:
             a_elem = video.find_elements_by_class_name("title")
@@ -97,7 +92,8 @@ class BrowserSearch(object):
             duration = video.find_elements_by_css_selector(".so-imgTag_rb")[0].get_attribute("innerText")
             upload_time = video.find_elements_by_css_selector(".so-icon.time")[0].get_attribute("innerText")
             uploader = video.find_elements_by_class_name("up-name")[0].get_attribute("innerText")
-            links_out[id] = {
+            # links_out[id] = {
+            out = {
                 "id": id,
                 "title": title,
                 "link": v_link,
@@ -106,8 +102,10 @@ class BrowserSearch(object):
                 "upload_time": upload_time,
                 "uploader": uploader,
                 "screenshot": screenshot,
+                "pages": 0,
             }
-        return links_out
+            with self.lock:
+                self.queue.put_nowait(out)

     def get_page_count(self):
         last_page_link = self.driver.find_elements_by_css_selector(".page-item.last")
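With this change ayalyze_cur_page no longer returns a dict: each hit is pushed onto a shared queue that the sub-page workers drain, and a status flag is flipped when the producer finishes. A minimal sketch of that handoff using the same Manager primitives as the diff (names here are illustrative, not from the repo):

    import multiprocessing as mp
    import queue as q

    def producer(queue, lock, status):
        for n in range(3):
            with lock:                    # guard puts with the shared lock, as the diff does
                queue.put_nowait({"id": n})
        status.value = 0                  # signal "no more results", like search_main

    def consumer(queue, lock, status):
        while True:
            with lock:                    # exit only when producer is done AND queue is drained
                if status.value == 0 and queue.qsize() == 0:
                    return
            try:
                item = queue.get_nowait()
            except q.Empty:
                continue
            print("got", item)

    if __name__ == "__main__":
        mg = mp.Manager()
        queue, lock, status = mg.Queue(), mg.Lock(), mg.Value('b', 1)
        p = mp.Process(target=producer, args=(queue, lock, status))
        c = mp.Process(target=consumer, args=(queue, lock, status))
        p.start(); c.start(); p.join(); c.join()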
@@ -164,28 +162,149 @@ class BrowserSearch(object):
         wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.page-item')))


-if __name__ == "__main__":
-    output_dir = 'Collections/crawl_out/Pyxis'
-    query_str = "Pyxis的闪亮大作战!"
-    secondary_keywords = None
-    exclude_keywords = None
+class SubPageSearch(object):
+    def __init__(self, mode="chrome", headless=False, queue=None, dict_out=None, lock=None):
+        self.driver = None
+        self.set_driver(mode, headless)
+        if queue is None:
+            raise RuntimeError("Queue object does not exist")
+        self.queue = queue
+        if dict_out is None:
+            raise RuntimeError("Dict object does not exist")
+        self.dict_out = dict_out
+        if lock is None:
+            raise RuntimeError("Lock object does not exist")
+        self.lock = lock
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.driver is not None:
+            self.driver.close()
+
+    def set_driver(self, mode, headless):
+        if mode == "chrome":
+            options = webdriver.ChromeOptions()
+            options.add_argument('--js-flags=--noexpose_wasm')
+            if headless:
+                options.add_argument("--headless")
+            self.driver = webdriver.Chrome(options=options)
+        elif mode == "firefox":
+            raise NotImplementedError
+        else:
+            raise NotImplementedError
+
+    def sub_page(self, main_status):
+        # Needs to be wrapped in a while-True loop
+        while True:
+            time.sleep(2)
+            with self.lock:
+                if main_status.value == 0 and self.queue.qsize() == 0:
+                    return
+            try:
+                item = self.queue.get_nowait()
+            except Empty:
+                warning("Empty queue")
+                continue
-    # Need to repeat switching between all and video, since each search will not return all results
-    # Didn't work
-    dict_out = None
-    # search_mode = {True: 'all', False: 'video'}
-    # all_mode = False
-    # for i in range(5):
-    with BrowserSearch(query_str, headless=True) as driver:
+            retry_wrapper(functools.partial(self._sub_page_exec, item), max_retry=5, timeout=3)
+
+    def _sub_page_exec(self, item):
+        v_link = item['link']
+        self.driver.get(v_link)
+        if "bilibili" not in self.driver.title:
+            raise ConnectionError("Error loading webpage")
+        # Wait until AJAX finishes loading
+        wait = WebDriverWait(self.driver, timeout=20)
+        wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.bilibili-player')))
+        multi_page_elem = self.driver.find_elements_by_css_selector('.multi-page')
+        if len(multi_page_elem) == 1:
+            cur_page_elem = multi_page_elem[0].find_elements_by_css_selector(".cur-page")
+            if len(cur_page_elem) != 1:
+                warning("Incorrect number of cur-page elements!")
+            text = cur_page_elem[0].get_attribute("innerText")
+            match_out = re.findall(r'([0-9]{1,3})/([0-9]{1,3})', text)
+            if len(match_out) != 1:
+                raise RuntimeError("Incorrect number of matches")
+            item["pages"] = int(match_out[0][1])
+        elif len(multi_page_elem) > 1:
+            warning("Incorrect number of multi-page elements!")
+        with self.lock:
+            self.dict_out[item["id"]] = item
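The nP detection above scrapes the player's part indicator (text like "3/12") and keeps only the total. A standalone check of that regex on a made-up innerText value:

    import re

    text = "3/12"  # sample .cur-page innerText: current part / total parts
    match_out = re.findall(r'([0-9]{1,3})/([0-9]{1,3})', text)
    assert match_out == [('3', '12')]
    pages = int(match_out[0][1])  # total part count, stored as item["pages"]
    assert pages == 12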
+
+
+def search_main(headless=False, queue=None, lock=None, status=None):
+    if status is None:
+        raise RuntimeError("Status object does not exist")
+    with BrowserSearch(query_str, headless=headless, queue=queue, lock=lock) as driver:
         retry_wrapper(driver.init_search, max_retry=5, timeout=5)
-        num_pages = max(1, driver.get_page_count())
-        for j in range(num_pages):
-            dict_out = driver.ayalyze_cur_page(secondary_keywords=secondary_keywords, exclude_keywords=exclude_keywords,
-                                               prev_result=dict_out)
+        for j in range(max(1, driver.get_page_count())):
+            driver.ayalyze_cur_page(secondary_keywords=secondary_keywords, exclude_keywords=exclude_keywords)
             retry_wrapper(functools.partial(driver.nav_retry_wrapper, driver.next_page), max_retry=5, timeout=3)
-        # all_mode = not all_mode
-        # time.sleep(5)
+        with driver.lock:
+            status.value = 0
+
+
+def search_sub(headless=False, queue=None, dict_out=None, lock=None, status=None):
+    if status is None:
+        raise RuntimeError("Status object does not exist")
+    with SubPageSearch(headless=headless, queue=queue, dict_out=dict_out, lock=lock) as sub:
+        sub.sub_page(status)
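retry_wrapper and slugify come from the repo's utils module, which is not part of this diff. Judging from the call sites (a callable plus max_retry and timeout), the retry semantics are roughly as below; this is a hypothetical reconstruction, not the repo's code:

    import time
    from logging import warning

    def retry_wrapper(func, max_retry=5, timeout=3):
        # Call func() until it succeeds, sleeping `timeout` seconds between attempts.
        for attempt in range(max_retry):
            try:
                return func()
            except Exception as e:
                warning("attempt {} failed: {}".format(attempt + 1, e))
                time.sleep(timeout)
        raise RuntimeError("all {} attempts failed".format(max_retry))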
+
+
+def success_handler():
+    # ret_code = result[0]
+    # url = result[1].url
+    # if ret_code == 0:
+    #     url_dict[url]['status'] = SUCCESS
+    #     report_status(url_dict)
+    print('success')
+
+
+def error_handler(e):
+    # print('error')
+    # print(dir(e), "\n")
+    print("-->{}<--".format(e.__cause__))
+    traceback.print_exception(type(e), e, e.__traceback__)
+
+
+if __name__ == "__main__":
+    num_of_processes = 5
+    output_dir = 'Collections/crawl_out/temp'
+    query_str = ""
+    secondary_keywords = None
+    exclude_keywords = None
+    headless = True
+
+    pool = mp.Pool(processes=num_of_processes)
+    mg = mp.Manager()
+    main_status = mg.Value('b', 1)
+    lock = mg.Lock()
+    queue = mg.Queue()
+    dict_out = mg.dict()
+    # for k in url_dict.keys():
+    #     url_dict[k]['status'] = IN_PROCESS
+    #     pool.apply_async(single_url_wrapper, args=(mode, k, params, url_dict[k]),
+    #                      error_callback=on_error, callback=on_success)
+    pool.apply_async(search_main, args=(headless, queue, lock, main_status,), error_callback=error_handler)
+    # search_main(queue)
+    # Visit sub-pages
+    for i in range(num_of_processes - 1):
+        pool.apply_async(search_sub, args=(headless, queue, dict_out, lock, main_status,), error_callback=error_handler)
+    pool.close()
+    pool.join()
+
+    # Assemble output
+    dict_out = OrderedDict(dict_out)
     list_out = natsort.natsorted(dict_out.values(), key=lambda v: v['title'])  # Natural sort by title
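natsorted orders titles naturally, so numbered parts sort as a human would expect where plain sorted() would not. An illustration with made-up titles:

    import natsort

    vids = [{"title": "ep10"}, {"title": "ep2"}, {"title": "ep1"}]
    out = natsort.natsorted(vids, key=lambda v: v['title'])
    assert [v["title"] for v in out] == ["ep1", "ep2", "ep10"]  # sorted() would give ep1, ep10, ep2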

     # Write images
     img_out_path = os.path.join(output_dir, 'img')
@@ -193,11 +312,23 @@ if __name__ == "__main__":
     os.makedirs(img_out_path, exist_ok=True)
     for v in list_out:
         title = slugify(v["title"])
+        # Screenshot
         path = os.path.join(img_out_path, title + os.path.extsep + 'png')
         with open(path, 'wb') as f:
             f.write(v["screenshot"])
         del v["screenshot"]
+        # nP
+        if v["pages"] == 0:
+            v["p_start"] = ''
+            v["p_end"] = ''
+            del v["pages"]
+        else:
+            v["p_start"] = '1'
+            v["p_end"] = str(v["pages"])
+            del v["pages"]

     # Write csv
     csv_path = os.path.join(output_dir, 'links.csv')
     header_list = list(list_out[0].keys())
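The diff view cuts off after header_list. Given the per-row dicts in list_out, the CSV write-out presumably continues along these lines (a sketch under that assumption, not the repo's actual code):

    import csv

    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=header_list)
        writer.writeheader()
        writer.writerows(list_out)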
