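"""Compute a checksum for every file under a directory and record it in a CSV.

The input directory is scanned recursively, every file is hashed (blake2b by
default), and one "rel_path,checksum" row per file is appended to checksum.csv
in the output directory. Files already listed in an existing checksum.csv are
skipped, so an interrupted run can be resumed.
"""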
import argparse
import csv
import hashlib
import os

import natsort
from natsort import ns

BUF_SIZE = 65536
OUTPUT_FILE_NAME = 'checksum.csv'
HEADER_LIST = ["rel_path", "checksum"]

# bytes pretty-printing
UNITS_MAPPING = [
    (1 << 50, ' PB'),
    (1 << 40, ' TB'),
    (1 << 30, ' GB'),
    (1 << 20, ' MB'),
    (1 << 10, ' KB'),
    (1, (' byte', ' bytes')),
]


def pretty_size(bytes, units=UNITS_MAPPING):
    """Get human-readable file sizes.

    Simplified version of https://pypi.python.org/pypi/hurry.filesize/
    """
    for factor, suffix in units:
        if bytes >= factor:
            break
    amount = int(bytes / factor)
    if isinstance(suffix, tuple):
        singular, multiple = suffix
        if amount == 1:
            suffix = singular
        else:
            suffix = multiple
    return str(amount) + suffix


def scan_tree(path):
    """Recursively yield DirEntry objects for the given directory. Files only."""
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            yield from scan_tree(entry.path)
        else:
            yield entry


def hashing(file, alg='blake2b'):
    """Return the hex digest of *file*, read in BUF_SIZE chunks."""
    # Fall back to blake2b if the requested algorithm is not available in hashlib
    hash_func = getattr(hashlib, alg, hashlib.blake2b)()
    if not os.path.exists(file):
        raise FileNotFoundError(file)
    with open(file, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            hash_func.update(data)
    return hash_func.hexdigest()


def build_file_list(input_dir):
    """Return all file paths under input_dir, relative to it, in natural sort order."""
    file_list = []
    for dir_entry in scan_tree(input_dir):
        full_path = dir_entry.path
        relative_path = os.path.relpath(full_path, input_dir)
        file_list.append(relative_path)
    file_list = natsort.natsorted(file_list, alg=ns.PATH)
    return file_list


def read_existing_output(output_dir):
    """Return the rel_paths already recorded in the output CSV, or None if there is none."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        return None
    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
    if not os.path.exists(csv_path):
        return None
    with open(csv_path, 'r', newline='') as f:
        reader = csv.DictReader(f, fieldnames=HEADER_LIST)
        next(reader, None)  # skip the header row
        existing_files = [item['rel_path'] for item in reader]
    existing_files = natsort.natsorted(existing_files, alg=ns.PATH)
    return existing_files


def hash_dir(input_dir, output_dir, alg='blake2b'):
    """Hash every file under input_dir and append the results to the output CSV."""
    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scan_tree(input_dir))
    total_file_num = sum(1 for _ in scan_tree(input_dir))
    processed_file_size = 0
    processed_file_num = 0
    # Build file list
    file_list = build_file_list(input_dir)
    # Read existing output file
    output_exist = read_existing_output(output_dir)
    # Write hashes
    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
    with open(csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=HEADER_LIST, extrasaction="ignore")
        # Write headers only when starting a fresh output file
        if output_exist is None:
            writer.writeheader()
            output_exist = []
        # Iterations
        for rel_path in file_list:
            full_path = os.path.join(input_dir, rel_path)
            processed_file_size += os.path.getsize(full_path)
            processed_file_num += 1
            # Print progress every 100 files
            if processed_file_num % 100 == 0:
                print("Now at %.2f%%. %s/%s bytes of files processed. " % (
                    processed_file_size / total_file_size * 100,
                    pretty_size(processed_file_size), pretty_size(total_file_size)))
                print("%d/%d files processed. " % (
                    processed_file_num, total_file_num))
            # Skip files that are already recorded in the existing output
            if rel_path in output_exist:
                continue
            file_hash = hashing(full_path, alg=alg)
            output = {
                "rel_path": rel_path,
                "checksum": file_hash,
            }
            writer.writerow(output)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Output checksum of each file under a directory')
    parser.add_argument('-i', '--input-dir', type=str, required=True, help="Input directory")
    parser.add_argument('-o', '--output-dir', type=str, required=True, help="Output directory")
    parser.add_argument('-m', '--alg', type=str, default='blake2b', help="Hashing algorithm")
    args = parser.parse_args()
    hash_dir(args.input_dir, args.output_dir, alg=args.alg)
# # DEBUG
# input_dir = 'Collections/crawl_out'
# output_dir = 'Collections'
# hash_dir(input_dir, output_dir, alg='blake2b')
# print("{0}: {1}".format(alg, hashing(file, alg=alg)))
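
# Example invocation (the script name checksum.py is an assumption; the paths
# follow the debug example above and are illustrative only):
#   python checksum.py -i Collections/crawl_out -o Collections -m blake2b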