@@ -7,13 +7,43 @@ import natsort
 from natsort.ns_enum import ns
 
 BUF_SIZE = 65536
 
+OUTPUT_FILE_NAME = 'checksum.csv'
+
+HEADER_LIST = ["rel_path", "checksum"]
+
+
+# bytes pretty-printing
+UNITS_MAPPING = [
+    (1 << 50, ' PB'),
+    (1 << 40, ' TB'),
+    (1 << 30, ' GB'),
+    (1 << 20, ' MB'),
+    (1 << 10, ' KB'),
+    (1, (' byte', ' bytes')),
+]
+
+
+def pretty_size(bytes, units=UNITS_MAPPING):
+    """Get human-readable file sizes.
+    simplified version of https://pypi.python.org/pypi/hurry.filesize/
+    """
+    for factor, suffix in units:
+        if bytes >= factor:
+            break
+    amount = int(bytes / factor)
+
+    if isinstance(suffix, tuple):
+        singular, multiple = suffix
+        if amount == 1:
+            suffix = singular
+        else:
+            suffix = multiple
+    return str(amount) + suffix
+
 
-def scantree(path):
+def scan_tree(path):
     """Recursively yield DirEntry objects for given directory. Files only."""
     for entry in os.scandir(path):
         if entry.is_dir(follow_symlinks=False):
-            yield from scantree(entry.path)
+            yield from scan_tree(entry.path)
         else:
             yield entry
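For reference, the `pretty_size` helper added above truncates toward the largest matching unit. The expected values below are worked out by hand from the code in this hunk; they are illustrative only and not taken from the patch or its tests:

    >>> pretty_size(1)
    '1 byte'
    >>> pretty_size(1536)
    '1 KB'
    >>> pretty_size(3 * (1 << 20))
    '3 MB'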
|
|
|
|
|
|
@@ -34,42 +64,88 @@ def hashing(file, alg='blake2b'):
     return hash_func.hexdigest()
 
 
-def hash_dir(input_dir, output_dir, alg='blake2b'):
-    # List files
-    output = []
+def build_file_list(input_dir):
+    file_list = []
+
+    for dir_entry in scan_tree(input_dir):
+        full_path = dir_entry.path
+        # file_name = dir_entry.name
+        relative_path = os.path.relpath(full_path, input_dir)
+        file_list.append(relative_path)
+
+    file_list = natsort.natsorted(file_list, alg=ns.PATH)
+
+    return file_list
+
+
+def read_existing_output(output_dir):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        return None
+
+    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
+    if not os.path.exists(csv_path):
+        return None
+
+    with open(csv_path, 'r') as f:
+        reader = csv.DictReader(f, fieldnames=HEADER_LIST)
+        next(reader)
+        existing_files = [item['rel_path'] for item in reader]
+
+    existing_files = natsort.natsorted(existing_files, alg=ns.PATH)
+    return existing_files
+
+
+def hash_dir(input_dir, output_dir, alg='blake2b'):
     # for path, dirs, files in os.walk(input_dir):
-    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scantree(input_dir))
-    total_file_num = sum(1 for _ in scantree(input_dir))
+    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scan_tree(input_dir))
+    total_file_num = sum(1 for _ in scan_tree(input_dir))
     processed_file_size = 0
     processed_file_num = 0
 
-    for dir_entry in scantree(input_dir):
-        full_path = dir_entry.path
-        file_name = dir_entry.name
-        relative_path = os.path.relpath(full_path, input_dir)
-        file_hash = hashing(full_path, alg=alg)
-        output.append({
-            "filename": file_name,
-            # "full_path": full_path,
-            "rel_path": relative_path,
-            "checksum": file_hash,
-        })
-        processed_file_size += os.path.getsize(full_path)
-        processed_file_num += 1
-
-        # Print progress
-        if processed_file_num % 100:
-            print("Now at %f%%. %d/%d bytes of files processed. " % (
-                processed_file_size / total_file_size * 100, processed_file_size, total_file_size))
-
-    list_out = natsort.natsorted(output, key=lambda v: v['rel_path'], alg=ns.IC)
-    csv_path = os.path.join(output_dir, 'checksum.csv')
-    header_list = list(output[0].keys())
-    with open(csv_path, 'w') as f:
-        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
-        writer.writeheader()
-        writer.writerows(list_out)
+    # Build file list
+    file_list = build_file_list(input_dir)
+
+    # Read existing output file
+    output_exist = read_existing_output(output_dir)
+
+    # print(*file_list, sep='\n')
+    # if output_exist is not None:
+    #     print(*output_exist, sep='\n')
+
+    # Write hashes
+    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
+    with open(csv_path, 'a') as f:
+        # Write headers
+        writer = csv.DictWriter(f, fieldnames=HEADER_LIST, extrasaction="ignore")
+        if output_exist is None:
+            writer.writeheader()
+            output_exist = []
+
+        # Iterations
+        for rel_path in file_list:
+            full_path = os.path.join(input_dir, rel_path)
+
+            processed_file_size += os.path.getsize(full_path)
+            processed_file_num += 1
+
+            # Print progress
+            if processed_file_num % 100:
+                print("Now at %f%%. %s/%s bytes of files processed. " % (
+                    processed_file_size / total_file_size * 100,
+                    pretty_size(processed_file_size), pretty_size(total_file_size)))
+                print("%d/%d files processed. " % (
+                    processed_file_num, total_file_num))
+
+            if rel_path in output_exist:
+                continue
+
+            file_hash = hashing(full_path, alg=alg)
+            output = {
+                "rel_path": rel_path,
+                "checksum": file_hash,
+            }
+            writer.writerow(output)
 
 
 if __name__ == '__main__':
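The net effect of this hunk is that `hash_dir` now appends to an existing checksum.csv and skips relative paths already recorded there, so an interrupted run can be resumed. A minimal driver sketch, with placeholder directories (illustrative only, not part of the patch; the real `__main__` body is outside this hunk):

    # Sketch only: placeholder paths, assuming the functions above are in scope.
    src = '/data/photos'       # hypothetical input directory
    dst = '/data/checksums'    # hypothetical output directory

    hash_dir(src, dst, alg='blake2b')   # first run: creates checksum.csv with a header row
    hash_dir(src, dst, alg='blake2b')   # re-run: re-reads checksum.csv and only hashes new files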
|
|
|