
Update checksum alg

Branch: master
wchen342 committed 2 years ago, commit 32a4e02858
Signed by: wchen342, GPG Key ID: 720B70365E800508
checksum.py (142 changed lines)

@@ -7,13 +7,43 @@ import natsort
 from natsort.ns_enum import ns

 BUF_SIZE = 65536
+OUTPUT_FILE_NAME = 'checksum.csv'
+HEADER_LIST = ["rel_path", "checksum"]
+
+# bytes pretty-printing
+UNITS_MAPPING = [
+    (1 << 50, ' PB'),
+    (1 << 40, ' TB'),
+    (1 << 30, ' GB'),
+    (1 << 20, ' MB'),
+    (1 << 10, ' KB'),
+    (1, (' byte', ' bytes')),
+]
+
+
+def pretty_size(bytes, units=UNITS_MAPPING):
+    """Get human-readable file sizes.
+    simplified version of https://pypi.python.org/pypi/hurry.filesize/
+    """
+    for factor, suffix in units:
+        if bytes >= factor:
+            break
+    amount = int(bytes / factor)
+
+    if isinstance(suffix, tuple):
+        singular, multiple = suffix
+        if amount == 1:
+            suffix = singular
+        else:
+            suffix = multiple
+    return str(amount) + suffix
+
+
-def scantree(path):
+def scan_tree(path):
     """Recursively yield DirEntry objects for given directory. Files only."""
     for entry in os.scandir(path):
         if entry.is_dir(follow_symlinks=False):
-            yield from scantree(entry.path)
+            yield from scan_tree(entry.path)
         else:
             yield entry
@@ -34,42 +64,88 @@ def hashing(file, alg='blake2b'):
     return hash_func.hexdigest()


-def hash_dir(input_dir, output_dir, alg='blake2b'):
-    # List files
-    output = []
+def build_file_list(input_dir):
+    file_list = []
+    for dir_entry in scan_tree(input_dir):
+        full_path = dir_entry.path
+        # file_name = dir_entry.name
+        relative_path = os.path.relpath(full_path, input_dir)
+        file_list.append(relative_path)
+    file_list = natsort.natsorted(file_list, alg=ns.PATH)
+    return file_list
+
+
+def read_existing_output(output_dir):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        return None
+    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
+    if not os.path.exists(csv_path):
+        return None
+    with open(csv_path, 'r') as f:
+        reader = csv.DictReader(f, fieldnames=HEADER_LIST)
+        next(reader)  # skip the header row
+        existing_files = [item['rel_path'] for item in reader]
+    existing_files = natsort.natsorted(existing_files, alg=ns.PATH)
+    return existing_files
+
+
+def hash_dir(input_dir, output_dir, alg='blake2b'):
-    # for path, dirs, files in os.walk(input_dir):
-    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scantree(input_dir))
-    total_file_num = sum(1 for _ in scantree(input_dir))
+    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scan_tree(input_dir))
+    total_file_num = sum(1 for _ in scan_tree(input_dir))
     processed_file_size = 0
     processed_file_num = 0
-    for dir_entry in scantree(input_dir):
-        full_path = dir_entry.path
-        file_name = dir_entry.name
-        relative_path = os.path.relpath(full_path, input_dir)
-        file_hash = hashing(full_path, alg=alg)
-        output.append({
-            "filename": file_name,
-            # "full_path": full_path,
-            "rel_path": relative_path,
-            "checksum": file_hash,
-        })
-        processed_file_size += os.path.getsize(full_path)
-        processed_file_num += 1
-        # Print progress
-        if processed_file_num % 100:
-            print("Now at %f%%. %d/%d bytes of files processed. " % (
-                processed_file_size / total_file_size * 100, processed_file_size, total_file_size))
-    list_out = natsort.natsorted(output, key=lambda v: v['rel_path'], alg=ns.IC)
-    csv_path = os.path.join(output_dir, 'checksum.csv')
-    header_list = list(output[0].keys())
-    with open(csv_path, 'w') as f:
-        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
-        writer.writeheader()
-        writer.writerows(list_out)
+    # Build file list
+    file_list = build_file_list(input_dir)
+    # Read existing output file
+    output_exist = read_existing_output(output_dir)
+    # print(*file_list, sep='\n')
+    # if output_exist is not None:
+    #     print(*output_exist, sep='\n')
+    # Write hashes
+    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
+    with open(csv_path, 'a') as f:
+        # Write headers only when starting a fresh output file
+        writer = csv.DictWriter(f, fieldnames=HEADER_LIST, extrasaction="ignore")
+        if output_exist is None:
+            writer.writeheader()
+            output_exist = []
+        # Iterations
+        for rel_path in file_list:
+            full_path = os.path.join(input_dir, rel_path)
+            processed_file_size += os.path.getsize(full_path)
+            processed_file_num += 1
+            # Print progress every 100 files
+            if processed_file_num % 100 == 0:
+                print("Now at %f%%. %s/%s bytes of files processed. " % (
+                    processed_file_size / total_file_size * 100,
+                    pretty_size(processed_file_size), pretty_size(total_file_size)))
+                print("%d/%d files processed. " % (
+                    processed_file_num, total_file_num))
+            # Skip files already recorded in the existing checksum file
+            if rel_path in output_exist:
+                continue
+            file_hash = hashing(full_path, alg=alg)
+            output = {
+                "rel_path": rel_path,
+                "checksum": file_hash,
+            }
+            writer.writerow(output)


 if __name__ == '__main__':
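Note: the body of hashing falls between the two hunks, so only its signature and the trailing return hash_func.hexdigest() are visible above. For readers of this change, here is a minimal sketch of a chunked-read helper consistent with those context lines; it assumes hashlib.new(alg) as the constructor and is a reconstruction, not the commit's actual code:

import hashlib

BUF_SIZE = 65536  # same module-level constant shown in the first hunk


def hashing(file, alg='blake2b'):
    # Hypothetical reconstruction: stream the file through the named
    # algorithm in BUF_SIZE chunks so memory use stays constant.
    hash_func = hashlib.new(alg)
    with open(file, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            hash_func.update(data)
    return hash_func.hexdigest()

With a helper like this, the rewritten hash_dir can resume an interrupted run: read_existing_output returns the rel_path values already recorded in checksum.csv, the loop skips those files, and new rows are appended because the output file is opened in 'a' mode.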
