diff --git a/checksum.py b/checksum.py
index ec0a53c..7c11657 100644
--- a/checksum.py
+++ b/checksum.py
@@ -7,13 +7,43 @@
 import natsort
 from natsort.ns_enum import ns
 
 BUF_SIZE = 65536
+OUTPUT_FILE_NAME = 'checksum.csv'
+HEADER_LIST = ["rel_path", "checksum"]
+
+# bytes pretty-printing
+UNITS_MAPPING = [
+    (1 << 50, ' PB'),
+    (1 << 40, ' TB'),
+    (1 << 30, ' GB'),
+    (1 << 20, ' MB'),
+    (1 << 10, ' KB'),
+    (1, (' byte', ' bytes')),
+]
+
+
+def pretty_size(num_bytes, units=UNITS_MAPPING):
+    """Get human-readable file sizes.
+    Simplified version of https://pypi.python.org/pypi/hurry.filesize/
+    """
+    for factor, suffix in units:
+        if num_bytes >= factor:
+            break
+    amount = int(num_bytes / factor)
+
+    if isinstance(suffix, tuple):
+        singular, multiple = suffix
+        if amount == 1:
+            suffix = singular
+        else:
+            suffix = multiple
+    return str(amount) + suffix
 
 
-def scantree(path):
+def scan_tree(path):
     """Recursively yield DirEntry objects for given directory. Files only."""
     for entry in os.scandir(path):
         if entry.is_dir(follow_symlinks=False):
-            yield from scantree(entry.path)
+            yield from scan_tree(entry.path)
         else:
             yield entry
@@ -34,42 +64,88 @@
     return hash_func.hexdigest()
 
 
-def hash_dir(input_dir, output_dir, alg='blake2b'):
-    # List files
-    output = []
+def build_file_list(input_dir):
+    file_list = []
+
+    for dir_entry in scan_tree(input_dir):
+        full_path = dir_entry.path
+        # file_name = dir_entry.name
+        relative_path = os.path.relpath(full_path, input_dir)
+        file_list.append(relative_path)
+
+    file_list = natsort.natsorted(file_list, alg=ns.PATH)
+
+    return file_list
+
+
+def read_existing_output(output_dir):
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+        return None
+
+    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
+    if not os.path.exists(csv_path):
+        return None
+
+    with open(csv_path, 'r', newline='') as f:
+        reader = csv.DictReader(f, fieldnames=HEADER_LIST)
+        next(reader)  # skip the header row
+        existing_files = [item['rel_path'] for item in reader]
+
+    existing_files = natsort.natsorted(existing_files, alg=ns.PATH)
+    return existing_files
+
+
+def hash_dir(input_dir, output_dir, alg='blake2b'):
     # for path, dirs, files in os.walk(input_dir):
-    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scantree(input_dir))
-    total_file_num = sum(1 for _ in scantree(input_dir))
+    total_file_size = sum(os.path.getsize(dir_entry.path) for dir_entry in scan_tree(input_dir))
+    total_file_num = sum(1 for _ in scan_tree(input_dir))
     processed_file_size = 0
     processed_file_num = 0
 
-    for dir_entry in scantree(input_dir):
-        full_path = dir_entry.path
-        file_name = dir_entry.name
-        relative_path = os.path.relpath(full_path, input_dir)
-        file_hash = hashing(full_path, alg=alg)
-        output.append({
-            "filename": file_name,
-            # "full_path": full_path,
-            "rel_path": relative_path,
-            "checksum": file_hash,
-        })
-        processed_file_size += os.path.getsize(full_path)
-        processed_file_num += 1
-
-        # Print progress
-        if processed_file_num % 100:
-            print("Now at %f%%. %d/%d bytes of files processed. " % (
-                processed_file_size / total_file_size * 100, processed_file_size, total_file_size))
-
-    list_out = natsort.natsorted(output, key=lambda v: v['rel_path'], alg=ns.IC)
-    csv_path = os.path.join(output_dir, 'checksum.csv')
-    header_list = list(output[0].keys())
-    with open(csv_path, 'w') as f:
-        writer = csv.DictWriter(f, fieldnames=header_list, extrasaction="ignore")
-        writer.writeheader()
-        writer.writerows(list_out)
+    # Build file list
+    file_list = build_file_list(input_dir)
+
+    # Read existing output file
+    output_exist = read_existing_output(output_dir)
+
+    # print(*file_list, sep='\n')
+    # if output_exist is not None:
+    #     print(*output_exist, sep='\n')
+
+    # Write hashes
+    csv_path = os.path.join(output_dir, OUTPUT_FILE_NAME)
+    with open(csv_path, 'a', newline='') as f:
+        # Write the header only for a fresh output file
+        writer = csv.DictWriter(f, fieldnames=HEADER_LIST, extrasaction="ignore")
+        if output_exist is None:
+            writer.writeheader()
+            output_exist = []
+
+        # Hash files, skipping entries already present in the CSV
+        for rel_path in file_list:
+            full_path = os.path.join(input_dir, rel_path)
+
+            processed_file_size += os.path.getsize(full_path)
+            processed_file_num += 1
+
+            # Print progress every 100 files
+            if processed_file_num % 100 == 0:
+                print("Now at %f%%. %s/%s of file data processed. " % (
+                    processed_file_size / max(total_file_size, 1) * 100,  # max() avoids ZeroDivisionError on empty dirs
+                    pretty_size(processed_file_size), pretty_size(total_file_size)))
+                print("%d/%d files processed. " % (
+                    processed_file_num, total_file_num))
+
+            if rel_path in output_exist:
+                continue
+
+            file_hash = hashing(full_path, alg=alg)
+            output = {
+                "rel_path": rel_path,
+                "checksum": file_hash,
+            }
+            writer.writerow(output)
 
 
 if __name__ == '__main__':