diff --git a/.gitignore b/.gitignore index e1334f8..08bbb5b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea/ __pycache__/ *.log +/dup.txt diff --git a/db.py b/db.py index daced1e..9b6b023 100644 --- a/db.py +++ b/db.py @@ -25,6 +25,15 @@ def get_md5_by_path(_path): return r +def get_path_by_cate(_cate): + get_path_sql = "SELECT SUBSTRING(path,3) as path FROM files WHERE cate = %s;" + db_cursor.execute(get_path_sql, (_cate,)) + r = db_cursor.fetchall() + file_list = [f['path'] for f in r] + print('get_path_by_cate查询完成') + return file_list + + def is_exist(_md5): is_exist_sql = "SELECT * FROM files WHERE md5 = %s;" db_cursor.execute(is_exist_sql, (_md5,)) diff --git a/main.py b/main.py index 2047419..e48dc0a 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,13 @@ from getFiles import get_file_list from getMd5 import get_file_md5 from db import * +from time import time as t def main_process(): - disk = "Myxx_Backup_16T_2" - cate = "hj" - path = r"Y:\hj" + disk = "Myxx_Backup_16T_3" + cate = "leshe" + path = r"X:\leshe" _exist_file_list = [] @@ -14,24 +15,43 @@ def main_process(): print("检查参数") return + print(f"开始获取文件列表:{t()}") file_list = get_file_list(path) - file_list_len = len(file_list) - for (i, f) in enumerate(file_list): - print(f"({i+1}/{file_list_len}) - 开始处理: {f}") - if md5 := get_md5_by_path(f): - print(f"请注意!!!文件的md5({md5})已经存在: {f}") + print(f"结束获取文件列表:{t()}") + + unchecked_list = [] + cate_file_list = get_path_by_cate(cate) + + total_len = len(file_list) + for (index, file) in enumerate(file_list): + if file[2:] in cate_file_list: + print(f"{index}/{total_len}文件已经存在: {file}") else: - md5 = get_file_md5(f) - if exist := is_exist(md5): - _exist_file_list.append(f) - print(f"重复文件!!!md5为({md5})的文件已经存在,文件路径:{exist['path']} -> {f}") - continue - insert_data((cate, disk, f, md5)) - print(f"文件的md5({md5})成功插入: {f}") + print(f"{index}/{total_len}未处理文件: {file}") + unchecked_list.append(file) + + file_list_len = len(unchecked_list) + for (i, f) in enumerate(unchecked_list): + print(f"({i+1}/{file_list_len}) - 开始处理: {f} - {t()}") + t_start = t() + md5 = get_file_md5(f) + print(f"完成MD5计算耗时:{t()-t_start}") + if exist := is_exist(md5): + _exist_file_list.append(f) + print(f"重复文件!!!md5为({md5})的文件已经存在,文件路径:{exist['path']} -> {f}") + continue + insert_data((cate, disk, f, md5)) + print(f"文件的md5({md5})成功插入: {f}") return _exist_file_list if __name__ == "__main__": exist_file_list = main_process() - for ef in exist_file_list: - print(ef) + if len(exist_file_list) > 0: + with open('dup.txt', 'w', encoding='utf-8') as dup_file: + for ef in exist_file_list: + print(ef) + dup_file.write(ef) + dup_file.close() + else: + print("没有找到重复文件")