优化查询效率

main
roger_home_pc 1 year ago
parent d947eb3f09
commit ea0d80aaec
  1. 1
      .gitignore
  2. 9
      db.py
  3. 54
      main.py

1
.gitignore vendored

@ -1,3 +1,4 @@
.idea/ .idea/
__pycache__/ __pycache__/
*.log *.log
/dup.txt

@ -25,6 +25,15 @@ def get_md5_by_path(_path):
return r return r
def get_path_by_cate(_cate):
    """Return the ``path`` values recorded in ``files`` for one category.

    Executes a parameterized SELECT on the module-level ``db_cursor`` with
    ``_cate`` bound to the ``cate`` column, then collects the ``path`` field
    of every fetched row.

    NOTE(review): the query returns ``SUBSTRING(path,3)``, not the raw column.
    This presumably strips a two-character drive prefix such as ``X:`` so the
    result can be compared against ``file[2:]`` in ``main_process`` -- confirm
    against the stored data.

    :param _cate: category value to filter on.
    :return: list of path strings, one per matching row.
    """
    query = "SELECT SUBSTRING(path,3) as path FROM files WHERE cate = %s;"
    db_cursor.execute(query, (_cate,))
    rows = db_cursor.fetchall()

    # Rows are indexed by column name, so pull out just the aliased column.
    paths = []
    for row in rows:
        paths.append(row['path'])

    print('get_path_by_cate查询完成')
    return paths
def is_exist(_md5): def is_exist(_md5):
is_exist_sql = "SELECT * FROM files WHERE md5 = %s;" is_exist_sql = "SELECT * FROM files WHERE md5 = %s;"
db_cursor.execute(is_exist_sql, (_md5,)) db_cursor.execute(is_exist_sql, (_md5,))

@ -1,12 +1,13 @@
from getFiles import get_file_list from getFiles import get_file_list
from getMd5 import get_file_md5 from getMd5 import get_file_md5
from db import * from db import *
from time import time as t
def main_process(): def main_process():
disk = "Myxx_Backup_16T_2" disk = "Myxx_Backup_16T_3"
cate = "hj" cate = "leshe"
path = r"Y:\hj" path = r"X:\leshe"
_exist_file_list = [] _exist_file_list = []
@ -14,24 +15,43 @@ def main_process():
print("检查参数") print("检查参数")
return return
print(f"开始获取文件列表:{t()}")
file_list = get_file_list(path) file_list = get_file_list(path)
file_list_len = len(file_list) print(f"结束获取文件列表:{t()}")
for (i, f) in enumerate(file_list):
print(f"({i+1}/{file_list_len}) - 开始处理: {f}") unchecked_list = []
if md5 := get_md5_by_path(f): cate_file_list = get_path_by_cate(cate)
print(f"请注意!!!文件的md5({md5})已经存在: {f}")
total_len = len(file_list)
for (index, file) in enumerate(file_list):
if file[2:] in cate_file_list:
print(f"{index}/{total_len}文件已经存在: {file}")
else: else:
md5 = get_file_md5(f) print(f"{index}/{total_len}未处理文件: {file}")
if exist := is_exist(md5): unchecked_list.append(file)
_exist_file_list.append(f)
print(f"重复文件!!!md5为({md5})的文件已经存在,文件路径:{exist['path']} -> {f}") file_list_len = len(unchecked_list)
continue for (i, f) in enumerate(unchecked_list):
insert_data((cate, disk, f, md5)) print(f"({i+1}/{file_list_len}) - 开始处理: {f} - {t()}")
print(f"文件的md5({md5})成功插入: {f}") t_start = t()
md5 = get_file_md5(f)
print(f"完成MD5计算耗时:{t()-t_start}")
if exist := is_exist(md5):
_exist_file_list.append(f)
print(f"重复文件!!!md5为({md5})的文件已经存在,文件路径:{exist['path']} -> {f}")
continue
insert_data((cate, disk, f, md5))
print(f"文件的md5({md5})成功插入: {f}")
return _exist_file_list return _exist_file_list
if __name__ == "__main__":
    # main_process() returns the paths it found to be duplicates; its early
    # parameter-check exit is a bare ``return``, i.e. None.
    exist_file_list = main_process()
    # Truthiness covers both an empty list and None, so the early-exit path no
    # longer raises TypeError from len(None).
    if exist_file_list:
        # Save the duplicates to dup.txt so they can be reviewed after the run.
        with open('dup.txt', 'w', encoding='utf-8') as dup_file:
            for ef in exist_file_list:
                print(ef)
                # write() adds no line separator; emit one path per line so
                # the entries do not run together in the file.
                dup_file.write(f"{ef}\n")
        # No explicit close(): leaving the with-block closes the file.
    else:
        print("没有找到重复文件")

Loading…
Cancel
Save