优化查询效率

3 years ago · ea0d80aaec
parent d947eb3f09
commit ea0d80aaec
3 changed files with 47 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 .idea/
 __pycache__/
 *.log
 /dup.txt
--- a/db.py
+++ b/db.py
@ -25,6 +25,15 @@ def get_md5_by_path(_path):
    return r
 def get_path_by_cate(_cate):
    """Return the paths recorded in the ``files`` table for one category.

    Runs a parameterized SELECT filtered on ``cate = _cate`` and collects the
    ``path`` column of every returned row into a list.

    The column is selected as ``SUBSTRING(path,3)``, i.e. the stored path
    starting at SQL position 3. NOTE(review): presumably this drops a
    two-character drive prefix such as ``X:`` so results can be compared
    with ``file[2:]`` in main.py -- confirm against the stored data.

    :param _cate: category value bound to the ``cate = %s`` placeholder.
    :return: list of path strings, one per matching row.
    """
    get_path_sql = "SELECT SUBSTRING(path,3) as path FROM files WHERE cate = %s;"
    # The value is passed as a bound parameter, not formatted into the SQL.
    db_cursor.execute(get_path_sql, (_cate,))
    r = db_cursor.fetchall()
    # Rows are indexed by column name, so db_cursor is assumed to return
    # mapping-style rows (it is defined outside this hunk -- TODO confirm).
    file_list = [f['path'] for f in r]
    print('get_path_by_cate查询完成')
    # NOTE(review): a list is returned; main.py runs `in` against it once per
    # scanned file, which is a linear scan each time. Converting to a set at
    # the call site would make those lookups constant-time.
    return file_list
 def is_exist(_md5):
    is_exist_sql = "SELECT * FROM files WHERE md5 = %s;"
    db_cursor.execute(is_exist_sql, (_md5,))
--- a/main.py
+++ b/main.py
@ -1,12 +1,13 @@
 from getFiles import get_file_list
 from getMd5 import get_file_md5
 from db import *
 from time import time as t
 def main_process():
-    disk = "Myxx_Backup_16T_2"
+    disk = "Myxx_Backup_16T_3"
-    cate = "hj"
+    cate = "leshe"
-    path = r"Y:\hj"
+    path = r"X:\leshe"
    _exist_file_list = []
@ -14,24 +15,43 @@ def main_process():
        print("检查参数")
        return
    print(f"开始获取文件列表：{t()}")
    file_list = get_file_list(path)
-    file_list_len = len(file_list)
+    print(f"结束获取文件列表：{t()}")
-    for (i, f) in enumerate(file_list):
+
-        print(f"({i+1}/{file_list_len}) - 开始处理: {f}")
+    unchecked_list = []
-        if md5 := get_md5_by_path(f):
+    cate_file_list = get_path_by_cate(cate)
-            print(f"请注意！！！文件的md5({md5})已经存在: {f}")
+
    total_len = len(file_list)
    for (index, file) in enumerate(file_list):
        if file[2:] in cate_file_list:
            print(f"{index}/{total_len}文件已经存在: {file}")
        else:
-            md5 = get_file_md5(f)
+            print(f"{index}/{total_len}未处理文件: {file}")
-            if exist := is_exist(md5):
+            unchecked_list.append(file)
-                _exist_file_list.append(f)
+
-                print(f"重复文件！！！md5为({md5})的文件已经存在，文件路径:{exist['path']} -> {f}")
+    file_list_len = len(unchecked_list)
-                continue
+    for (i, f) in enumerate(unchecked_list):
-            insert_data((cate, disk, f, md5))
+        print(f"({i+1}/{file_list_len}) - 开始处理: {f} - {t()}")
-            print(f"文件的md5({md5})成功插入: {f}")
+        t_start = t()
        md5 = get_file_md5(f)
        print(f"完成MD5计算耗时：{t()-t_start}")
        if exist := is_exist(md5):
            _exist_file_list.append(f)
            print(f"重复文件！！！md5为({md5})的文件已经存在，文件路径:{exist['path']} -> {f}")
            continue
        insert_data((cate, disk, f, md5))
        print(f"文件的md5({md5})成功插入: {f}")
    return _exist_file_list
 if __name__ == "__main__":
    # Script entry point: run the scan and collect the duplicates it returns.
    exist_file_list = main_process()
-    for ef in exist_file_list:
+    if len(exist_file_list) > 0:
-        print(ef)
+        with open('dup.txt', 'w', encoding='utf-8') as dup_file:
            # Mode 'w' truncates dup.txt, so each run replaces the previous
            # report.
            for ef in exist_file_list:
                print(ef)
                # NOTE(review): no line separator is written here, so
                # successive paths are concatenated in dup.txt.
                dup_file.write(ef)
            # NOTE(review): redundant -- the enclosing with-statement already
            # closes dup_file when the block exits.
            dup_file.close()
    else:
        print("没有找到重复文件")