优化查询效率

3 years ago · ea0d80aaec
parent d947eb3f09
commit ea0d80aaec
3 changed files with 47 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 .idea/
 __pycache__/
 *.log
+/dup.txt
--- a/db.py
+++ b/db.py
@ -25,6 +25,15 @@ def get_md5_by_path(_path):
    return r


+def get_path_by_cate(_cate):
+    """Return the stored paths of all `files` rows whose cate equals `_cate`.
+
+    The query selects SUBSTRING(path,3), so each path comes back without its
+    leading characters (presumably the drive prefix such as "X:" -- the
+    caller in main.py compares these values against file[2:]; confirm
+    against the stored data).
+
+    :param _cate: category value bound to the `cate = %s` placeholder.
+    :return: list with the `path` value of every fetched row.
+    """
+    db_cursor.execute(
+        "SELECT SUBSTRING(path,3) as path FROM files WHERE cate = %s;",
+        (_cate,),
+    )
+    rows = db_cursor.fetchall()
+    paths = []
+    for row in rows:
+        # Rows are addressed by column name, matching the `as path` alias.
+        paths.append(row['path'])
+    print('get_path_by_cate查询完成')
+    return paths
+
+
 def is_exist(_md5):
    is_exist_sql = "SELECT * FROM files WHERE md5 = %s;"
    db_cursor.execute(is_exist_sql, (_md5,))
--- a/main.py
+++ b/main.py
@ -1,12 +1,13 @@
 from getFiles import get_file_list
 from getMd5 import get_file_md5
 from db import *
+from time import time as t


 def main_process():
-    disk = "Myxx_Backup_16T_2"
-    cate = "hj"
-    path = r"Y:\hj"
+    disk = "Myxx_Backup_16T_3"
+    cate = "leshe"
+    path = r"X:\leshe"

    _exist_file_list = []

@ -14,24 +15,43 @@ def main_process():
        print("检查参数")
        return

+    print(f"开始获取文件列表：{t()}")
    file_list = get_file_list(path)
-    file_list_len = len(file_list)
-    for (i, f) in enumerate(file_list):
-        print(f"({i+1}/{file_list_len}) - 开始处理: {f}")
-        if md5 := get_md5_by_path(f):
-            print(f"请注意！！！文件的md5({md5})已经存在: {f}")
+    print(f"结束获取文件列表：{t()}")
+
+    unchecked_list = []
+    cate_file_list = get_path_by_cate(cate)
+
+    total_len = len(file_list)
+    for (index, file) in enumerate(file_list):
+        if file[2:] in cate_file_list:
+            print(f"{index}/{total_len}文件已经存在: {file}")
        else:
-            md5 = get_file_md5(f)
-            if exist := is_exist(md5):
-                _exist_file_list.append(f)
-                print(f"重复文件！！！md5为({md5})的文件已经存在，文件路径:{exist['path']} -> {f}")
-                continue
-            insert_data((cate, disk, f, md5))
-            print(f"文件的md5({md5})成功插入: {f}")
+            print(f"{index}/{total_len}未处理文件: {file}")
+            unchecked_list.append(file)
+
+    file_list_len = len(unchecked_list)
+    for (i, f) in enumerate(unchecked_list):
+        print(f"({i+1}/{file_list_len}) - 开始处理: {f} - {t()}")
+        t_start = t()
+        md5 = get_file_md5(f)
+        print(f"完成MD5计算耗时：{t()-t_start}")
+        if exist := is_exist(md5):
+            _exist_file_list.append(f)
+            print(f"重复文件！！！md5为({md5})的文件已经存在，文件路径:{exist['path']} -> {f}")
+            continue
+        insert_data((cate, disk, f, md5))
+        print(f"文件的md5({md5})成功插入: {f}")
    return _exist_file_list


 if __name__ == "__main__":
    exist_file_list = main_process()
-    for ef in exist_file_list:
-        print(ef)
+    # main_process() can hit a bare `return` (after printing "检查参数") and
+    # hand back None, so test truthiness rather than calling len() on it.
+    if exist_file_list:
+        # Record the duplicates in dup.txt, one path per line; the `with`
+        # block closes the file, so no explicit close() is needed.
+        with open('dup.txt', 'w', encoding='utf-8') as dup_file:
+            for ef in exist_file_list:
+                print(ef)
+                dup_file.write(f"{ef}\n")
+    else:
+        print("没有找到重复文件")