diff --git a/cookie.txt b/cookie.txt index d2874b3..7f803a7 100644 --- a/cookie.txt +++ b/cookie.txt @@ -1 +1 @@ -Hm_lvt_7a3960b6f067eb0085b7f96ff5e660b0=1690080220,1690550836,1690723621,1690809059; ndut_fmt=2F3F7209152B005542DB9E27E1FE8B25A229DCC706598E1B99139109B3AD5791; BDCLND=Kiz1b8Bpv31XBFDJNdGFjZSNEpvCREEMgiHA5oAF99k%3D; BDUSS=3dlMEhZLWtLS2tGeVh2T1ozYWtYaXdFTWV6QnRrYzByMEwzd2hWWmxkamd4dk5pRVFBQUFBJCQAAAAAAAAAAAEAAADgajfycm9nZXJzdW4wOTAxAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOA5zGLgOcxid; BIDUPSID=2947C9BCCF77B5D4BFC420C602BE3DF5; PSTM=1658846657; ZFY=0Vc50oTr7EmlM6WxzWE2Eewo0ltT5VQJ2KG9s8DcdoY:C; MCITY=-131%3A; STOKEN=d598c65d443486ba06db34f988e6ac8d108c6b2e4f4eae47aeec796b8c487453; BAIDUID=8DC7F291FA0638DF89C98F0178FCDC7C:FG=1; BAIDUID_BFESS=8DC7F291FA0638DF89C98F0178FCDC7C:FG=1; PANWEB=1; newlogin=1; csrfToken=2r7JSCtVulGN3eOExz3JqsxP; PANPSC=12925419710669722059%3AKkwrx6t0uHBNjb%2BA%2BPLlBZgtJeEFa7WQw1jWL8y1tqu8ztnSQmWL1wZuq6kOUHWChFbm%2BExtunGiz1cJOClRr8ZIzuhpmLh6b55KeZe4CQBn3K3RJ8ZwedL9vR6DsgcTu1tPRVPr6y7%2FwyO%2B4eG7s0I0NZhR03fFuueAr2t%2FRhNXpcvfoZtUp%2B4PuTzNr1rr; ab_sr=1.0.1_NDE5NTY0NjYxYjNjZmVjZmQ2YTY1YjYzMjFlY2E5MGViZWI2MWY5MzA1Yzk4N2M1YzM3YmFlZmM2MmM1NmNmYjgzYTk4NWQxNzdiZWQxZTAzN2RlYjQ2NmI1NjAzNTAwNmJjNjNlZGFkN2JmNTUxNDMzNmRkM2RkZWVhZTZhMTM4ZjA3ZjJlMTAxYTc2MTk2YmM0YTA5OGUzMWIzNzE5Yjg0MTFkNzdjOThhMjA5ZDVhODczOTZiYzM2ZmEzZjhk \ No newline at end of file +Hm_lvt_7a3960b6f067eb0085b7f96ff5e660b0=1690550836,1690723621,1690809059,1691281792; ndut_fmt=08603C41725B9D1E463435C4C64207479195C09D833C50A24353D22EB2E4456E; BDCLND=mZG1nNwbvm0KuvJr0nMrsqnZaPHR9lfsfkzVrAx5mig%3D; BDUSS=3dlMEhZLWtLS2tGeVh2T1ozYWtYaXdFTWV6QnRrYzByMEwzd2hWWmxkamd4dk5pRVFBQUFBJCQAAAAAAAAAAAEAAADgajfycm9nZXJzdW4wOTAxAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOA5zGLgOcxid; BIDUPSID=2947C9BCCF77B5D4BFC420C602BE3DF5; PSTM=1658846657; ZFY=0Vc50oTr7EmlM6WxzWE2Eewo0ltT5VQJ2KG9s8DcdoY:C; MCITY=-131%3A; STOKEN=d598c65d443486ba06db34f988e6ac8d108c6b2e4f4eae47aeec796b8c487453; BAIDUID=8DC7F291FA0638DF89C98F0178FCDC7C:FG=1; BAIDUID_BFESS=8DC7F291FA0638DF89C98F0178FCDC7C:FG=1; PANWEB=1; newlogin=1; csrfToken=1xjPBP_dJWHgaoiwR4nQ7VBL; PANPSC=12489461165957612349%3AKkwrx6t0uHBNjb%2BA%2BPLlBZgtJeEFa7WQw1jWL8y1tqu8ztnSQmWL1wZuq6kOUHWCO5C65PtcmdJbCWSrkiaxSMZIzuhpmLh6b55KeZe4CQBn3K3RJ8ZwedL9vR6DsgcTu1tPRVPr6y7%2FwyO%2B4eG7s0I0NZhR03fF1bSRWp4nHGroivDAyr3Yne4PuTzNr1rr; ab_sr=1.0.1_NTI5MWQ4OGQ2ZTcwMTM0ZTgzZDQ5YzEzZjc5NmJlMWMxOGVjODEyOGM1ZWFlZGRkMDc1YzRiYjE3OGVjOWFlOWFkNmViNzVlMzU0YmM0YmFmM2EyNWViZWFhY2U4MWEyOTdiOWM2NGRlOTE5OWUyZWI2NGNmYTE3YmE0OTFkZTUxYjQ0YmQ1Mjk5Njk1NGZlMWI5Mzg3Y2Y0OWZhZTE0NDQ0YzBkYjE4NWNiM2Y2YTVjZGZmZjhlYjExOTAzZTY2; Hm_lpvt_7a3960b6f067eb0085b7f96ff5e660b0=1691281801; Hm_lvt_fa0277816200010a74ab7d2895df481b=1691281797; Hm_lpvt_fa0277816200010a74ab7d2895df481b=1691281797 \ No newline at end of file diff --git a/db.py b/db.py index d28befc..d0a9073 100644 --- a/db.py +++ b/db.py @@ -17,10 +17,21 @@ class DbAction: self.date_list = self.get_date_list(end_date) def get_db_data(self): - query_sql = f"SELECT h.id, h.cate, h.date, h.`name`, h.save_link, h.`code`, h.unzip_pwd FROM scrapyh h WHERE h.date IN ({self.date_list});" + query_sql = f"SELECT h.id, h.cate, h.date, h.`name`, h.save_link, h.`code`, h.unzip_pwd FROM scrapyh h WHERE h.date IN ({self.date_list})" + # 错误调试 + # query_sql = query_sql + " AND id IN ('35334', '35335', '35336', '35337', '35338', '35339')" self.cursor.execute(query_sql) return self.cursor.fetchall() + def update_file_name(self, data_id, file_name): + query_sql = f"UPDATE scrapyh SET file_name = %s WHERE id = %s" + self.cursor.execute(query_sql, (file_name, data_id)) + self.conn.commit() + + def disconnect_db(self): + self.cursor.close() + self.conn.close() + @staticmethod def get_date_list(_end_date): date_list = [] diff --git a/main.py b/main.py index e69de29..44e48ab 100644 --- a/main.py +++ b/main.py @@ -0,0 +1,70 @@ +from db import DbAction +from req import ReqAction +from time import sleep +import random + +# config +# get_end_date = 'today' +get_end_date = '2023-08-03' + + +def format_link(link): + if '?pwd=' in link: + link = link[:-9] + if 'init?surl=' in link: + link = 'https://pan.baidu.com/s/1' + link[38:60] + return link + + +def req(_data, req_obj, db_obj): + base_temp_path = r'/Temp/' + save_path = [base_temp_path, _data['cate'] + r'/', str(_data['id'])] + print('-' * 30 + str(_data['id']) + '-' * 30) + print(save_path) + link = format_link(_data['save_link']) + print(_data['save_link'], ' -> ', link, ' ', _data['code']) + result, name = req_obj.process(save_path, link, _data['code']) + if result: + db_obj.update_file_name(_data['id'], name) + print(f'{_data["id"]}保存成功' + '\n' * 2) + return True + else: + print(f'{_data["id"]}保存失败,请检查!' + '\n' * 2) + return _data + + +def main(): + db_obj = DbAction(get_end_date) + req_obj = ReqAction() + try: + data_list = db_obj.get_db_data() + failed_list = [] + req_obj.prepare() + for _data in data_list: + result = req(_data, req_obj, db_obj) + if isinstance(result, dict): + failed_list.append(result) + sleep(random.randint(0, 3) + random.random()) + i = 3 + while len(failed_list) > 0 and i > 0: + _temp_list = [] + for _data in failed_list: + result = req(_data, req_obj, db_obj) + if isinstance(result, dict): + _temp_list.append(result) + sleep(random.randint(0, 3) + random.random()) + failed_list = _temp_list + i -= 1 + + if len(failed_list): + failed_id = [_data['id'] for _data in failed_list] + print('重试后依然失败:') + print(failed_id) + except Exception as e: + print(e) + finally: + db_obj.disconnect_db() + + +if __name__ == '__main__': + main() diff --git a/req.py b/req.py index 0103017..79bb679 100644 --- a/req.py +++ b/req.py @@ -66,6 +66,7 @@ class ReqAction: url = f'{BASE_URL}/api/gettemplatevariable?clienttype=0&app_id=250528&web=1&fields=[%22bdstoken%22,%22token%22,%22uk%22,%22isdocuser%22,%22servertime%22]' response = self.session.get(url=url, headers=self.request_header, timeout=20, allow_redirects=True, verify=False) + print('get_bdstoken') print(response.text) return response.json()['errno'] if response.json()['errno'] != 0 else response.json()['result']['bdstoken'] @@ -75,6 +76,7 @@ class ReqAction: url = f'{BASE_URL}/api/list?order=time&desc=1&showempty=0&web=1&page=1&num=1000&dir={dir_path}&bdstoken={self.bdstoken}' response = self.session.get(url=url, headers=self.request_header, timeout=15, allow_redirects=False, verify=False) + print('get_dir_list') print(response.text) return response.json()['errno'] if response.json()['errno'] != 0 else response.json()['list'] @@ -85,7 +87,8 @@ class ReqAction: post_data = {'path': target_directory_name, 'isdir': '1', 'block_list': '[]', } response = self.session.post(url=url, headers=self.request_header, data=post_data, timeout=15, allow_redirects=False, verify=False) - print(response.text) + print('create_dir') + print(response.text.strip()) return response.json()['errno'] # 更新 cookie 函数 @@ -103,7 +106,8 @@ class ReqAction: post_data = {'pwd': pass_code, 'vcode': '', 'vcode_str': '', } response = self.session.post(url=check_url, headers=self.request_header, data=post_data, timeout=10, allow_redirects=False, verify=False) - print(response.text) + print('verify_pass_code') + print(response.text.strip()) return response.json()['errno'] if response.json()['errno'] != 0 else response.json()['randsk'] # 验证链接函数 @@ -117,7 +121,8 @@ class ReqAction: response = self.session.get(url=link_url, headers=self.request_header, timeout=15, allow_redirects=True, verify=False).content.decode("utf-8") - print(response) + print('verify_links') + # print(response) shareid_list = re.findall('"shareid":(\\d+?),"', response) user_id_list = re.findall('"share_uk":"(\\d+?)","', response) fs_id_list = re.findall('"fs_id":(\\d+?),"', response) @@ -140,30 +145,33 @@ class ReqAction: 'path': f'/{target_directory_name}', } response = self.session.post(url=url, headers=self.request_header, data=post_data, timeout=15, allow_redirects=False, verify=False) + print('transfer_files') print(response.text) - return response.json()['errno'] - - def main(self): - base_dir = '/Temp/' - cate_dir = 'leshe/' - target_dir_name = '2' - target_dir = os.path.join(base_dir, cate_dir, target_dir_name) - print(target_dir) - # link_list = [] - # self.get_cookies() - # self.bdstoken = self.get_bdstoken() - # dir_list = self.get_dir_list('/Temp/leshe') - # print([_dir['path'] for _dir in dir_list]) - # if target_dir in [_dir['path'] for _dir in dir_list]: - # print('找到了') - # # if dir_name and dir_name not in [_dir('path') for _dir in self.dir_list]: - # # self.create_dir(r'/Temp/leshe/1') - # # 执行转存 - # for link in link_list: - # verified_links = self.verify_links(link[0], link[1]) - # self.transfer_files(verified_links, target_dir) + file_name = response.json()['extra']['list'][0]['from'] + return response.json()['errno'], file_name + + def prepare(self): + self.get_cookies() + self.bdstoken = self.get_bdstoken() + + def process(self, path_list, save_link, save_code): + base_path = os.path.join(path_list[0], path_list[1]) + save_path = os.path.join(*path_list) + # print(save_path, base_path) + temp_path_list = self.get_dir_list(path_list[0]) + if isinstance(temp_path_list, list) and base_path[:-1] not in [_dir['path'] for _dir in temp_path_list]: + self.create_dir(base_path) + base_path_list = self.get_dir_list(base_path) + if isinstance(base_path_list, list) and save_path not in [_dir['path'] for _dir in base_path_list]: + self.create_dir(save_path) + verified_links = self.verify_links(save_link, save_code) + if isinstance(verified_links, list): + result, name = self.transfer_files(verified_links, save_path) + if result == 0: + return True, name + return False, None if __name__ == '__main__': req = ReqAction() - req.main() + # req.process()