fix: 修改搜索信息获取逻辑，增加参数以支持不同级别的请求和视频时长过滤

2025-05-17 20:06:15 +08:00 · 2025-05-17 20:06:15 +08:00 · 84c6f3afd9
commit 84c6f3afd9
parent 69137dd128
1 changed file with 25 additions and 15 deletions
--- a/main.py
+++ b/main.py
@@ -6,6 +6,8 @@ import uuid
 import concurrent.futures
 import requests
 import datetime
+
+from mpmath import limit
 from requests import RequestException
 from DB import DBVidcon
 from dateutil import parser as date_parser
@@ -109,7 +111,7 @@ def get_proxies(g):
        proxy_data = response.json()['data'][0]
    except Exception:
        print(g)
-        print("数据返回解析错误!"+ str(response.text))
+        print("数据返回解析错误!" + str(response.text))
        time.sleep(5)
        return get_proxies(g)
    proxies_url = f"http://{proxy_data['username']}:{proxy_data['password']}@{proxy_data['ip']}:{proxy_data['port']}"
@@ -188,7 +190,7 @@ def gettoken():
    u = uuid.uuid4()
    uuid_with_dash = str(u)
    uuid_no_dash = u.hex
-    traffic_segment = str(random.randint(10**8, 10**9 - 1))
+    traffic_segment = str(random.randint(10 ** 8, 10 ** 9 - 1))
    data = {
        'client_id': 'f1a362d288c1b98099c7',
        'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
@@ -209,9 +211,14 @@ def gettoken():
        pass


-def get_searchInfo(keyword):
+def get_searchInfo(keyword, level):
    video_list = []
-    for j in range(1, 3):
+    max_page = 2
+    limit = 30
+    if level == 1:
+        max_page = 3
+        limit = 100
+    for j in range(1, max_page):
        # 别展开 = = !
        data = {
            "operationName": "SEARCH_QUERY",
@@ -224,7 +231,7 @@ def get_searchInfo(keyword):
                "shouldIncludeVideos": False,
                "shouldIncludeLives": False,
                "page": j,
-                "limit": 100,
+                "limit": limit,
                "recaptchaToken": None
            },
            "query": """
@@ -580,12 +587,14 @@ def get_searchInfo(keyword):
            if node['__typename'] != "Video":
                continue
            creator = node['creator']
-            video_tasks.append({
-                "index": calculated_index,
-                "xid": node.get('xid'),
-                "node": node,
-                "creator": creator,
-            })
+            duration = node.get('duration')
+            if duration > 300:
+                video_tasks.append({
+                    "index": calculated_index,
+                    "xid": node.get('xid'),
+                    "node": node,
+                    "creator": creator,
+                })

        def safe_fetch(task, max_try=2):
            attempt = 0
@@ -704,16 +713,15 @@ def integrate_data():
                    if not v_list:
                        for i in range(3):
                            time.sleep(i * 5)
-                            v_list = get_searchInfo(kitem["keyword"])
+                            v_list = get_searchInfo(kitem["keyword"], kitem['level'])
                            if v_list:
                                break
                            time.sleep(2)

-
                    for item in v_list:
                        record = {
                            "keyword": kitem.get("keyword"),
-                            "v_name" : kitem.get("v_name"),
+                            "v_name": kitem.get("v_name"),
                            "v_id": item.get("v_id"),
                            "v_xid": item.get("v_xid"),
                            "link": item.get("link"),
@@ -748,6 +756,7 @@ def integrate_data():
                    time.sleep(5)
                    break

+
 def parse_args() -> argparse.Namespace:
    global MACHINE_ID, MAX_WORKERS

@@ -778,10 +787,11 @@ def parse_args() -> argparse.Namespace:
        raise ValueError("请指定机器编号")
    return args

+
 if __name__ == '__main__':
    parse_args()
    start_time = datetime.datetime.now()
    print(f"开始时间：{start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    integrate_data()
    end_time = datetime.datetime.now()
-    duration = end_time - start_time
+    duration = end_time - start_time