From 2c0a55232d4c4569764a98c44f006efe93c7cafb Mon Sep 17 00:00:00 2001
From: Franklin-F <dewujie64@gmail.com>
Date: Sun, 1 Jun 2025 15:01:57 +0800
Subject: [PATCH] feat:  using Playwright

---
 getVideoInfoForPlaywright.py | 133 +++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 getVideoInfoForPlaywright.py

diff --git a/getVideoInfoForPlaywright.py b/getVideoInfoForPlaywright.py
new file mode 100644
index 0000000..0690231
--- /dev/null
+++ b/getVideoInfoForPlaywright.py
@@ -0,0 +1,133 @@
+import json
+import random
+import time
+
+from box import Box
+from playwright.sync_api import sync_playwright
+from DB import DBVidcon, DBSA
+from urllib.parse import quote, quote_plus
+
+db = DBVidcon()
+kitm, flag = db.item_keyword(1)  # NOTE(review): presumably one keyword row; flag is unused
+kw = json.loads(kitm[0][0]).get('keyword')  # first cell of the first row holds JSON text
+print(kw)
+parsekw = quote(kw, safe="")  # safe="" escapes "/" too, so the keyword stays one path segment
+url = f"https://www.dailymotion.com/search/{parsekw}/top-results"
+print(url)
+
+target_count = 200  # collection stops at, and is truncated to, this many search edges
+
+
+def wait_for_search_response(page, timeout: int = 30000, action=None):
+    """Return the next HTTP 200 GraphQL ``SEARCH_QUERY`` response on ``page``.
+
+    ``action`` (optional callable) runs while the wait is armed, so a response
+    it triggers is not missed. ``timeout`` is in milliseconds.
+    """
+    def is_search(resp):  # URL/status first: post_data is only read for API calls
+        api_hit = "https://graphql.api.dailymotion.com" in resp.url and resp.status == 200
+        return api_hit and '"operationName":"SEARCH_QUERY"' in (resp.request.post_data or "")
+    with page.expect_response(is_search, timeout=timeout) as resp_info:
+        if action is not None:
+            action()
+    return resp_info.value
+
+def human_scroll(page):
+    """Scroll ``page`` down in several jittery wheel steps, imitating a person.
+
+    Does 4-6 downward wheel scrolls of 40-60% of the viewport height each (to
+    trigger loading of the next batch), then with 30% probability 1-2 small
+    upward scrolls of 5-10%. Every scroll is preceded by a random mouse move.
+    Falls back to 1280x720 when the page reports no fixed viewport size.
+    """
+    vs = page.viewport_size or {"width": 1280, "height": 720}
+
+    def jitter_then_wheel(low, high, direction, pause):
+        # Random pointer move first, somewhere in the lower two thirds.
+        x = random.randint(0, vs["width"])
+        y = random.randint(vs["height"] // 3, vs["height"])
+        page.mouse.move(x, y, steps=random.randint(5, 15))
+        page.wait_for_timeout(random.randint(50, 100))
+        # Wheel by a random fraction of the viewport height, then pause (ms).
+        delta = int(vs["height"] * random.uniform(low, high))
+        page.mouse.wheel(0, direction * delta)
+        page.wait_for_timeout(random.randint(*pause))
+
+    for _ in range(random.randint(4, 6)):
+        jitter_then_wheel(0.4, 0.6, 1, (200, 400))
+
+    # Occasional small back-scroll; kept rare and short so it does not undo
+    # the downward progress that triggers the next load.
+    if random.random() < 0.3:
+        for _ in range(random.randint(1, 2)):
+            jitter_then_wheel(0.05, 0.1, -1, (150, 300))
+
+
+with sync_playwright() as pw:
+    # Headed Chromium behind a local HTTP proxy (address is hard-coded).
+    browser = pw.chromium.launch(
+        headless=False,
+        proxy={"server": "http://127.0.0.1:7890"},
+    )
+    # Batch size the loop below treats as "a full page, more may follow";
+    # a shorter non-empty batch is taken to be the last page.
+    page_size = 20
+
+    def handle_route(route, request):
+        """Abort image and font requests; let every other request continue."""
+        if request.resource_type in ("image", "font"):
+            return route.abort()
+        return route.continue_()
+
+    def fetch_edges(**wait_kwargs):
+        """Wait for the next SEARCH_QUERY response and return its edges as a list.
+
+        ``wait_kwargs`` are forwarded to ``wait_for_search_response``. A payload
+        lacking ``data.search.stories.edges`` (for example ``data`` is null)
+        yields ``[]`` instead of raising, so the reload retry can deal with it.
+        """
+        payload = Box(wait_for_search_response(page, **wait_kwargs).json())
+        try:
+            return list(payload.data.search.stories.edges or [])
+        except (AttributeError, KeyError):
+            return []
+
+    def fetch_edges_with_retry(**wait_kwargs):
+        """Return the next batch; if it is empty, reload the page and try once more."""
+        batch = fetch_edges(**wait_kwargs)
+        if not batch:
+            # Start waiting right after reload(). Waiting for "networkidle"
+            # first would let the search response pass before the wait is
+            # armed, and the wait would then run into its timeout.
+            page.reload()
+            batch = fetch_edges()
+        return batch
+
+    try:  # guarantees browser.close() even when a wait times out
+        page = browser.new_page()
+        page.route("**/*", handle_route)
+
+        # NOTE(review): the wait is armed only after goto() has returned; if the
+        # search response can arrive earlier than that it is missed - confirm.
+        page.goto(url)
+        edges = fetch_edges_with_retry(timeout=20000)
+        all_edges = list(edges)
+
+        # Scroll for more only when the first batch was exactly one full page.
+        # Stop at target_count, on a short batch (last page), or when even a
+        # reload produced nothing (which would otherwise repeat without bound).
+        if len(edges) == page_size:
+            while len(all_edges) < target_count:
+                human_scroll(page)
+                time.sleep(0.5)
+                # NOTE(review): a batch fetched after a reload may repeat
+                # earlier results; de-duplicate downstream if that matters.
+                edges = fetch_edges_with_retry()
+                all_edges.extend(edges)
+                if len(edges) < page_size:
+                    break
+
+        # Keep at most target_count edges.
+        all_edges = all_edges[:target_count]
+    finally:
+        browser.close()