From fe96e23cc2f5cdb4b322b92f4415187d562fdd10 Mon Sep 17 00:00:00 2001 From: xiaofeng wang Date: Thu, 17 Jul 2025 14:21:15 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E8=A7=86=E9=A2=91?= =?UTF-8?q?=E4=BF=A1=E6=81=AF=E5=A4=84=E7=90=86=E5=92=8C=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- oneget.py | 832 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 467 insertions(+), 365 deletions(-) diff --git a/oneget.py b/oneget.py index 6826fd8..dbd9df7 100644 --- a/oneget.py +++ b/oneget.py @@ -1,3 +1,6 @@ +import base64 +from datetime import datetime + import requests import uuid import random @@ -7,11 +10,39 @@ from threading import Lock import logging from DB import DBVidcon import json +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +MACHINE_ID = 3 logger = logging.getLogger(__name__) db = DBVidcon() proxiesdict = db.get_proxy_agent_dict() +def clean_dash_to_zero(val): + if val in ('-', '', None): + return 0 + try: + return int(val) + except (ValueError, TypeError) as e: + logger.exception(f"[字段异常] val = {val} → {str(e)}") + return 0 + + +def format_create_time(timestr): + try: + dt = date_parser.isoparse(timestr) + return dt.strftime("%Y-%m-%d %H:%M:%S") + except Exception as e: + logger.exception(f"[时间格式错误] {timestr} → {str(e)}") + return "1970-01-01 00:00:00" + + +def format_duration(seconds): + try: + seconds = int(seconds) + return f"{seconds // 60:02}:{seconds % 60:02}" + except Exception: + return "00:00" class DMHeaderManager: _headers_template = { @@ -124,400 +155,471 @@ class DMHeaderManager: return new_headers -kwdata = db.get_web_items() -if not kwdata: - logger.error("没有获取到关键词数据") - exit(1) -kwdata = kwdata[0][1] -rn = kwdata['rn'] -proxy_name = proxiesdict.get(rn) -proxies_str = db.get_proxy(proxy_name, '-1') -proxies = { - 'http': proxies_str, - 'https': 
proxies_str -} -kw = kwdata['keyword'] -print(kw) -print("=" * 30) -dmheader_manager = DMHeaderManager(proxies=proxies) +class DMVideoInfo: + def __init__(self, proxies: dict = None, max_retries: int = 3, backoff_factor: float = 0.5): + self.proxies = proxies + self.max_retries = max_retries + self.backoff_factor = backoff_factor + self.session = self._create_session() -headers = dmheader_manager.get_headers() + def _create_session(self): + session = requests.Session() + retry = Retry( + total=self.max_retries, + connect=self.max_retries, + read=self.max_retries, + backoff_factor=self.backoff_factor, + status_forcelist=[500, 502, 503, 504], + allowed_methods=["GET"] + ) + adapter = HTTPAdapter(max_retries=retry) + session.mount("http://", adapter) + session.mount("https://", adapter) -data = { - "operationName": "SEARCH_QUERY", - "variables": { - "query": kw, - "shouldIncludeTopResults": True, # 是否包含热门结果 - "shouldIncludeChannels": False, # 是否包含频道 - "shouldIncludePlaylists": False, # 是否包含播放列表 - "shouldIncludeHashtags": False, # 是否包含标签 - "shouldIncludeVideos": False, # 是否包含视频 - "shouldIncludeLives": False, # 是否包含直播 - "page": 1, - "limit": 20, - "recaptchaToken": None - }, - "query": """ -fragment VIDEO_BASE_FRAGMENT on Video { - id - xid - title - createdAt - duration - aspectRatio - thumbnail(height: PORTRAIT_240) { - id - url - __typename - } - creator { - id - xid - name - displayName - accountType - avatar(height: SQUARE_60) { - id - url - __typename + if self.proxies: + session.proxies.update(self.proxies) + + return session + + def get_video_info(self, data: dict) -> dict: + v_xid = data.get('v_xid') + url = f'https://api.dailymotion.com/video/{v_xid}' + params = { + 'fields': 'id,title,created_time,thumbnail_240_url,duration,' + 'owner.id,owner.screenname,likes_total,views_total,' + 'owner.avatar_60_url,owner.followers_total,owner.videos_total' + } + + try: + resp = self.session.get(url, params=params, timeout=10) + resp.raise_for_status() + r_data = 
resp.json()
+            xid = r_data["id"]
+            vid = base64.b64encode(f"Video:{xid}".encode('utf-8')).decode('utf-8')
+            uxid = r_data["owner.id"]
+            uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8')
+            data["v_id"] = vid
+            data["v_title"] = r_data["title"]
+            data["link"] = "https://www.dailymotion.com/video/" + xid
+            data["duration"] = r_data["duration"]
+            data['createdtime'] = datetime.fromtimestamp(r_data.get("created_time")).strftime("%Y-%m-%d %H:%M:%S")
+            return data
+        except requests.RequestException as e:
+            print(f"[ERROR] 请求失败 vxid={v_xid} : {e}")
+            return None
+
+
+
+def main():
+    kwdata = db.get_web_items()
+    if not kwdata:
+        logger.error("没有获取到关键词数据")
+        exit(1)
+
+    kwdata = kwdata[0][1]
+    rn = kwdata['rn']
+    proxy_name = proxiesdict.get(rn)
+    proxies_str = db.get_proxy(proxy_name, '-1')
+    proxies = {
+        'http': proxies_str,
+        'https': proxies_str
+    }
+    kw = kwdata['keyword']
+
+    dmheader_manager = DMHeaderManager(proxies=proxies)
+
+    headers = dmheader_manager.get_headers()
+    for i in range(1, 11):
+        data = {
+            "operationName": "SEARCH_QUERY",
+            "variables": {
+                "query": kw,
+                "shouldIncludeTopResults": True,  # 是否包含热门结果
+                "shouldIncludeChannels": False,  # 是否包含频道
+                "shouldIncludePlaylists": False,  # 是否包含播放列表
+                "shouldIncludeHashtags": False,  # 是否包含标签
+                "shouldIncludeVideos": False,  # 是否包含视频
+                "shouldIncludeLives": False,  # 是否包含直播
+                "page": i,
+                "limit": 20,
+                "recaptchaToken": None
+            },
+            "query": """
+            fragment VIDEO_BASE_FRAGMENT on Video {
+              id
+              xid
+              title
+              createdAt
+              duration
+              aspectRatio
+              thumbnail(height: PORTRAIT_240) {
+                id
+                url
+                __typename
+              }
+              creator {
+                id
+                xid
+                name
+                displayName
+                accountType
+              
avatar(height: SQUARE_60) { + id + url + __typename + } __typename } __typename } - __typename - } - __typename - } - __typename - } - __typename -} - -fragment PLAYLIST_BASE_FRAG on Collection { - id - xid - name - description - thumbnail(height: PORTRAIT_240) { - id - url - __typename - } - creator { - id - xid - name - displayName - accountType - avatar(height: SQUARE_60) { - id - url - __typename - } - __typename - } - metrics { - id - engagement { - id - videos(filter: {visibility: {eq: PUBLIC}}) { - edges { - node { + + fragment CHANNEL_BASE_FRAG on Channel { + id + xid + name + displayName + accountType + isFollowed + avatar(height: SQUARE_120) { id - total + url + __typename + } + followerEngagement { + id + followDate + __typename + } + metrics { + id + engagement { + id + followers { + edges { + node { + id + total + __typename + } + __typename + } + __typename + } + __typename + } __typename } __typename } - __typename - } - __typename - } - __typename - } - __typename -} - -fragment HASHTAG_BASE_FRAG on Hashtag { - id - xid - name - metrics { - id - engagement { - id - videos { - edges { - node { + + fragment PLAYLIST_BASE_FRAG on Collection { + id + xid + name + description + thumbnail(height: PORTRAIT_240) { id - total + url + __typename + } + creator { + id + xid + name + displayName + accountType + avatar(height: SQUARE_60) { + id + url + __typename + } + __typename + } + metrics { + id + engagement { + id + videos(filter: {visibility: {eq: PUBLIC}}) { + edges { + node { + id + total + __typename + } + __typename + } + __typename + } + __typename + } __typename } __typename } - __typename - } - __typename - } - __typename - } - __typename -} - -fragment LIVE_BASE_FRAGMENT on Live { - id - xid - title - audienceCount - aspectRatio - isOnAir - thumbnail(height: PORTRAIT_240) { - id - url - __typename - } - creator { - id - xid - name - displayName - accountType - avatar(height: SQUARE_60) { - id - url - __typename - } - __typename - } - __typename -} 
- -query SEARCH_QUERY( - $query: String!, - $shouldIncludeTopResults: Boolean!, - $shouldIncludeVideos: Boolean!, - $shouldIncludeChannels: Boolean!, - $shouldIncludePlaylists: Boolean!, - $shouldIncludeHashtags: Boolean!, - $shouldIncludeLives: Boolean!, - $page: Int, - $limit: Int, - $sortByVideos: SearchVideoSort, - $durationMinVideos: Int, - $durationMaxVideos: Int, - $createdAfterVideos: DateTime, - $recaptchaToken: String -) { - search(token: $recaptchaToken) { - id - - stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - ...VIDEO_BASE_FRAGMENT - ...CHANNEL_BASE_FRAG - ...PLAYLIST_BASE_FRAG - ...HASHTAG_BASE_FRAG - ...LIVE_BASE_FRAGMENT - __typename - } - __typename - } - __typename - } - - videos( - query: $query, - first: $limit, - page: $page, - sort: $sortByVideos, - durationMin: $durationMinVideos, - durationMax: $durationMaxVideos, - createdAfter: $createdAfterVideos - ) @include(if: $shouldIncludeVideos) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { + + fragment HASHTAG_BASE_FRAG on Hashtag { id - ...VIDEO_BASE_FRAGMENT + xid + name + metrics { + id + engagement { + id + videos { + edges { + node { + id + total + __typename + } + __typename + } + __typename + } + __typename + } + __typename + } __typename } - __typename - } - __typename - } - - lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { + + fragment LIVE_BASE_FRAGMENT on Live { id - ...LIVE_BASE_FRAGMENT + xid + title + audienceCount + aspectRatio + isOnAir + thumbnail(height: PORTRAIT_240) { + id + url + __typename + } + creator { 
+ id + xid + name + displayName + accountType + avatar(height: SQUARE_60) { + id + url + __typename + } + __typename + } __typename } - __typename - } - __typename - } - - channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) { - metadata { - id - algorithm { - uuid - __typename + + query SEARCH_QUERY( + $query: String!, + $shouldIncludeTopResults: Boolean!, + $shouldIncludeVideos: Boolean!, + $shouldIncludeChannels: Boolean!, + $shouldIncludePlaylists: Boolean!, + $shouldIncludeHashtags: Boolean!, + $shouldIncludeLives: Boolean!, + $page: Int, + $limit: Int, + $sortByVideos: SearchVideoSort, + $durationMinVideos: Int, + $durationMaxVideos: Int, + $createdAfterVideos: DateTime, + $recaptchaToken: String + ) { + search(token: $recaptchaToken) { + id + + stories(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeTopResults) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + ...VIDEO_BASE_FRAGMENT + ...CHANNEL_BASE_FRAG + ...PLAYLIST_BASE_FRAG + ...HASHTAG_BASE_FRAG + ...LIVE_BASE_FRAGMENT + __typename + } + __typename + } + __typename + } + + videos( + query: $query, + first: $limit, + page: $page, + sort: $sortByVideos, + durationMin: $durationMinVideos, + durationMax: $durationMaxVideos, + createdAfter: $createdAfterVideos + ) @include(if: $shouldIncludeVideos) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...VIDEO_BASE_FRAGMENT + __typename + } + __typename + } + __typename + } + + lives(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeLives) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...LIVE_BASE_FRAGMENT + __typename + } + __typename + } + __typename + } + + 
channels(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeChannels) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...CHANNEL_BASE_FRAG + __typename + } + __typename + } + __typename + } + + playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...PLAYLIST_BASE_FRAG + __typename + } + __typename + } + __typename + } + + hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) { + metadata { + id + algorithm { + uuid + __typename + } + __typename + } + pageInfo { + hasNextPage + nextPage + __typename + } + edges { + node { + id + ...HASHTAG_BASE_FRAG + __typename + } + __typename + } + __typename + } + + __typename + } } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...CHANNEL_BASE_FRAG - __typename + """ } - __typename - } - __typename - } - playlists: collections(query: $query, first: $limit, page: $page) @include(if: $shouldIncludePlaylists) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...PLAYLIST_BASE_FRAG - __typename - } - __typename - } - __typename - } + payload = json.dumps(data).encode() - hashtags(query: $query, first: $limit, page: $page) @include(if: $shouldIncludeHashtags) { - metadata { - id - algorithm { - uuid - __typename - } - __typename - } - pageInfo { - hasNextPage - nextPage - __typename - } - edges { - node { - id - ...HASHTAG_BASE_FRAG - __typename - } - __typename - } - __typename - } + response = requests.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload, + proxies=proxies) - __typename - } -} 
-""" -} + data = response.json() + edges = data['data']['search']['stories']['edges'] + edges_len = len(edges) + dm_video_info = DMVideoInfo(proxies=proxies) + tancks = [] + for j, edge in enumerate(edges): + node = edge.get("node", {}) + tancks.append({ + "keyword": kw, + "v_name": kwdata.get("v_name", ""), + "v_xid": node.get("xid"), + "batch": kwdata.get("batch"), + "rn": kwdata.get("rn"), + "machine_id": MACHINE_ID, + "index": (i - 1) * 20 + j + 1, + "level": 0, + }) -payload = json.dumps(data).encode() - -response = requests.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload, - proxies=proxies) - -data = response.json() -edges = data['data']['search']['stories']['edges'] - -for i, edge in enumerate(edges): - print(i, edge['node']['xid']) + if edges_len < 20: + break