feat: 添加重试机制和并发处理以优化视频信息请求

This commit is contained in:
晓丰 2025-07-17 16:38:18 +08:00
parent b512b05e8b
commit d1307039c8

133
oneget.py
View File

@ -1,6 +1,6 @@
import base64 import base64
from datetime import datetime from datetime import datetime
import concurrent.futures
import requests import requests
import uuid import uuid
import random import random
@ -8,16 +8,57 @@ import time
import copy import copy
from threading import Lock from threading import Lock
import logging import logging
from DB import DBVidcon from DB import DBVidcon, DBSA
import json import json
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry from urllib3.util.retry import Retry
from dateutil import parser as date_parser
MACHINE_ID = 3 MACHINE_ID = 3
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
db = DBVidcon() db = DBVidcon()
proxiesdict = db.get_proxy_agent_dict() proxiesdict = db.get_proxy_agent_dict()
class RetryRequests:
def __init__(
self,
proxies: dict = None,
timeout: int = 10,
total: int = 3,
backoff_factor: float = 1.0,
status_forcelist: tuple = (500, 502, 503, 504),
allowed_methods: tuple = ("GET", "POST"),
):
self.session = requests.Session()
self.timeout = timeout
self.proxies = proxies
retry = Retry(
total=total,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
allowed_methods=allowed_methods,
raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
def get(self, url, **kwargs):
kwargs.setdefault("timeout", self.timeout)
if self.proxies:
kwargs.setdefault("proxies", self.proxies)
return self.session.get(url, **kwargs)
def post(self, url, **kwargs):
kwargs.setdefault("timeout", self.timeout)
if self.proxies:
kwargs.setdefault("proxies", self.proxies)
return self.session.post(url, **kwargs)
req = RetryRequests()
def clean_dash_to_zero(val): def clean_dash_to_zero(val):
if val in ('-', '', None): if val in ('-', '', None):
return 0 return 0
@ -44,6 +85,7 @@ def format_duration(seconds):
except Exception: except Exception:
return "00:00" return "00:00"
class DMHeaderManager: class DMHeaderManager:
_headers_template = { _headers_template = {
'Accept': '*/*, */*', 'Accept': '*/*, */*',
@ -86,22 +128,6 @@ class DMHeaderManager:
self._proxies = proxies self._proxies = proxies
def get_headers(self, retry: int = 2) -> dict: def get_headers(self, retry: int = 2) -> dict:
for attempt in range(retry + 1):
try:
return self._generate_headers()
except Exception as e:
logger.warning(f"[get_headers] 第 {attempt + 1} 次尝试失败: {e}")
time.sleep(2)
with self._cache_lock:
if self._headers_cache:
logger.info("[get_headers]")
return copy.deepcopy(self._headers_cache)
logger.warning("[get_headers] 基础 headers")
return copy.deepcopy(self._headers_template)
def _generate_headers(self) -> dict:
visitor_id = str(uuid.uuid4()) visitor_id = str(uuid.uuid4())
visit_id = str(int(time.time() * 1000)) visit_id = str(int(time.time() * 1000))
traffic_segment = str(random.randint(100_000, 999_999)) traffic_segment = str(random.randint(100_000, 999_999))
@ -133,7 +159,7 @@ class DMHeaderManager:
'visitor_id': visitor_id, 'visitor_id': visitor_id,
} }
response = requests.post( response = req.post(
'https://graphql.api.dailymotion.com/oauth/token', 'https://graphql.api.dailymotion.com/oauth/token',
headers=token_headers, headers=token_headers,
data=data, data=data,
@ -155,32 +181,11 @@ class DMHeaderManager:
return new_headers return new_headers
class DMVideoInfo: class DMVideoInfo:
def __init__(self, proxies: dict = None, max_retries: int = 3, backoff_factor: float = 0.5): def __init__(self, proxies: dict = None, max_retries: int = 3, backoff_factor: float = 0.5):
self.proxies = proxies self.proxies = proxies
self.max_retries = max_retries self.max_retries = max_retries
self.backoff_factor = backoff_factor self.backoff_factor = backoff_factor
self.session = self._create_session()
def _create_session(self):
session = requests.Session()
retry = Retry(
total=self.max_retries,
connect=self.max_retries,
read=self.max_retries,
backoff_factor=self.backoff_factor,
status_forcelist=[500, 502, 503, 504],
allowed_methods=["GET"]
)
adapter = HTTPAdapter(max_retries=retry)
session.mount("http://", adapter)
session.mount("https://", adapter)
if self.proxies:
session.proxies.update(self.proxies)
return session
def get_video_info(self, data: dict) -> dict: def get_video_info(self, data: dict) -> dict:
v_xid = data.get('v_xid') v_xid = data.get('v_xid')
@ -192,25 +197,37 @@ class DMVideoInfo:
} }
try: try:
resp = self.session.get(url, params=params, timeout=10) resp = req.get(url, params=params, timeout=10)
resp.raise_for_status() resp.raise_for_status()
r_data = resp.json() r_data = resp.json()
xid = r_data["id"] xid = r_data["id"]
vid = base64.b64encode(f"Video:{xid}".encode('utf-8')).decode('utf-8') vid = base64.b64encode(f"Video:{xid}".encode('utf-8')).decode('utf-8')
uxid = r_data["owner.id"] uxid = r_data["owner.id"]
uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8') uid = base64.b64encode(f"Channel:{uxid}".encode('utf-8')).decode('utf-8')
duration = r_data.get("duration", 0)
if duration < 30:
return None
data["v_id"] = vid data["v_id"] = vid
data["v_title"] = r_data["title"] data["title"] = r_data.get("title", "")
data["link"] = "https://www.dailymotion.com/video/" + xid, data["link"] = "https://www.dailymotion.com/video/" + xid
data["duration"] = r_data["duration"] data["duration"] = format_duration(r_data.get("duration", 0))
data['createdtime'] = datetime.fromtimestamp(r_data.get("created_time")).strftime("%Y-%m-%d %H:%M:%S"), data['create_time'] = format(
data[''] datetime.fromtimestamp(r_data.get("created_time")).strftime("%Y-%m-%d %H:%M:%S"))
data['fans'] = clean_dash_to_zero(r_data.get("owner.followers_total", 0))
data['videos'] = clean_dash_to_zero(r_data.get("owner.videos_total", 0))
data['watch_number'] = clean_dash_to_zero(r_data.get("views_total", 0))
data['cover_pic'] = r_data.get('thumbnail_240_url')
data['u_id'] = uid
data['u_xid'] = uxid
data['u_name'] = r_data.get("owner.screenname", "")
data['u_pic'] = r_data.get("owner.avatar_60_url", "")
DBSA.upsert_video(data)
DBSA.flush()
except requests.RequestException as e: except requests.RequestException as e:
print(f"[ERROR] 请求失败 vxid={v_xid} : {e}") print(f"[ERROR] 请求失败 vxid={v_xid} : {e}")
return None return None
def main(): def main():
kwdata = db.get_web_items() kwdata = db.get_web_items()
if not kwdata: if not kwdata:
@ -220,15 +237,15 @@ def main():
kwdata = kwdata[0][1] kwdata = kwdata[0][1]
rn = kwdata['rn'] rn = kwdata['rn']
proxy_name = proxiesdict.get(rn) proxy_name = proxiesdict.get(rn)
proxies_str = db.get_proxy(proxy_name, '-1') proxies_str = "http://127.0.0.1:10808"
# proxies_str = db.get_proxy(proxy_name, '-1')
proxies = { proxies = {
'http': proxies_str, 'http': proxies_str,
'https': proxies_str 'https': proxies_str
} }
kw = kwdata['keyword'] kw = kwdata['keyword']
dmheader_manager = DMHeaderManager(proxies=proxies) dmheader_manager = DMHeaderManager(proxies=proxies)
dmvideo_info = DMVideoInfo(proxies=proxies)
headers = dmheader_manager.get_headers() headers = dmheader_manager.get_headers()
for i in range(1, 11): for i in range(1, 11):
data = { data = {
@ -600,17 +617,16 @@ def main():
payload = json.dumps(data).encode() payload = json.dumps(data).encode()
response = requests.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload, response = req.post('https://graphql.api.dailymotion.com/', headers=headers, data=payload,
proxies=proxies) proxies=proxies)
data = response.json() data = response.json()
edges = data['data']['search']['stories']['edges'] edges = data['data']['search']['stories']['edges']
edges_len = len(edges) edges_len = len(edges)
dm_video_info = DMVideoInfo(proxies=proxies)
tancks = [] tancks = []
for j, edge in enumerate(edges): for j, edge in enumerate(edges):
node = edge.get("node", {}) node = edge.get("node", {})
tancks.append({ s_data = {
"keyword": kw, "keyword": kw,
"v_name": kwdata.get("v_name", ""), "v_name": kwdata.get("v_name", ""),
"v_xid": node.get("xid"), "v_xid": node.get("xid"),
@ -619,7 +635,14 @@ def main():
"machine_id": MACHINE_ID, "machine_id": MACHINE_ID,
"index": (i - 1) * 20 + j + 1, "index": (i - 1) * 20 + j + 1,
"level": 0, "level": 0,
}) }
tancks.append(s_data)
# 我想在这加入20 个线程池
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
executor.map(dmvideo_info.get_video_info, tancks)
if edges_len < 20: if edges_len < 20:
break break
if __name__ == '__main__':
main()