diff --git a/main.py b/main.py
index 3ee4209..bb57755 100644
--- a/main.py
+++ b/main.py
@@ -16,26 +16,11 @@ import copy
 from threading import Lock
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from logger import logger
-import os
-import urllib3
-
 
 db = DBVidcon()
 MACHINE_ID = None
 MAX_WORKERS = 10
 executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
-UserAgent = [
-    'User-Agent,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
-    'User-Agent,Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
-    'User-Agent,Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.0.3.4000 Chrome/47.0.2526.73 Safari/537.36',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)']
 
 
 def get_part_ids(part_num: int, take: int, offset: int = 0):
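Review note on the hunk above: the deleted UserAgent list was not just dead weight. Several entries embedded the header name in the header value, e.g. 'User-Agent,Mozilla/5.0 ...', so any request built from them would have sent the literal prefix "User-Agent," inside the UA string. If rotation is ever wanted again, a minimal sketch follows; the two version strings are illustrative only:

    import random

    # Header values only: the header name belongs in the dict key,
    # which is what roughly half of the deleted entries got wrong.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    ]

    def random_ua_header() -> dict:
        # random.choice reads better than the UserAgent[random.randint(...)]
        # indexing the old gettoken() used.
        return {'User-Agent': random.choice(USER_AGENTS)}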
@@ -77,178 +62,6 @@ def format_duration(seconds):
     return "00:00"
 
 
-
-headers1 = {
-    'Accept': '*/*, */*',
-    # 'Accept-Encoding': 'gzip, deflate, br',
-    'Cache-Control': 'no-cache',
-    'Connection': 'keep-alive',
-    # 'Content-Length': '8512',
-    'Content-Type': 'application/json, application/json',
-    'Host': 'graphql.api.dailymotion.com',
-    'Origin': 'https://www.dailymotion.com',
-    'Referer': 'https://www.dailymotion.com/',
-    'Sec-Fetch-Dest': 'empty',
-    'Sec-Fetch-Mode': 'cors',
-    'Sec-Fetch-Site': 'same-site',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
-    'X-DM-AppInfo-Id': 'com.dailymotion.neon',
-    'X-DM-AppInfo-Type': 'website',
-    'X-DM-AppInfo-Version': 'v2025-05-26T13:45:05.666Z',
-    'X-DM-Neon-SSR': '0',
-    'X-DM-Preferred-Country': 'tw',
-    'accept-language': 'zh-CN',
-    'authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhaWQiOiJmMWEzNjJkMjg4YzFiOTgwOTljNyIsInJvbCI6ImNhbi1tYW5hZ2UtcGFydG5lcnMtcmVwb3J0cyBjYW4tcmVhZC12aWRlby1zdHJlYW1zIGNhbi1zcG9vZi1jb3VudHJ5IGNhbi1hZG9wdC11c2VycyBjYW4tcmVhZC1jbGFpbS1ydWxlcyBjYW4tbWFuYWdlLWNsYWltLXJ1bGVzIGNhbi1tYW5hZ2UtdXNlci1hbmFseXRpY3MgY2FuLXJlYWQtbXktdmlkZW8tc3RyZWFtcyBjYW4tZG93bmxvYWQtbXktdmlkZW9zIGFjdC1hcyBhbGxzY29wZXMgYWNjb3VudC1jcmVhdG9yIGNhbi1yZWFkLWFwcGxpY2F0aW9ucyIsInNjbyI6InJlYWQgd3JpdGUgZGVsZXRlIGVtYWlsIHVzZXJpbmZvIGZlZWQgbWFuYWdlX3ZpZGVvcyBtYW5hZ2VfY29tbWVudHMgbWFuYWdlX3BsYXlsaXN0cyBtYW5hZ2VfdGlsZXMgbWFuYWdlX3N1YnNjcmlwdGlvbnMgbWFuYWdlX2ZyaWVuZHMgbWFuYWdlX2Zhdm9yaXRlcyBtYW5hZ2VfbGlrZXMgbWFuYWdlX2dyb3VwcyBtYW5hZ2VfcmVjb3JkcyBtYW5hZ2Vfc3VidGl0bGVzIG1hbmFnZV9mZWF0dXJlcyBtYW5hZ2VfaGlzdG9yeSBpZnR0dCByZWFkX2luc2lnaHRzIG1hbmFnZV9jbGFpbV9ydWxlcyBkZWxlZ2F0ZV9hY2NvdW50X21hbmFnZW1lbnQgbWFuYWdlX2FuYWx5dGljcyBtYW5hZ2VfcGxheWVyIG1hbmFnZV9wbGF5ZXJzIG1hbmFnZV91c2VyX3NldHRpbmdzIG1hbmFnZV9jb2xsZWN0aW9ucyBtYW5hZ2VfYXBwX2Nvbm5lY3Rpb25zIG1hbmFnZV9hcHBsaWNhdGlvbnMgbWFuYWdlX2RvbWFpbnMgbWFuYWdlX3BvZGNhc3RzIiwibHRvIjoiY0c1Z1RocGRBbFIwVEVZeVhEVWNBMnNDTDFrUFFncDNRUTBNS3ciLCJhaW4iOjEsImFkZyI6MSwiaWF0IjoxNzQ4NTI0MDU5LCJleHAiOjE3NDg1NjAwMDcsImRtdiI6IjEiLCJhdHAiOiJicm93c2VyIiwiYWRhIjoid3d3LmRhaWx5bW90aW9uLmNvbSIsInZpZCI6IjY0NjMzRDAzMDY1RjQxODZBRDBCMDI3Q0Y3OTVFRjBGIiwiZnRzIjo5MTE0MSwiY2FkIjoyLCJjeHAiOjIsImNhdSI6Miwia2lkIjoiQUY4NDlERDczQTU4NjNDRDdEOTdEMEJBQjA3MjI0M0IifQ.h27sfMMETgt0xKhQvFAGIpwInouNj2sFLOeb1Y74Orc',
-    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
-    'sec-ch-ua-mobile': '?0',
-    'sec-ch-ua-platform': '"Windows"',
-    'x-dm-visit-id': '1748480937099',
-    'x-dm-visitor-id': '1032a5f1-d07f-4bef-b96d-7783939abfc9',
-}
-_headers_cache = None  # the most recent successful headers
-_cache_lock = Lock()
-Gproxies = None
-
-
-def get_proxies(g):
-    url = "https://www.kookeey.com/pickdynamicips"
-    params = {
-        "auth": "pwd",
-        "format": "1",
-        "n": "1",
-        "p": "http",
-        "gate": "sea",
-        "g": g,
-        "r": "0",
-        "type": "json",
-        "sign": "10099426b05c7119e9c4dbd6a7a0aa4e",
-        "accessid": "2207189",
-        "dl": ","
-    }
-    try:
-        response = requests.get(url, params=params)
-    except RequestException:
-        return get_proxies(g)
-    try:
-        proxy_data = response.json()['data'][0]
-    except Exception:
-        logger.exception(g)
-        logger.exception("Failed to parse returned data! " + str(response.text))
-        time.sleep(5)
-        return get_proxies(g)
-    proxies_url = f"http://{proxy_data['username']}:{proxy_data['password']}@{proxy_data['ip']}:{proxy_data['port']}"
-    proxies = {
-        "http": proxies_url,
-        "https": proxies_url,
-    }
-    return proxies
-
-
-def post_with_retry(url, proxy_name, json_payload=None, data=None, headers=None,
-                    retries=5, timeout=10, backoff_factor=2, verbose=True):
-    token_refreshed = False
-    for attempt in range(1, retries + 1):
-        try:
-            proxy_str = db.get_proxy(proxy_name)
-
-            proxies = {"http": proxy_str, "https": proxy_str}
-
-            resp = requests.post(
-                url,
-                json=json_payload,
-                data=data,
-                headers=headers,
-                proxies=proxies,
-                timeout=timeout,
-            )
-            if resp.status_code == 401 and not token_refreshed:
-                if verbose:
-                    logger.info("[post_with_retry] got 401, refreshing token and retrying")
-                gettoken()
-                token_refreshed = True
-                continue
-
-            resp.raise_for_status()
-            return resp
-
-        except RequestException as e:
-            if verbose:
-                logger.info(f"[{attempt}/{retries}] request failed: {e}")
-            # if the token has not been refreshed yet, refresh it once
-            if not token_refreshed:
-                if verbose:
-                    logger.info("[post_with_retry] refreshing token, then retrying")
-                gettoken(proxy_name)
-                token_refreshed = True
-                continue
-            if attempt == retries:
-                if verbose:
-                    logger.info(f"[post_with_retry] final failure: {url}")
-                return None
-
-            sleep_time = backoff_factor * (2 ** (attempt - 1))
-            if verbose:
-                logger.info(f"[post_with_retry] waiting {sleep_time}s before retrying…")
-            time.sleep(sleep_time)
-
-
-def gettoken(proxy, r=2):
-    global _headers_cache
-    headers = {
-        'Accept': '*/*',
-        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-        'Cache-Control': 'no-cache',
-        'Connection': 'keep-alive',
-        'Content-Type': 'application/x-www-form-urlencoded',
-        'Origin': 'https://www.dailymotion.com',
-        'Pragma': 'no-cache',
-        'Referer': 'https://www.dailymotion.com/',
-        'Sec-Fetch-Dest': 'empty',
-        'Sec-Fetch-Mode': 'cors',
-        'Sec-Fetch-Site': 'same-site',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
-        'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"Windows"',
-    }
-    u = uuid.uuid4()
-    uuid_with_dash = str(u)
-    # uuid_no_dash = u.hex
-    traffic_segment = str(random.randint(100_000, 999_999))
-    data = {
-        'client_id': 'f1a362d288c1b98099c7',
-        'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
-        'grant_type': 'client_credentials',
-        'traffic_segment': traffic_segment,
-        'visitor_id': uuid_with_dash,
-    }
-    try:
-        proxy_str = db.get_proxy(proxy)
-        url = 'https://graphql.api.dailymotion.com/oauth/token'
-        response = requests.post(url, headers=headers, data=data, proxies={"http": proxy_str, "https": proxy_str})
-        token = response.json()['access_token']
-        copy_headers = copy.deepcopy(headers1)
-        copy_headers['authorization'] = "Bearer " + token
-        copy_headers['x-dm-visit-id'] = str(int(time.time() * 1000))
-        copy_headers['x-dm-visitor-id'] = uuid_with_dash
-        copy_headers['User-Agent'] = UserAgent[random.randint(0, len(UserAgent) - 1)]
-        copy_headers['X-DM-Preferred-Country'] = proxy.lower()
-        with _cache_lock:
-            _headers_cache = copy_headers
-        return copy_headers
-    except Exception as e:
-        logger.exception("[gettoken] failed:", e)
-        if r > 0:
-            time.sleep(5)
-            return gettoken(proxy, r - 1)
-        else:
-            with _cache_lock:
-                if _headers_cache:
-                    logger.info("[gettoken] falling back to cached headers")
-                    return copy.deepcopy(_headers_cache)
-            # still nothing → return the template (without Auth)
-            return copy.deepcopy(headers1)
-
-
 def get_searchInfo(keyword, level, headers, proxy_name, r=2):
     if r == 2:
         logger.info(f"NET processing->{keyword},\trn->{proxy_name},\tlevel->{level}")
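Review note: besides being unused after this patch, the block deleted above hard-coded live-looking secrets (a Bearer JWT, an OAuth client_secret, and a kookeey sign/accessid pair), and its 401 branch called gettoken() without the required proxy argument, a latent TypeError. Since the values remain in history, they should be treated as burned. The one idea worth keeping is post_with_retry's exponential backoff; a minimal re-sketch without the token plumbing, assuming plain requests kwargs:

    import time
    import requests
    from requests import RequestException

    def post_with_retry(url, retries=5, timeout=10, backoff_factor=2, **kwargs):
        # Same schedule as the deleted helper:
        # backoff_factor * 2 ** (attempt - 1) -> 2s, 4s, 8s, 16s, ...
        for attempt in range(1, retries + 1):
            try:
                resp = requests.post(url, timeout=timeout, **kwargs)
                resp.raise_for_status()
                return resp
            except RequestException:
                if attempt == retries:
                    return None  # give up quietly, mirroring the old behaviour
                time.sleep(backoff_factor * (2 ** (attempt - 1)))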
@@ -270,13 +83,13 @@ def get_searchInfo(keyword, level, headers, proxy_name, r=2):
     proxy_string = db.get_proxy(proxy_name)
     logger.info(f"Proxy: {proxy_string}")
     proxies = {
-        'http':proxy_string,
-        'https':proxy_string,
+        'http': proxy_string,
+        'https': proxy_string,
     }
     response = requests.get(endpoint, params=params, proxies=proxies)
     jsondata = response.json()
     try:
-        resinfo=jsondata.get("list")
+        resinfo = jsondata.get("list")
     except Exception:
         if r < 0:
             logger.exception("[search API] unknown: unhandled", response.text)
@@ -322,12 +135,11 @@ proxiesdict = db.get_proxy_agent_dict()
 def search_worker(payload, kitem, flag):
     try:
         gproxies = proxiesdict[kitem['rn']]
-        header = gettoken(gproxies)
-        v_list = get_searchInfo(kitem['keyword'], kitem['level'], header, gproxies)
+        v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
         if not v_list:
             for i in range(2):
                 time.sleep(i * 5)
-                v_list = get_searchInfo(kitem['keyword'], kitem['level'], header, gproxies)
+                v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
                 if v_list:
                     break
                 time.sleep(2)
@@ -340,9 +152,6 @@ def search_worker(payload, kitem, flag):
         return False, flag, payload, kitem, []  # failure
 
 
-executor = concurrent.futures.ThreadPoolExecutor(MAX_WORKERS)
-
-
 def integrate_data_parallel():
     while True:
         global proxiesdict
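Review note: with gettoken() and headers1 gone, search_worker now passes None for get_searchInfo's headers parameter on both the first call and the retries. If nothing inside get_searchInfo still reads it, the parameter could simply be dropped from the signature; if something does, a small None-guard keeps the new call sites safe. A hypothetical sketch, where ensure_headers and its contents are placeholders rather than part of this patch:

    def ensure_headers(headers=None) -> dict:
        # Fallback for callers that pass None, as search_worker now does;
        # the real anonymous defaults (if any) belong to get_searchInfo.
        return {'Accept': '*/*'} if headers is None else headers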
@@ -401,6 +210,7 @@ def integrate_data_parallel():
             db.rollback_l2(rollback[2])
 
         time.sleep(10)
 
+
 def parse_args() -> argparse.Namespace:
     global MACHINE_ID, MAX_WORKERS
@@ -434,6 +244,7 @@ def parse_args() -> argparse.Namespace:
 
 if __name__ == '__main__':
     parse_args()
+    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
     start_time = datetime.datetime.now()
     logger.info(f"Start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    integrate_data_parallel()
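Review note: after this patch the pool is still constructed twice -- once at import time (the module-level executor kept as context at the top of the diff, with the default MAX_WORKERS = 10) and once here, after parse_args() has had a chance to overwrite MAX_WORKERS (presumably via a --workers flag; parse_args' body is outside this diff). The import-time pool is harmless in practice because ThreadPoolExecutor only spawns threads on first submit(), but building the pool lazily would make the ordering explicit. A sketch:

    from concurrent.futures import ThreadPoolExecutor

    MAX_WORKERS = 10  # parse_args() may overwrite this global
    _executor = None  # built on first use, never at import time

    def get_executor() -> ThreadPoolExecutor:
        # Lazy singleton: whatever MAX_WORKERS holds at first submission
        # wins, so CLI parsing is guaranteed to happen first.
        global _executor
        if _executor is None:
            _executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
        return _executor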