refactor: 清理无用代码并优化代理处理逻辑

This commit is contained in:
晓丰 2025-06-01 15:55:07 +08:00
parent db440b420a
commit 6d8dca5e24

203
main.py
View File

@ -16,26 +16,11 @@ import copy
from threading import Lock
from concurrent.futures import ThreadPoolExecutor, as_completed
from logger import logger
import os
import urllib3
db = DBVidcon()  # shared DB/queue handle (DBVidcon is defined elsewhere in the project)
MACHINE_ID = None  # machine/shard id; populated later by parse_args()
MAX_WORKERS = 10  # default thread-pool size; parse_args() may override it
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)  # NOTE(review): rebuilt in __main__ after parse_args() — this instance may be discarded
# Pool of desktop-browser User-Agent strings; gettoken() picks one at random
# for each token refresh.
# BUG FIX: several entries previously embedded a literal "User-Agent," /
# "User-Agent, " header-NAME prefix inside the header VALUE (a common
# copy-paste artifact), which produced malformed User-Agent headers such as
# "User-Agent: User-Agent,Mozilla/5.0 ...". The prefixes are stripped so every
# entry is a bare UA value.
UserAgent = [
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.0.3.4000 Chrome/47.0.2526.73 Safari/537.36',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
]
def get_part_ids(part_num: int, take: int, offset: int = 0):
@ -77,178 +62,6 @@ def format_duration(seconds):
return "00:00"
# Template request headers for the Dailymotion GraphQL API. gettoken() takes a
# deepcopy of this dict and overwrites 'authorization', 'x-dm-visit-id',
# 'x-dm-visitor-id', 'User-Agent' and 'X-DM-Preferred-Country' per call, so the
# hard-coded token below is only a placeholder/fallback.
headers1 = {
    # NOTE(review): duplicated value "*/*, */*" looks like a diff-merge artifact — confirm
    'Accept': '*/*, */*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # 'Content-Length': '8512',
    # NOTE(review): "application/json, application/json" is likewise duplicated — confirm
    'Content-Type': 'application/json, application/json',
    'Host': 'graphql.api.dailymotion.com',
    'Origin': 'https://www.dailymotion.com',
    'Referer': 'https://www.dailymotion.com/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'X-DM-AppInfo-Id': 'com.dailymotion.neon',
    'X-DM-AppInfo-Type': 'website',
    'X-DM-AppInfo-Version': 'v2025-05-26T13:45:05.666Z',
    'X-DM-Neon-SSR': '0',
    'X-DM-Preferred-Country': 'tw',
    'accept-language': 'zh-CN',
    # Placeholder bearer token; replaced by gettoken() before real use.
    'authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhaWQiOiJmMWEzNjJkMjg4YzFiOTgwOTljNyIsInJvbCI6ImNhbi1tYW5hZ2UtcGFydG5lcnMtcmVwb3J0cyBjYW4tcmVhZC12aWRlby1zdHJlYW1zIGNhbi1zcG9vZi1jb3VudHJ5IGNhbi1hZG9wdC11c2VycyBjYW4tcmVhZC1jbGFpbS1ydWxlcyBjYW4tbWFuYWdlLWNsYWltLXJ1bGVzIGNhbi1tYW5hZ2UtdXNlci1hbmFseXRpY3MgY2FuLXJlYWQtbXktdmlkZW8tc3RyZWFtcyBjYW4tZG93bmxvYWQtbXktdmlkZW9zIGFjdC1hcyBhbGxzY29wZXMgYWNjb3VudC1jcmVhdG9yIGNhbi1yZWFkLWFwcGxpY2F0aW9ucyIsInNjbyI6InJlYWQgd3JpdGUgZGVsZXRlIGVtYWlsIHVzZXJpbmZvIGZlZWQgbWFuYWdlX3ZpZGVvcyBtYW5hZ2VfY29tbWVudHMgbWFuYWdlX3BsYXlsaXN0cyBtYW5hZ2VfdGlsZXMgbWFuYWdlX3N1YnNjcmlwdGlvbnMgbWFuYWdlX2ZyaWVuZHMgbWFuYWdlX2Zhdm9yaXRlcyBtYW5hZ2VfbGlrZXMgbWFuYWdlX2dyb3VwcyBtYW5hZ2VfcmVjb3JkcyBtYW5hZ2Vfc3VidGl0bGVzIG1hbmFnZV9mZWF0dXJlcyBtYW5hZ2VfaGlzdG9yeSBpZnR0dCByZWFkX2luc2lnaHRzIG1hbmFnZV9jbGFpbV9ydWxlcyBkZWxlZ2F0ZV9hY2NvdW50X21hbmFnZW1lbnQgbWFuYWdlX2FuYWx5dGljcyBtYW5hZ2VfcGxheWVyIG1hbmFnZV9wbGF5ZXJzIG1hbmFnZV91c2VyX3NldHRpbmdzIG1hbmFnZV9jb2xsZWN0aW9ucyBtYW5hZ2VfYXBwX2Nvbm5lY3Rpb25zIG1hbmFnZV9hcHBsaWNhdGlvbnMgbWFuYWdlX2RvbWFpbnMgbWFuYWdlX3BvZGNhc3RzIiwibHRvIjoiY0c1Z1RocGRBbFIwVEVZeVhEVWNBMnNDTDFrUFFncDNRUTBNS3ciLCJhaW4iOjEsImFkZyI6MSwiaWF0IjoxNzQ4NTI0MDU5LCJleHAiOjE3NDg1NjAwMDcsImRtdiI6IjEiLCJhdHAiOiJicm93c2VyIiwiYWRhIjoid3d3LmRhaWx5bW90aW9uLmNvbSIsInZpZCI6IjY0NjMzRDAzMDY1RjQxODZBRDBCMDI3Q0Y3OTVFRjBGIiwiZnRzIjo5MTE0MSwiY2FkIjoyLCJjeHAiOjIsImNhdSI6Miwia2lkIjoiQUY4NDlERDczQTU4NjNDRDdEOTdEMEJBQjA3MjI0M0IifQ.h27sfMMETgt0xKhQvFAGIpwInouNj2sFLOeb1Y74Orc',
    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'x-dm-visit-id': '1748480937099',
    'x-dm-visitor-id': '1032a5f1-d07f-4bef-b96d-7783939abfc9',
}
_headers_cache = None  # most recently built working headers (fallback when token refresh fails)
_cache_lock = Lock()  # guards _headers_cache across worker threads
Gproxies = None  # NOTE(review): appears unused in the visible code — confirm before removing
def get_proxies(g):
    """Fetch one dynamic proxy for region *g* from the kookeey API.

    Returns a requests-style proxies dict:
        {"http": "http://user:pass@ip:port", "https": ...}

    Keeps retrying until a proxy is obtained (same contract as before), but
    iteratively: the original recursed on every failure, so a long provider
    outage grew the call stack without bound. A request timeout is also added
    so a stalled connection cannot hang the worker forever.
    """
    url = "https://www.kookeey.com/pickdynamicips"
    params = {
        "auth": "pwd",
        "format": "1",
        "n": "1",
        "p": "http",
        "gate": "sea",
        "g": g,
        "r": "0",
        "type": "json",
        "sign": "10099426b05c7119e9c4dbd6a7a0aa4e",
        "accessid": "2207189",
        "dl": ","
    }
    while True:
        try:
            response = requests.get(url, params=params, timeout=30)
        except RequestException:
            # transient network failure: brief pause, then retry
            time.sleep(1)
            continue
        try:
            proxy_data = response.json()['data'][0]
        except Exception:
            # provider returned an unexpected payload — log body and retry
            logger.exception(g)
            logger.exception("数据返回解析错误!" + str(response.text))
            time.sleep(5)
            continue
        proxies_url = f"http://{proxy_data['username']}:{proxy_data['password']}@{proxy_data['ip']}:{proxy_data['port']}"
        return {
            "http": proxies_url,
            "https": proxies_url,
        }
def post_with_retry(url, proxy_name, json_payload=None, data=None, headers=None,
                    retries=5, timeout=10, backoff_factor=2, verbose=True):
    """POST *url* through the proxy registered under *proxy_name*, retrying
    with exponential backoff.

    On a 401 the token is refreshed once via gettoken() and the attempt is
    repeated without counting it as a failure. On RequestException the token
    is refreshed once (first failure only), then remaining attempts back off
    as backoff_factor * 2**(attempt-1) seconds.

    Returns the successful Response, or None after *retries* failures.
    """
    token_refreshed = False
    for attempt in range(1, retries + 1):
        try:
            # proxy string is re-fetched each attempt so a rotated proxy is picked up
            proxy_str = db.get_proxy(proxy_name)
            proxies = {"http": proxy_str, "https": proxy_str}
            resp = requests.post(
                url,
                json=json_payload,
                data=data,
                headers=headers,
                proxies=proxies,
                timeout=timeout,
            )
            if resp.status_code == 401 and not token_refreshed:
                if verbose:
                    logger.info("[post_with_retry] 收到 401刷新 token 后重试")
                # BUG FIX: gettoken() was called with no argument here, but
                # gettoken(proxy, r=2) requires the proxy name — the 401 path
                # raised TypeError instead of refreshing the token.
                gettoken(proxy_name)
                token_refreshed = True
                continue
            resp.raise_for_status()
            return resp
        except RequestException as e:
            if verbose:
                logger.info(f"[{attempt}/{retries}] 请求失败: {e}")
            # refresh the token once on the first network-level failure
            if not token_refreshed:
                if verbose:
                    logger.info("[post_with_retry] 刷新 token 后再试")
                gettoken(proxy_name)
                token_refreshed = True
                continue
            if attempt == retries:
                if verbose:
                    logger.info(f"[post_with_retry] 最终失败:{url}")
                return None
            sleep_time = backoff_factor * (2 ** (attempt - 1))
            if verbose:
                logger.info(f"[post_with_retry] 等待 {sleep_time}s 后重试…")
            time.sleep(sleep_time)
def gettoken(proxy, r=2):
    """Fetch a fresh OAuth token from Dailymotion through *proxy* and build a
    ready-to-use headers dict from the headers1 template.

    On success the result is cached in _headers_cache (under _cache_lock).
    On failure it retries up to *r* times (5 s apart), then falls back to the
    cached headers, and finally to the bare headers1 template (no valid Auth).
    """
    global _headers_cache
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://www.dailymotion.com',
        'Pragma': 'no-cache',
        'Referer': 'https://www.dailymotion.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    u = uuid.uuid4()
    uuid_with_dash = str(u)
    # uuid_no_dash = u.hex
    traffic_segment = str(random.randint(100_000, 999_999))
    data = {
        'client_id': 'f1a362d288c1b98099c7',
        'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
        'grant_type': 'client_credentials',
        'traffic_segment': traffic_segment,
        'visitor_id': uuid_with_dash,
    }
    try:
        proxy_str = db.get_proxy(proxy)
        url = 'https://graphql.api.dailymotion.com/oauth/token'
        # timeout added so a dead proxy cannot hang the token refresh forever
        response = requests.post(url, headers=headers, data=data,
                                 proxies={"http": proxy_str, "https": proxy_str},
                                 timeout=30)
        token = response.json()['access_token']
        copy_headers = copy.deepcopy(headers1)
        copy_headers['authorization'] = "Bearer " + token
        copy_headers['x-dm-visit-id'] = str(int(time.time() * 1000))
        copy_headers['x-dm-visitor-id'] = uuid_with_dash
        # random.choice replaces the manual randint-indexing idiom
        copy_headers['User-Agent'] = random.choice(UserAgent)
        copy_headers['X-DM-Preferred-Country'] = proxy.lower()
        with _cache_lock:
            _headers_cache = copy_headers
        return copy_headers
    except Exception as e:
        # BUG FIX: the original logger.exception("[gettoken] 失败:", e) passed
        # *e* as a %-format argument with no placeholder, which made the
        # logging call itself raise a formatting error; add the %s placeholder.
        logger.exception("[gettoken] 失败: %s", e)
        if r > 0:
            time.sleep(5)
            return gettoken(proxy, r - 1)
        with _cache_lock:
            if _headers_cache:
                logger.info("[gettoken] 用缓存 headers 兜底")
                return copy.deepcopy(_headers_cache)
        # still nothing → return the template (without a valid Auth token)
        return copy.deepcopy(headers1)
def get_searchInfo(keyword, level, headers, proxy_name, r=2):
if r == 2:
logger.info(f"NET处理->{keyword},\trn->{proxy_name},\tlevel->{level}")
@ -270,13 +83,13 @@ def get_searchInfo(keyword, level, headers, proxy_name, r=2):
proxy_string = db.get_proxy(proxy_name)
logger.info(f"代理: {proxy_string}")
proxies = {
'http':proxy_string,
'https':proxy_string,
'http': proxy_string,
'https': proxy_string,
}
response = requests.get(endpoint, params=params, proxies=proxies)
jsondata = response.json()
try:
resinfo=jsondata.get("list")
resinfo = jsondata.get("list")
except Exception:
if r < 0:
logger.exception("[搜索接口] 未知:未处理", response.text)
@ -322,12 +135,11 @@ proxiesdict = db.get_proxy_agent_dict()
def search_worker(payload, kitem, flag):
try:
gproxies = proxiesdict[kitem['rn']]
header = gettoken(gproxies)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], header, gproxies)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
if not v_list:
for i in range(2):
time.sleep(i * 5)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], header, gproxies)
v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
if v_list:
break
time.sleep(2)
@ -340,9 +152,6 @@ def search_worker(payload, kitem, flag):
return False, flag, payload, kitem, [] # 失败
executor = concurrent.futures.ThreadPoolExecutor(MAX_WORKERS)
def integrate_data_parallel():
while True:
global proxiesdict
@ -401,6 +210,7 @@ def integrate_data_parallel():
db.rollback_l2(rollback[2])
time.sleep(10)
def parse_args() -> argparse.Namespace:
global MACHINE_ID, MAX_WORKERS
@ -434,6 +244,7 @@ def parse_args() -> argparse.Namespace:
if __name__ == '__main__':
    # parse_args() mutates the MACHINE_ID / MAX_WORKERS globals, so the
    # executor must be (re)built here, AFTER parsing, with the final count —
    # the module-level executor created at import time is superseded.
    parse_args()
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    start_time = datetime.datetime.now()
    logger.info(f"开始时间:{start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    # main loop: runs until the process is stopped externally
    integrate_data_parallel()