refactor: clean up unused code and optimize proxy handling logic
This commit is contained in:
parent db440b420a
commit 6d8dca5e24

main.py (203 lines changed)
@@ -16,26 +16,11 @@ import copy
 from threading import Lock
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from logger import logger
-import os
-import urllib3


 db = DBVidcon()
 MACHINE_ID = None
 MAX_WORKERS = 10
 executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
-UserAgent = [
-    'User-Agent,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
-    'User-Agent,Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
-    'User-Agent,Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
-    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.0.3.4000 Chrome/47.0.2526.73 Safari/537.36',
-    'User-Agent, Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)']
-

 def get_part_ids(part_num: int, take: int, offset: int = 0):
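Note on the deletion above: the UserAgent list (and the os/urllib3 imports) had a single consumer, the gettoken() helper removed in the next hunk, which drew a random entry via UserAgent[random.randint(0, len(UserAgent) - 1)]. Several entries were also malformed, carrying the header name "User-Agent," inside the value, so it would have been sent verbatim. If rotation is ever reinstated, a sketch along these lines would avoid both problems (ua_pool is a hypothetical name; the values are taken from the deleted list):

import random

# Hypothetical pool: values only, without the stray "User-Agent," prefix
# that some of the deleted entries carried inside the string itself.
ua_pool = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
]

headers = {'User-Agent': random.choice(ua_pool)}  # equivalent to the randint indexing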
@@ -77,178 +62,6 @@ def format_duration(seconds):
     return "00:00"


-headers1 = {
-    'Accept': '*/*, */*',
-    # 'Accept-Encoding': 'gzip, deflate, br',
-    'Cache-Control': 'no-cache',
-    'Connection': 'keep-alive',
-    # 'Content-Length': '8512',
-    'Content-Type': 'application/json, application/json',
-    'Host': 'graphql.api.dailymotion.com',
-    'Origin': 'https://www.dailymotion.com',
-    'Referer': 'https://www.dailymotion.com/',
-    'Sec-Fetch-Dest': 'empty',
-    'Sec-Fetch-Mode': 'cors',
-    'Sec-Fetch-Site': 'same-site',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
-    'X-DM-AppInfo-Id': 'com.dailymotion.neon',
-    'X-DM-AppInfo-Type': 'website',
-    'X-DM-AppInfo-Version': 'v2025-05-26T13:45:05.666Z',
-    'X-DM-Neon-SSR': '0',
-    'X-DM-Preferred-Country': 'tw',
-    'accept-language': 'zh-CN',
-    'authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhaWQiOiJmMWEzNjJkMjg4YzFiOTgwOTljNyIsInJvbCI6ImNhbi1tYW5hZ2UtcGFydG5lcnMtcmVwb3J0cyBjYW4tcmVhZC12aWRlby1zdHJlYW1zIGNhbi1zcG9vZi1jb3VudHJ5IGNhbi1hZG9wdC11c2VycyBjYW4tcmVhZC1jbGFpbS1ydWxlcyBjYW4tbWFuYWdlLWNsYWltLXJ1bGVzIGNhbi1tYW5hZ2UtdXNlci1hbmFseXRpY3MgY2FuLXJlYWQtbXktdmlkZW8tc3RyZWFtcyBjYW4tZG93bmxvYWQtbXktdmlkZW9zIGFjdC1hcyBhbGxzY29wZXMgYWNjb3VudC1jcmVhdG9yIGNhbi1yZWFkLWFwcGxpY2F0aW9ucyIsInNjbyI6InJlYWQgd3JpdGUgZGVsZXRlIGVtYWlsIHVzZXJpbmZvIGZlZWQgbWFuYWdlX3ZpZGVvcyBtYW5hZ2VfY29tbWVudHMgbWFuYWdlX3BsYXlsaXN0cyBtYW5hZ2VfdGlsZXMgbWFuYWdlX3N1YnNjcmlwdGlvbnMgbWFuYWdlX2ZyaWVuZHMgbWFuYWdlX2Zhdm9yaXRlcyBtYW5hZ2VfbGlrZXMgbWFuYWdlX2dyb3VwcyBtYW5hZ2VfcmVjb3JkcyBtYW5hZ2Vfc3VidGl0bGVzIG1hbmFnZV9mZWF0dXJlcyBtYW5hZ2VfaGlzdG9yeSBpZnR0dCByZWFkX2luc2lnaHRzIG1hbmFnZV9jbGFpbV9ydWxlcyBkZWxlZ2F0ZV9hY2NvdW50X21hbmFnZW1lbnQgbWFuYWdlX2FuYWx5dGljcyBtYW5hZ2VfcGxheWVyIG1hbmFnZV9wbGF5ZXJzIG1hbmFnZV91c2VyX3NldHRpbmdzIG1hbmFnZV9jb2xsZWN0aW9ucyBtYW5hZ2VfYXBwX2Nvbm5lY3Rpb25zIG1hbmFnZV9hcHBsaWNhdGlvbnMgbWFuYWdlX2RvbWFpbnMgbWFuYWdlX3BvZGNhc3RzIiwibHRvIjoiY0c1Z1RocGRBbFIwVEVZeVhEVWNBMnNDTDFrUFFncDNRUTBNS3ciLCJhaW4iOjEsImFkZyI6MSwiaWF0IjoxNzQ4NTI0MDU5LCJleHAiOjE3NDg1NjAwMDcsImRtdiI6IjEiLCJhdHAiOiJicm93c2VyIiwiYWRhIjoid3d3LmRhaWx5bW90aW9uLmNvbSIsInZpZCI6IjY0NjMzRDAzMDY1RjQxODZBRDBCMDI3Q0Y3OTVFRjBGIiwiZnRzIjo5MTE0MSwiY2FkIjoyLCJjeHAiOjIsImNhdSI6Miwia2lkIjoiQUY4NDlERDczQTU4NjNDRDdEOTdEMEJBQjA3MjI0M0IifQ.h27sfMMETgt0xKhQvFAGIpwInouNj2sFLOeb1Y74Orc',
-    'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
-    'sec-ch-ua-mobile': '?0',
-    'sec-ch-ua-platform': '"Windows"',
-    'x-dm-visit-id': '1748480937099',
-    'x-dm-visitor-id': '1032a5f1-d07f-4bef-b96d-7783939abfc9',
-}
-_headers_cache = None  # holds the most recently successful headers
-_cache_lock = Lock()
-Gproxies = None
-
-
-def get_proxies(g):
-    url = "https://www.kookeey.com/pickdynamicips"
-    params = {
-        "auth": "pwd",
-        "format": "1",
-        "n": "1",
-        "p": "http",
-        "gate": "sea",
-        "g": g,
-        "r": "0",
-        "type": "json",
-        "sign": "10099426b05c7119e9c4dbd6a7a0aa4e",
-        "accessid": "2207189",
-        "dl": ","
-    }
-    try:
-        response = requests.get(url, params=params)
-    except RequestException:
-        return get_proxies(g)
-    try:
-        proxy_data = response.json()['data'][0]
-    except Exception:
-        logger.exception(g)
-        logger.exception("Failed to parse returned data! " + str(response.text))
-        time.sleep(5)
-        return get_proxies(g)
-    proxies_url = f"http://{proxy_data['username']}:{proxy_data['password']}@{proxy_data['ip']}:{proxy_data['port']}"
-    proxies = {
-        "http": proxies_url,
-        "https": proxies_url,
-    }
-    return proxies
-
-
-def post_with_retry(url, proxy_name, json_payload=None, data=None, headers=None,
-                    retries=5, timeout=10, backoff_factor=2, verbose=True):
-    token_refreshed = False
-    for attempt in range(1, retries + 1):
-        try:
-            proxy_str = db.get_proxy(proxy_name)
-
-            proxies = {"http": proxy_str, "https": proxy_str}
-
-            resp = requests.post(
-                url,
-                json=json_payload,
-                data=data,
-                headers=headers,
-                proxies=proxies,
-                timeout=timeout,
-            )
-            if resp.status_code == 401 and not token_refreshed:
-                if verbose:
-                    logger.info("[post_with_retry] got 401, retrying after refreshing token")
-                gettoken()
-                token_refreshed = True
-                continue
-
-            resp.raise_for_status()
-            return resp
-
-        except RequestException as e:
-            if verbose:
-                logger.info(f"[{attempt}/{retries}] request failed: {e}")
-            # refresh the token once if it has not been refreshed yet
-            if not token_refreshed:
-                if verbose:
-                    logger.info("[post_with_retry] refreshing token and retrying")
-                gettoken(proxy_name)
-                token_refreshed = True
-                continue
-            if attempt == retries:
-                if verbose:
-                    logger.info(f"[post_with_retry] final failure: {url}")
-                return None
-
-            sleep_time = backoff_factor * (2 ** (attempt - 1))
-            if verbose:
-                logger.info(f"[post_with_retry] waiting {sleep_time}s before retrying…")
-            time.sleep(sleep_time)
-
-
-def gettoken(proxy, r=2):
-    global _headers_cache
-    headers = {
-        'Accept': '*/*',
-        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-        'Cache-Control': 'no-cache',
-        'Connection': 'keep-alive',
-        'Content-Type': 'application/x-www-form-urlencoded',
-        'Origin': 'https://www.dailymotion.com',
-        'Pragma': 'no-cache',
-        'Referer': 'https://www.dailymotion.com/',
-        'Sec-Fetch-Dest': 'empty',
-        'Sec-Fetch-Mode': 'cors',
-        'Sec-Fetch-Site': 'same-site',
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
-        'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
-        'sec-ch-ua-mobile': '?0',
-        'sec-ch-ua-platform': '"Windows"',
-    }
-    u = uuid.uuid4()
-    uuid_with_dash = str(u)
-    # uuid_no_dash = u.hex
-    traffic_segment = str(random.randint(100_000, 999_999))
-    data = {
-        'client_id': 'f1a362d288c1b98099c7',
-        'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
-        'grant_type': 'client_credentials',
-        'traffic_segment': traffic_segment,
-        'visitor_id': uuid_with_dash,
-    }
-    try:
-        proxy_str = db.get_proxy(proxy)
-        url = 'https://graphql.api.dailymotion.com/oauth/token'
-        response = requests.post(url, headers=headers, data=data, proxies={"http": proxy_str, "https": proxy_str})
-        token = response.json()['access_token']
-        copy_headers = copy.deepcopy(headers1)
-        copy_headers['authorization'] = "Bearer " + token
-        copy_headers['x-dm-visit-id'] = str(int(time.time() * 1000))
-        copy_headers['x-dm-visitor-id'] = uuid_with_dash
-        copy_headers['User-Agent'] = UserAgent[random.randint(0, len(UserAgent) - 1)]
-        copy_headers['X-DM-Preferred-Country'] = proxy.lower()
-        with _cache_lock:
-            _headers_cache = copy_headers
-        return copy_headers
-    except Exception as e:
-        logger.exception("[gettoken] failed:", e)
-        if r > 0:
-            time.sleep(5)
-            return gettoken(proxy, r - 1)
-        else:
-            with _cache_lock:
-                if _headers_cache:
-                    logger.info("[gettoken] falling back to cached headers")
-                    return copy.deepcopy(_headers_cache)
-            # still nothing → return the template (without Auth)
-            return copy.deepcopy(headers1)
-
-
 def get_searchInfo(keyword, level, headers, proxy_name, r=2):
     if r == 2:
         logger.info(f"NET processing->{keyword},\trn->{proxy_name},\tlevel->{level}")
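For the record, the deleted post_with_retry combined a one-shot token refresh with exponential backoff: sleep_time = backoff_factor * (2 ** (attempt - 1)), so the defaults (retries=5, backoff_factor=2) waited 2, 4, 8 and 16 seconds between the five attempts. A minimal self-contained sketch of just the backoff loop (token refresh omitted; post_with_backoff is a hypothetical name, not code from main.py):

import time

import requests
from requests import RequestException


def post_with_backoff(url, retries=5, backoff_factor=2, **kwargs):
    # Retry a POST with exponentially growing pauses, as the deleted
    # helper did; returns None after the final failed attempt.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.post(url, timeout=10, **kwargs)
            resp.raise_for_status()
            return resp
        except RequestException:
            if attempt == retries:
                return None
            time.sleep(backoff_factor * (2 ** (attempt - 1)))  # 2, 4, 8, 16 s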
@@ -270,13 +83,13 @@ def get_searchInfo(keyword, level, headers, proxy_name, r=2):
     proxy_string = db.get_proxy(proxy_name)
     logger.info(f"Proxy: {proxy_string}")
     proxies = {
-        'http':proxy_string,
-        'https':proxy_string,
+        'http': proxy_string,
+        'https': proxy_string,
     }
     response = requests.get(endpoint, params=params, proxies=proxies)
     jsondata = response.json()
     try:
-        resinfo=jsondata.get("list")
+        resinfo = jsondata.get("list")
     except Exception:
         if r < 0:
             logger.exception("[search API] unknown: unhandled", response.text)
@@ -322,12 +135,11 @@ proxiesdict = db.get_proxy_agent_dict()
 def search_worker(payload, kitem, flag):
     try:
         gproxies = proxiesdict[kitem['rn']]
-        header = gettoken(gproxies)
-        v_list = get_searchInfo(kitem['keyword'], kitem['level'], header, gproxies)
+        v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
         if not v_list:
             for i in range(2):
                 time.sleep(i * 5)
-                v_list = get_searchInfo(kitem['keyword'], kitem['level'], header, gproxies)
+                v_list = get_searchInfo(kitem['keyword'], kitem['level'], None, gproxies)
                 if v_list:
                     break
             time.sleep(2)
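Dropping the gettoken() call and passing None is consistent with the hunk at -270 above: get_searchInfo resolves its proxy itself through db.get_proxy(proxy_name) and issues requests.get(endpoint, params=params, proxies=proxies) without ever forwarding the headers argument, so the parameter is vestigial after this commit. Assuming that reading is right, the effective request path reduces to something like this sketch (fetch_search_list, the endpoint URL and the params dict are illustrative placeholders, not code from main.py):

import requests


def fetch_search_list(keyword, proxy_string):
    # Plain unauthenticated GET, mirroring the surviving code path in
    # get_searchInfo; endpoint and params are assumed placeholders.
    endpoint = 'https://api.dailymotion.com/videos'
    params = {'search': keyword}
    proxies = {'http': proxy_string, 'https': proxy_string}
    response = requests.get(endpoint, params=params, proxies=proxies)
    return response.json().get('list')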
@@ -340,9 +152,6 @@ def search_worker(payload, kitem, flag):
         return False, flag, payload, kitem, []  # failure


-executor = concurrent.futures.ThreadPoolExecutor(MAX_WORKERS)
-
-
 def integrate_data_parallel():
     while True:
         global proxiesdict
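The executor removed here was the second pool bound to the same name: the module already builds executor = ThreadPoolExecutor(max_workers=MAX_WORKERS) next to the imports, so this line merely rebound the name and stranded whichever pool was created first. A small standard-library-only illustration of that shadowing:

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=10)  # pool A (import time)
pool_a = executor
executor = ThreadPoolExecutor(max_workers=10)  # pool B rebinds the name

# Any work already submitted to pool A keeps running on threads that the
# rest of the code can no longer join or shut down through 'executor'.
print(pool_a is executor)  # False: two distinct pools were alive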
@@ -401,6 +210,7 @@ def integrate_data_parallel():
         db.rollback_l2(rollback[2])
         time.sleep(10)


 def parse_args() -> argparse.Namespace:
     global MACHINE_ID, MAX_WORKERS
+
@@ -434,6 +244,7 @@ def parse_args() -> argparse.Namespace:

 if __name__ == '__main__':
     parse_args()
+    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
     start_time = datetime.datetime.now()
     logger.info(f"Start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
     integrate_data_parallel()
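The added line in __main__ is the counterpart of that deletion: the module-level pool is created at import time with the default MAX_WORKERS = 10, before parse_args() (which declares global MACHINE_ID, MAX_WORKERS) can overwrite the worker count, so the executor has to be rebuilt once the arguments are known. A compact reproduction of the ordering issue; the --workers flag is a hypothetical stand-in for whatever option parse_args actually defines:

import argparse
from concurrent.futures import ThreadPoolExecutor

MAX_WORKERS = 10
executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)  # built with the default


def parse_args():
    global MAX_WORKERS
    parser = argparse.ArgumentParser()
    parser.add_argument('--workers', type=int, default=MAX_WORKERS)  # hypothetical flag
    MAX_WORKERS = parser.parse_args().workers


if __name__ == '__main__':
    parse_args()
    # Rebuild so a non-default --workers value actually takes effect;
    # the import-time pool captured max_workers before parsing.
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)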