feat: introduce a custom logger and replace print statements with logger calls

晓丰 committed 2025-05-28 15:02:57 +08:00
parent 68d79c84b0
commit b4db51a8e7
3 changed files with 110 additions and 50 deletions

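Note on the pattern: the mechanical part of the change is print(...) → logger.info(...), but the two APIs format differently. print joins any number of positional arguments, while logging treats extra positional arguments as %-style format parameters, so print-style calls such as print("[Redis pop error]", e) need either a %s placeholder or an f-string once converted. A minimal sketch (assuming the logger module added by this commit is importable):

    from logger import logger  # module added by this commit

    try:
        raise ConnectionError("demo")
    except ConnectionError as e:
        # print("[Redis pop error]", e) accepted two arguments; logging would
        # treat the second one as a %-format parameter, so use a placeholder
        # or an f-string instead:
        logger.info("[Redis pop error] %s", e)   # lazy %-style formatting
        logger.info(f"[Redis pop error] {e}")    # eager f-string, as used below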
DB.py — 66 changed lines

@@ -11,6 +11,8 @@ from sqlalchemy import (
     BigInteger, Integer, String, Text
 )
 from sqlalchemy.dialects.mysql import insert as mysql_insert
+from logger import logger

 MYSQL_URL = (
     "mysql+pymysql://db_vidcon:rexdK4fhCCiRE4BZ"
@@ -80,11 +82,11 @@ def mysql_retry(max_retries: int = 3, base_delay: float = 2.0):
                 return fn(self, *args, **kwargs)
             except pymysql.InterfaceError as e:
                 wait = base_delay * (2 ** (attempt - 1))
-                print(f"[MySQL][{fn.__name__}] attempt {attempt}: InterfaceError {e}; waiting {wait:.1f}s before reconnecting…")
+                logger.info(f"[MySQL][{fn.__name__}] attempt {attempt}: InterfaceError {e}; waiting {wait:.1f}s before reconnecting…")
                 time.sleep(wait)
                 self._reconnect_mysql()
                 if attempt == max_retries:
-                    print("[MySQL] still failing after retries; re-raising")
+                    logger.info("[MySQL] still failing after retries; re-raising")
                     raise
         return wrapper
@@ -104,12 +106,12 @@ def redis_retry(max_retries: int = 3):
             try:
                 return fn(self, *args, **kwargs)
             except (ConnectionError, TimeoutError) as e:
-                print(f"[Redis][{fn.__name__}] attempt {attempt} failed: {e}")
+                logger.info(f"[Redis][{fn.__name__}] attempt {attempt} failed: {e}")
                 self.reconnect_redis()
                 if attempt == max_retries:
-                    print("[Redis] connection failed for good")
+                    logger.info("[Redis] connection failed for good")
                     raise
-                print(f"[Redis] retry {attempt + 1} after reconnect…")
+                logger.info(f"[Redis] retry {attempt + 1} after reconnect…")
         return wrapper
@@ -151,7 +153,7 @@ class DBVidcon:
             try:
                 self._connect_redis()
             except Exception as e:
-                print("[Redis reconnect error]", e)
+                logger.info(f"[Redis reconnect error] {e}")
                 time.sleep(2)

     @redis_retry(max_retries=3)
@@ -172,7 +174,7 @@ class DBVidcon:
                     break
                 raws.append(item)
         except redis.exceptions.ConnectionError as e:
-            print("[Redis pop error]", e)
+            logger.info(f"[Redis pop error] {e}")
             self.reconnect_redis()
             return []
         if not raws:
@@ -193,7 +195,7 @@ class DBVidcon:
         if isinstance(raws, str):
             raws = [raws]
         self.redis.lpush(self.l0_list_key, *raws)
-        print(f"[l0 write] pushed {len(raws)} item(s)")
+        logger.info(f"[l0 write] pushed {len(raws)} item(s)")

     @redis_retry(max_retries=3)
     def push_l1(self, payloads):
@@ -201,7 +203,7 @@ class DBVidcon:
         if isinstance(payloads, str):
             payloads = [payloads]
         self.redis.rpush(self.l1_list_key, *payloads)
-        print(f"[l1 write] pushed {len(payloads)} item(s)")
+        logger.info(f"[l1 write] pushed {len(payloads)} item(s)")

     @redis_retry(max_retries=3)
     def push_l2(self, raws):
@@ -209,7 +211,7 @@ class DBVidcon:
         if isinstance(raws, str):
             raws = [raws]
         self.redis.lpush(self.l2_list_key, *raws)
-        print(f"[l2 write] pushed {len(raws)} item(s)")
+        logger.info(f"[l2 write] pushed {len(raws)} item(s)")

     @mysql_retry()
     def get_proxy_agent_dict(self) -> dict:
@@ -224,7 +226,7 @@ class DBVidcon:
         sql = "SELECT parameter FROM proxy_agent WHERE rn = %s LIMIT 1"
         self.cursor.execute(sql, (rn,))
         result = self.cursor.fetchone()
-        print(result)
+        logger.info(result)
         return result['parameter'] if result else None

     @redis_retry(max_retries=3)
@@ -232,7 +234,7 @@ class DBVidcon:
         try:
             items = self.fetch_from_redis(count, list_key=self.l0_list_key)
         except Exception as e:
-            print("[Redis l0 pop error]", e)
+            logger.info(f"[Redis l0 pop error] {e}")
             self.reconnect_redis()
             items = []
@@ -241,7 +243,7 @@ class DBVidcon:
         try:
             items = self.fetch_from_redis(count, list_key=self.l1_list_key)
         except Exception as e:
-            print("[Redis l1 pop error]", e)
+            logger.info(f"[Redis l1 pop error] {e}")
             self.reconnect_redis()
             items = []
@@ -250,7 +252,7 @@ class DBVidcon:
         try:
             items = self.fetch_from_redis(count, list_key=self.l2_list_key)
         except Exception as e:
-            print("[Redis l2 pop error]", e)
+            logger.info(f"[Redis l2 pop error] {e}")
             self.reconnect_redis()
             items = []
         return items, 99
@@ -260,25 +262,25 @@ class DBVidcon:
         if isinstance(payloads, str):
             payloads = [payloads]
         self.redis.rpush(self.l1_list_key, *payloads)
-        print(f"[l1 rollback] returned {len(payloads)} item(s)")
+        logger.info(f"[l1 rollback] returned {len(payloads)} item(s)")

     @redis_retry(max_retries=3)
     def rollback_l0(self, raws):
         if isinstance(raws, str):
             raws = [raws]
         self.redis.lpush(self.l0_list_key, *raws)
-        print(f"[l0 rollback] returned {len(raws)} item(s)")
+        logger.info(f"[l0 rollback] returned {len(raws)} item(s)")

     @redis_retry(max_retries=3)
     def rollback_l2(self, raws):
         if isinstance(raws, str):
             raws = [raws]
         self.redis.lpush(self.l2_list_key, *raws)
-        print(f"[l2 rollback] returned {len(raws)} item(s)")
+        logger.info(f"[l2 rollback] returned {len(raws)} item(s)")

     @mysql_retry()
     def upsert_video(self, data: dict):
-        print(f"DB processing -> {data.get('v_xid')},\tlevel -> {data.get('level')}")
+        logger.info(f"DB processing -> {data.get('v_xid')},\tlevel -> {data.get('level')}")
         data.setdefault("a_id", 0)
         data.setdefault("history_status", "")
         data.setdefault("is_piracy", 3)
@@ -331,18 +333,18 @@ class DBVidcon:
                 # roll back the uncommitted changes
                 self.conn.rollback()
-                print("[DB write error]", str(e))
-                print("[offending data]:", data)
+                logger.info(f"[DB write error] {e}")
+                logger.info(f"[offending data]: {data}")
                 if attempt < max_retries:
                     attempt += 1
-                    print(f"retry #{attempt + 1}…")
+                    logger.info(f"retry #{attempt + 1}…")
                     continue
                 else:
                     # still failing after the retries; push to Redis as a fallback
-                    print("retries exhausted; writing the data to Redis for later processing")
+                    logger.info("retries exhausted; writing the data to Redis for later processing")
                     self.push_record(data)
-                    print("[handed off to Redis]")
+                    logger.info("[handed off to Redis]")
                     break
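The hunk above implements a write-fail-then-requeue pattern: after the MySQL retries are exhausted, the payload is parked in Redis for a later replay. A minimal sketch of the same idea, with hypothetical names (write_to_mysql and the video_fallback key are stand-ins, not part of this codebase):

    import json
    import redis
    from logger import logger  # module added by this commit

    r = redis.Redis()

    def write_to_mysql(row: dict) -> None:
        raise NotImplementedError  # stand-in for the real upsert

    def save_with_fallback(row: dict, max_retries: int = 3) -> None:
        for attempt in range(1, max_retries + 1):
            try:
                write_to_mysql(row)
                return
            except Exception as e:
                logger.info(f"[DB write error] attempt {attempt}: {e}")
        # retries exhausted: park the payload in Redis so a later pass can replay it
        r.lpush("video_fallback", json.dumps(row, ensure_ascii=False))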
     @mysql_retry()
@@ -395,7 +397,7 @@ class DBVidcon:
             self.conn.commit()
             return self.cursor.lastrowid
         except Exception as e:
-            print(f"[log_batch_start] insert failed: {e}")
+            logger.info(f"[log_batch_start] insert failed: {e}")
             return None

     @mysql_retry()
@@ -418,7 +420,7 @@ class DBVidcon:
             else:
                 self.redis.connection_pool.disconnect()
         except Exception as e:
-            print("[Redis close error]", e)
+            logger.info(f"[Redis close error] {e}")

     @redis_retry(max_retries=3)
     def get_proxy(self, region_code: str) -> str:
@@ -481,7 +483,7 @@ class DBSA:
     _lock = threading.Lock()

     push_record_many = staticmethod(
-        lambda rows: print("[returned to Redis] cnt=", len(rows))
+        lambda rows: logger.info(f"[returned to Redis] cnt={len(rows)}")
     )

     @classmethod
@@ -516,7 +518,7 @@ class DBSA:
         cls._buf_vid.append(vid_row)
         cls._buf_payload.append(data)  # keep the original payload
         buf_len = len(cls._buf_vid)
-        print(f"DB buffer -> xid={data['v_xid']}, level={data['level']}, buffer={buf_len}")
+        logger.info(f"DB buffer -> xid={data['v_xid']}, level={data['level']}, buffer={buf_len}")

         need_flush = False
         flush_reason = ""
@@ -528,7 +530,7 @@ class DBSA:
             flush_reason = "TIME"

         if need_flush:
-            print(f"DBSA flushing ({flush_reason}) ...")
+            logger.info(f"DBSA flushing ({flush_reason}) ...")
             cls.flush()

     @classmethod
@@ -577,20 +579,20 @@ class DBSA:
         try:
             cls._bulk_insert(op_rows)
             cls._bulk_upsert(vid_rows)
-            print(f"[DBSA] done op={len(op_rows)} video={len(vid_rows)} time={time.time() - start:.3f}s")
+            logger.info(f"[DBSA] done op={len(op_rows)} video={len(vid_rows)} time={time.time() - start:.3f}s")
         except Exception as e:
-            print(f"[DBSA] flush FAIL: {e} op={len(op_rows)} video={len(vid_rows)}")
+            logger.info(f"[DBSA] flush FAIL: {e} op={len(op_rows)} video={len(vid_rows)}")
             # requeue the original payloads in bulk (they carry the fullest set of fields)
             try:
                 cls.push_record_many(payloads)
             except Exception as re:
-                print("[Redis requeue failed]", re)
+                logger.info(f"[Redis requeue failed] {re}")

     @classmethod
     def update_video_stats(cls, locator: dict, stats: dict) -> int:
         """
         Immediately update the stats columns of the sh_dm_video_v2 table.
         :param locator: dict used to locate the row; must contain v_xid and rn
         :param stats: stats columns to update, e.g. {"fans": 633, "videos": 10090, "view": 1678408}
         :return: number of affected rows
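A usage sketch based on the docstring above (the v_xid and rn values are illustrative, not real rows):

    from DB import DBSA
    from logger import logger

    affected = DBSA.update_video_stats(
        locator={"v_xid": "x8example", "rn": "US"},              # hypothetical row
        stats={"fans": 633, "videos": 10090, "view": 1678408},   # example from the docstring
    )
    logger.info(f"update_video_stats affected {affected} row(s)")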

logger.py — 57 lines (new file)

@@ -0,0 +1,57 @@
import logging
import os
from datetime import datetime
from logging.handlers import TimedRotatingFileHandler


class CustomHourlyHandler(TimedRotatingFileHandler):
    def __init__(self, base_filename, level=logging.INFO, is_error=False):
        # work out the target path
        self.is_error = is_error
        self.base_filename = base_filename
        self.log_dir = self._get_log_dir()
        filename = self._build_log_path()

        # initialise the underlying handler
        super().__init__(
            filename,
            when='H',
            interval=1,
            backupCount=336,  # 14 days * 24 hours = 336 hourly logs
            encoding='utf-8',
            utc=False
        )
        self.setLevel(level)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        self.setFormatter(formatter)

    def _get_log_dir(self):
        today = datetime.now().strftime("%Y-%m-%d")
        log_dir = os.path.join("logs", today)
        os.makedirs(log_dir, exist_ok=True)
        return log_dir

    def _build_log_path(self):
        hour = datetime.now().strftime("%H")
        suffix = "err.log" if self.is_error else "app.log"
        return os.path.join(self.log_dir, f"{hour}_{suffix}")

    def shouldRollover(self, record):
        # roll over hourly and retarget the file path
        result = super().shouldRollover(record)
        if result:
            self.log_dir = self._get_log_dir()  # recompute so the date folder also advances past midnight
            self.baseFilename = os.path.abspath(self._build_log_path())
        return result


logger = logging.getLogger("DailyMotion")
logger.setLevel(logging.DEBUG)

if not logger.handlers:
    logger.addHandler(CustomHourlyHandler("app", level=logging.DEBUG, is_error=False))
    logger.addHandler(CustomHourlyHandler("err", level=logging.ERROR, is_error=True))

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(ch)
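Given the handler setup above, importing the shared logger writes DEBUG-and-above records to logs/<YYYY-MM-DD>/<HH>_app.log, mirrors ERROR-and-above into logs/<YYYY-MM-DD>/<HH>_err.log, and echoes everything to the console. A minimal usage sketch:

    from logger import logger

    logger.debug("goes to the hourly app log and the console")
    logger.info("normal progress message")
    logger.error("additionally copied into the hourly err log")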

main.py — 37 changed lines

@@ -14,6 +14,7 @@ from dateutil import parser as date_parser
 import copy
 from threading import Lock
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from logger import logger

 db = DBVidcon()
@@ -39,7 +40,7 @@ def clean_dash_to_zero(val):
     try:
         return int(val)
     except (ValueError, TypeError) as e:
-        print(f"[bad field] val = {val}: {str(e)}")
+        logger.info(f"[bad field] val = {val}: {str(e)}")
         return 0
@@ -48,7 +49,7 @@ def format_create_time(timestr):
         dt = date_parser.isoparse(timestr)
         return dt.strftime("%Y-%m-%d %H:%M:%S")
     except Exception as e:
-        print(f"[bad timestamp] {timestr}: {str(e)}")
+        logger.info(f"[bad timestamp] {timestr}: {str(e)}")
         return "1970-01-01 00:00:00"
@@ -114,8 +115,8 @@ def get_proxies(g):
     try:
         proxy_data = response.json()['data'][0]
     except Exception:
-        print(g)
-        print("failed to parse the proxy response! " + str(response.text))
+        logger.info(g)
+        logger.info("failed to parse the proxy response! " + str(response.text))
         time.sleep(5)
         return get_proxies(g)
     proxies_url = f"http://{proxy_data['username']}:{proxy_data['password']}@{proxy_data['ip']}:{proxy_data['port']}"
@@ -145,7 +146,7 @@ def post_with_retry(url, proxy_name, json_payload=None, data=None, headers=None,
         )
         if resp.status_code == 401 and not token_refreshed:
             if verbose:
-                print("[post_with_retry] got 401, refreshing token and retrying")
+                logger.info("[post_with_retry] got 401, refreshing token and retrying")
             gettoken()
             token_refreshed = True
             continue
@@ -155,22 +156,22 @@ def post_with_retry(url, proxy_name, json_payload=None, data=None, headers=None,
     except RequestException as e:
         if verbose:
-            print(f"[{attempt}/{retries}] request failed: {e}")
+            logger.info(f"[{attempt}/{retries}] request failed: {e}")
         # refresh the token once if we haven't yet
         if not token_refreshed:
             if verbose:
-                print("[post_with_retry] refreshing token before retrying")
+                logger.info("[post_with_retry] refreshing token before retrying")
             gettoken(proxy_name)
             token_refreshed = True
             continue
         if attempt == retries:
             if verbose:
-                print(f"[post_with_retry] final failure: {url}")
+                logger.info(f"[post_with_retry] final failure: {url}")
             return None
         sleep_time = backoff_factor * (2 ** (attempt - 1))
         if verbose:
-            print(f"[post_with_retry] waiting {sleep_time}s before retrying…")
+            logger.info(f"[post_with_retry] waiting {sleep_time}s before retrying…")
         time.sleep(sleep_time)
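For context, post_with_retry retries with exponential backoff (backoff_factor * 2^(attempt-1), i.e. 1x, 2x, 4x, …) and refreshes the token at most once per call. A hedged call sketch based only on the signature visible in the hunk header — the endpoint URL, proxy name, and the retries/backoff_factor defaults are placeholders, since the full signature is truncated here:

    url = "https://example.invalid/graphql"     # placeholder endpoint
    headers = gettoken("proxy_name_here")       # hypothetical proxy name
    resp = post_with_retry(
        url,
        proxy_name="proxy_name_here",
        json_payload={"query": "..."},
        headers=headers,
        verbose=True,
    )
    if resp is not None:
        data = resp.json()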
@@ -217,14 +218,14 @@ def gettoken(proxy, r=2):
             _headers_cache = copy_headers
             return copy_headers
     except Exception as e:
-        print("[gettoken] failed:", e)
+        logger.info(f"[gettoken] failed: {e}")
         if r > 0:
             time.sleep(5)
             return gettoken(proxy, r - 1)
         else:
             with _cache_lock:
                 if _headers_cache:
-                    print("[gettoken] falling back to cached headers")
+                    logger.info("[gettoken] falling back to cached headers")
                     return copy.deepcopy(_headers_cache)
             # still nothing → return the template (no Auth)
             return copy.deepcopy(headers1)
@@ -232,7 +233,7 @@ def gettoken(proxy, r=2):

 def get_searchInfo(keyword, level, headers, proxy_name, r=2):
     if r == 2:
-        print(f"NET processing -> {keyword},\trn -> {proxy_name},\tlevel -> {level}")
+        logger.info(f"NET processing -> {keyword},\trn -> {proxy_name},\tlevel -> {level}")
     video_list = []
     max_page = 2
     limit = 30
@@ -600,16 +601,16 @@ def get_searchInfo(keyword, level, headers, proxy_name, r=2):
         if errors or stories is None:  # errors present, or stories is null
             if r == 0:
-                print("three consecutive errors or empty results:", json.dumps(jsondata, ensure_ascii=False))
+                logger.info(f"three consecutive errors or empty results: {json.dumps(jsondata, ensure_ascii=False)}")
                 return None
             time.sleep((3 - r) * 5)
             return get_searchInfo(keyword, level, headers, proxy_name, r - 1)
         resinfo = stories["edges"]
-        print("resinfo:", len(resinfo))
+        logger.info(f"resinfo: {len(resinfo)}")
     except Exception:
         if r < 0:
-            print("[search API] unknown, unhandled response:", response.text)
-            print("failed to parse the response fields!")
+            logger.info(f"[search API] unknown, unhandled response: {response.text}")
+            logger.info("failed to parse the response fields!")
             return None
         else:
             time.sleep((3 - r) * 5)
@@ -663,7 +664,7 @@ def search_worker(payload, kitem, flag):
             v_list = []
         return True, flag, payload, kitem, v_list  # success
     except Exception as e:
-        print(f"[worker error] {kitem['keyword']}: {e}")
+        logger.info(f"[worker error] {kitem['keyword']}: {e}")
         traceback.print_exc()
         return False, flag, payload, kitem, []  # failure
@@ -762,7 +763,7 @@ def parse_args() -> argparse.Namespace:
 if __name__ == '__main__':
     parse_args()
     start_time = datetime.datetime.now()
-    print(f"start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
+    logger.info(f"start time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
     integrate_data_parallel()
     end_time = datetime.datetime.now()
     duration = end_time - start_time