diff --git a/mysql_to_xlsx.py b/mysql_to_xlsx.py
index 1b61078..0a55ed0 100644
--- a/mysql_to_xlsx.py
+++ b/mysql_to_xlsx.py
@@ -19,6 +19,44 @@ URL = (
 
 engine = create_engine(URL, pool_pre_ping=True)
 
+# Refresh is_repeat for the given batches before exporting.
+def update_is_repeat(batches: list[int]):
+    """Recompute ``is_repeat`` for every ``sh_dm_video_op_v2`` row in *batches*.
+
+    Rows whose ``v_xid`` occurs exactly once within the batches are set to 1;
+    rows whose ``v_xid`` occurs more than once are set to 2.
+
+    Args:
+        batches: Batch ids to process. An empty list is a no-op.
+    """
+    if not batches:
+        # Nothing to update, and an empty IN list is not valid SQL.
+        return
+
+    # Imported here because the module's import block is outside this patch.
+    from sqlalchemy import bindparam
+
+    # An expanding bind parameter renders one placeholder per batch id, so
+    # the IN clause does not rely on driver-specific tuple escaping.
+    sql = text("""
+        UPDATE sh_dm_video_op_v2 AS op
+        JOIN (
+            SELECT v_xid, COUNT(*) AS cnt
+            FROM sh_dm_video_op_v2
+            WHERE batch IN :batches
+            GROUP BY v_xid
+        ) AS agg
+        ON op.v_xid = agg.v_xid
+        SET op.is_repeat = CASE
+            WHEN agg.cnt = 1 THEN 1
+            ELSE 2
+        END
+        WHERE op.batch IN :batches;
+    """).bindparams(bindparam("batches", expanding=True))
+    with engine.begin() as conn:
+        conn.execute(sql, {"batches": list(batches)})
+    print(f"已更新批次 {batches} 的 is_repeat 字段。")
+
 
 def get_rn_list() -> list[str]:
     sql = "SELECT DISTINCT rn FROM sh_dm_video_op_v2;"
@@ -27,42 +65,53 @@ def get_rn_list() -> list[str]:
         return [row[0] for row in result]
 
 
-def fetch_all_data_for_rn(rn: str) -> pd.DataFrame:
-    sql = """
-    SELECT
-        op.id AS ID,
-        v.v_name AS 片名,
-        v.link AS 视频连接,
-        v.is_piracy AS 是否盗版,
-        op.`level` AS 优先级,
-        op.rn AS 地区,
-        NULL AS 投诉日期,
-        NULL AS 下线日期,
-        op.keyword AS 关键词,
-        v.title AS 标题,
-        v.duration AS 时长,
-        v.watch_number AS 观看数量,
-        v.public_time AS 上传时间,
-        v.u_pic AS 头像,
-        v.is_repeat AS 是否重复,
-        op.sort AS 排序,
-        op.batch AS 批次,
-        op.machine AS 机器号,
-        v.u_id AS 用户id,
-        v.u_xid AS u_xid,
-        v.u_name AS 用户名称
-    FROM sh_dm_video_op_v2 AS op
-    LEFT JOIN sh_dm_video_v2 AS v
-        ON op.v_xid = v.v_xid
-    WHERE op.rn = %s
-        AND op.batch IN (1747324254, 1747323990)
-    ORDER BY op.id
-    """
-    # 注意:params 用列表或元组
+def fetch_all_data_for_rn(rn: str, batches: list[int]) -> pd.DataFrame:
+    """Query the export columns for region *rn*, limited to *batches*.
+
+    Rows come from ``sh_dm_video_op_v2`` left-joined to ``sh_dm_video_v2``
+    on ``v_xid``, ordered by ``op.id``, and are read in chunks of 10000.
+    """
+    # Imported here because the module's import block is outside this patch.
+    from sqlalchemy import bindparam
+
+    # NOTE(review): update_is_repeat() writes op.is_repeat, but this query
+    # exports v.is_repeat -- confirm which column is intended.
+    sql = text(
+        """
+        SELECT
+            op.id AS ID,
+            v.v_name AS 片名,
+            v.link AS 视频连接,
+            v.is_piracy AS 是否盗版,
+            op.`level` AS 优先级,
+            op.rn AS 地区,
+            NULL AS 投诉日期,
+            NULL AS 下线日期,
+            op.keyword AS 关键词,
+            v.title AS 标题,
+            v.duration AS 时长,
+            v.watch_number AS 观看数量,
+            v.public_time AS 上传时间,
+            v.u_pic AS 头像,
+            v.is_repeat AS 是否重复,
+            op.sort AS 排序,
+            op.batch AS 批次,
+            op.machine AS 机器号,
+            v.u_id AS 用户id,
+            v.u_xid AS u_xid,
+            v.u_name AS 用户名称
+        FROM sh_dm_video_op_v2 AS op
+        LEFT JOIN sh_dm_video_v2 AS v
+            ON op.v_xid = v.v_xid
+        WHERE op.rn = :rn
+            AND op.batch IN :batches
+        ORDER BY op.id
+        """
+    ).bindparams(bindparam("batches", expanding=True))
     chunks = pd.read_sql_query(
         sql,
         engine,
-        params=(rn,),
+        params={"rn": rn, "batches": list(batches)},
         chunksize=10000
     )
     dfs = []
@@ -75,11 +124,16 @@ def fetch_all_data_for_rn(rn: str) -> pd.DataFrame:
 
 
 def export_all():
+    # Batches to process.
+    batches = [1747324254, 1747323990]
+    # Refresh is_repeat before exporting.
+    update_is_repeat(batches)
+
     rn_list = get_rn_list()
     timestamp = datetime.now().strftime("%Y%m%d")
     for rn in rn_list:
         print(f"开始处理地区:{rn}")
-        df = fetch_all_data_for_rn(rn)
+        df = fetch_all_data_for_rn(rn, batches)
         if df.empty:
             print(f"[{rn}] 无数据,跳过导出")
             continue