From 90ad9c28ff41c022eb4bca696f0059d4d663a707 Mon Sep 17 00:00:00 2001 From: Franklin-F Date: Sun, 18 May 2025 23:54:14 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E6=B7=BB=E5=8A=A0=E6=9B=B4=E6=96=B0=20i?= =?UTF-8?q?s=5Frepeat=20=E5=AD=97=E6=AE=B5=E7=9A=84=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E5=B9=B6=E4=BF=AE=E6=94=B9=E6=95=B0=E6=8D=AE=E8=8E=B7=E5=8F=96?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E4=BB=A5=E6=94=AF=E6=8C=81=E6=89=B9=E6=AC=A1?= =?UTF-8?q?=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mysql_to_xlsx.py | 95 +++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 34 deletions(-) diff --git a/mysql_to_xlsx.py b/mysql_to_xlsx.py index 1b61078..0a55ed0 100644 --- a/mysql_to_xlsx.py +++ b/mysql_to_xlsx.py @@ -19,6 +19,27 @@ URL = ( engine = create_engine(URL, pool_pre_ping=True) +# 在导出前执行更新 is_repeat 的 SQL +def update_is_repeat(batches: list[int]): + sql = text(""" + UPDATE sh_dm_video_op_v2 AS op + JOIN ( + SELECT v_xid, COUNT(*) AS cnt + FROM sh_dm_video_op_v2 + WHERE batch IN :batches + GROUP BY v_xid + ) AS agg + ON op.v_xid = agg.v_xid + SET op.is_repeat = CASE + WHEN agg.cnt = 1 THEN 1 + ELSE 2 + END + WHERE op.batch IN :batches; + """) + with engine.begin() as conn: + conn.execute(sql, {"batches": tuple(batches)}) + print(f"已更新批次 {batches} 的 is_repeat 字段。") + def get_rn_list() -> list[str]: sql = "SELECT DISTINCT rn FROM sh_dm_video_op_v2;" @@ -27,42 +48,43 @@ def get_rn_list() -> list[str]: return [row[0] for row in result] -def fetch_all_data_for_rn(rn: str) -> pd.DataFrame: - sql = """ - SELECT - op.id AS ID, - v.v_name AS 片名, - v.link AS 视频连接, - v.is_piracy AS 是否盗版, - op.`level` AS 优先级, - op.rn AS 地区, - NULL AS 投诉日期, - NULL AS 下线日期, - op.keyword AS 关键词, - v.title AS 标题, - v.duration AS 时长, - v.watch_number AS 观看数量, - v.public_time AS 上传时间, - v.u_pic AS 头像, - v.is_repeat AS 是否重复, - op.sort AS 排序, - op.batch AS 批次, - op.machine AS 机器号, - v.u_id AS 用户id, - v.u_xid AS u_xid, - v.u_name AS 用户名称 - FROM sh_dm_video_op_v2 AS op - LEFT JOIN sh_dm_video_v2 AS v - ON op.v_xid = v.v_xid - WHERE op.rn = %s - AND op.batch IN (1747324254, 1747323990) - ORDER BY op.id - """ - # 注意:params 用列表或元组 +def fetch_all_data_for_rn(rn: str, batches: list[int]) -> pd.DataFrame: + sql = text( + """ + SELECT + op.id AS ID, + v.v_name AS 片名, + v.link AS 视频连接, + v.is_piracy AS 是否盗版, + op.`level` AS 优先级, + op.rn AS 地区, + NULL AS 投诉日期, + NULL AS 下线日期, + op.keyword AS 关键词, + v.title AS 标题, + v.duration AS 时长, + v.watch_number AS 观看数量, + v.public_time AS 上传时间, + v.u_pic AS 头像, + v.is_repeat AS 是否重复, + op.sort AS 排序, + op.batch AS 批次, + op.machine AS 机器号, + v.u_id AS 用户id, + v.u_xid AS u_xid, + v.u_name AS 用户名称 + FROM sh_dm_video_op_v2 AS op + LEFT JOIN sh_dm_video_v2 AS v + ON op.v_xid = v.v_xid + WHERE op.rn = :rn + AND op.batch IN :batches + ORDER BY op.id + """ + ) chunks = pd.read_sql_query( sql, engine, - params=(rn,), + params={"rn": rn, "batches": tuple(batches)}, chunksize=10000 ) dfs = [] @@ -75,11 +97,16 @@ def fetch_all_data_for_rn(rn: str) -> pd.DataFrame: def export_all(): + # 指定要处理的批次 + batches = [1747324254, 1747323990] + # 先更新 is_repeat + update_is_repeat(batches) + rn_list = get_rn_list() timestamp = datetime.now().strftime("%Y%m%d") for rn in rn_list: print(f"开始处理地区:{rn}") - df = fetch_all_data_for_rn(rn) + df = fetch_all_data_for_rn(rn, batches) if df.empty: print(f"[{rn}] 无数据,跳过导出") continue