import os import pandas as pd folder_path = "out_put_CNTW" output_path = "out_put_CNTW.xlsx" def convert_to_east8(dt): try: dt = pd.to_datetime(dt, errors='coerce', utc=True) return dt.tz_convert("Asia/Shanghai") if pd.notna(dt) else None except Exception: return None def safe_format_datetime(val): if pd.isna(val): return "" try: return val.tz_localize(None).strftime("%Y-%m-%d %H:%M:%S") except Exception: return "" def format_duration(seconds): try: seconds = int(seconds) minutes = seconds // 60 remain = seconds % 60 return f"{minutes}:{remain:02d}" except Exception: return "" merged_list = [] for file in os.listdir(folder_path): if file.endswith(".xlsx"): path = os.path.join(folder_path, file) try: video_df = pd.read_excel(path, sheet_name="视频信息") user_df = pd.read_excel(path, sheet_name="用户信息") # 字段改名 & 添加来源 & 类型标记 video_df = video_df.rename(columns={"xid": "v_xid"}) user_df = user_df.rename(columns={"xid": "u_xid"}) video_df["xid"] = video_df["v_xid"] user_df["xid"] = user_df["u_xid"] video_df["来源文件"] = file user_df["来源文件"] = file video_df["数据类型"] = "视频" user_df["数据类型"] = "用户" # 视频专属字段处理 video_df["添加时间"] = video_df["添加时间"].apply(convert_to_east8) video_df["添加时间"] = video_df["添加时间"].apply(safe_format_datetime) video_df["时长 (秒)"] = video_df["时长 (秒)"].apply(format_duration) # 统一添加空字段 for col in ["操作员", "是否盗版", "投诉时间", "历史状态", "是否重复", "重复对象"]: video_df[col] = "" user_df[col] = "" # 合并当前文件的视频+用户,并按 Index 排序 combined = pd.concat([video_df, user_df], ignore_index=True, sort=False) if "Index" in combined.columns: combined = combined.sort_values(by="Index", ignore_index=True) merged_list.append(combined) except Exception as e: print(f"❌ 读取失败: {file} 错误: {e}") merged_df = pd.concat(merged_list, ignore_index=True, sort=False) # 添加序号 merged_df.insert(0, "序号", range(1, len(merged_df) + 1)) # ========= 重复识别 ========== if "xid" in merged_df.columns: xid_rows = merged_df[merged_df["xid"].notna()] for xid_val, group in xid_rows.groupby("xid"): if len(group) > 1: serials = group["序号"].tolist() for idx, row in group.iterrows(): others = [str(s) for s in serials if s != row["序号"]] merged_df.at[idx, "是否重复"] = "√" merged_df.at[idx, "重复对象"] = ",".join(others) # 删除 v_xid/u_xid 列(可选) merged_df.drop(columns=["v_xid", "u_xid"], inplace=True, errors='ignore') # ========= 导出结果 ========== with pd.ExcelWriter(output_path, engine="openpyxl") as writer: merged_df.to_excel(writer, sheet_name="合并信息", index=False) print(f"✅ 合并完成,保存为:{output_path}")