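"""Merge per-batch exports under out_put_CNTW/ into a single workbook.

Each source .xlsx is expected to contain a "视频信息" (video) sheet and a
"用户信息" (user) sheet. The script renames their xid columns, tags every row
with its source file and record type, converts video timestamps to UTC+8,
formats durations as M:SS, flags rows that share an xid as duplicates, and
writes everything to out_put_CNTW.xlsx in a "合并信息" sheet.
"""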
import os

import pandas as pd

# Input folder with the per-batch .xlsx exports, and the merged output workbook.
folder_path = "out_put_CNTW"
output_path = "out_put_CNTW.xlsx"
def convert_to_east8(dt):
    """Parse a value as a timestamp and convert it to UTC+8 (Asia/Shanghai); return None on failure."""
    try:
        dt = pd.to_datetime(dt, errors='coerce', utc=True)
        return dt.tz_convert("Asia/Shanghai") if pd.notna(dt) else None
    except Exception:
        return None


def safe_format_datetime(val):
    """Format a tz-aware timestamp as 'YYYY-MM-DD HH:MM:SS'; return '' if missing or invalid."""
    if pd.isna(val):
        return ""
    try:
        return val.tz_localize(None).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return ""


def format_duration(seconds):
    """Render a duration in seconds as 'M:SS'; return '' if the value is not a number."""
    try:
        seconds = int(seconds)
        minutes = seconds // 60
        remain = seconds % 60
        return f"{minutes}:{remain:02d}"
    except Exception:
        return ""
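# Illustrative behaviour of the helpers above (example values, not taken from the data):
#   convert_to_east8("2024-01-01 00:00:00")   -> Timestamp("2024-01-01 08:00:00+08:00")
#   safe_format_datetime(convert_to_east8("2024-01-01 00:00:00")) -> "2024-01-01 08:00:00"
#   format_duration(125)                      -> "2:05"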
merged_list = []

for file in os.listdir(folder_path):
    if file.endswith(".xlsx"):
        path = os.path.join(folder_path, file)
        try:
            video_df = pd.read_excel(path, sheet_name="视频信息")
            user_df = pd.read_excel(path, sheet_name="用户信息")

            # Rename the id column, keep a shared "xid" copy, and tag the
            # source file and record type.
            video_df = video_df.rename(columns={"xid": "v_xid"})
            user_df = user_df.rename(columns={"xid": "u_xid"})

            video_df["xid"] = video_df["v_xid"]
            user_df["xid"] = user_df["u_xid"]

            video_df["来源文件"] = file
            user_df["来源文件"] = file

            video_df["数据类型"] = "视频"
            user_df["数据类型"] = "用户"

            # Video-only fields: convert 添加时间 to UTC+8 and format it,
            # then render 时长 (秒) as M:SS.
            video_df["添加时间"] = video_df["添加时间"].apply(convert_to_east8)
            video_df["添加时间"] = video_df["添加时间"].apply(safe_format_datetime)
            video_df["时长 (秒)"] = video_df["时长 (秒)"].apply(format_duration)

            # Add the same empty placeholder columns to both sheets.
            for col in ["操作员", "是否盗版", "投诉时间", "历史状态", "是否重复", "重复对象"]:
                video_df[col] = ""
                user_df[col] = ""

            # Merge this file's video + user rows and sort by Index if present.
            combined = pd.concat([video_df, user_df], ignore_index=True, sort=False)
            if "Index" in combined.columns:
                combined = combined.sort_values(by="Index", ignore_index=True)

            merged_list.append(combined)

        except Exception as e:
            print(f"❌ Failed to read {file}: {e}")
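
# Guard: pd.concat raises ValueError on an empty list, e.g. when the folder has
# no .xlsx files or every file failed to load, so stop with a clear message instead.
if not merged_list:
    raise SystemExit(f"No .xlsx files were merged from {folder_path}")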

merged_df = pd.concat(merged_list, ignore_index=True, sort=False)

# Add a serial-number column (序号); it is referenced below when cross-linking duplicates.
merged_df.insert(0, "序号", range(1, len(merged_df) + 1))

# ========= Duplicate detection ==========
# Rows (video or user) that share the same xid are marked with "√" and given the
# serial numbers of the other rows carrying that xid.
if "xid" in merged_df.columns:
    xid_rows = merged_df[merged_df["xid"].notna()]
    for xid_val, group in xid_rows.groupby("xid"):
        if len(group) > 1:
            serials = group["序号"].tolist()
            for idx, row in group.iterrows():
                others = [str(s) for s in serials if s != row["序号"]]
                merged_df.at[idx, "是否重复"] = "√"
                merged_df.at[idx, "重复对象"] = ",".join(others)
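# Illustrative outcome (hypothetical serial numbers): if rows 3 and 17 share an xid,
# row 3 gets 是否重复 = "√" and 重复对象 = "17", and row 17 gets 重复对象 = "3".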

# Drop the intermediate v_xid/u_xid columns (optional)
merged_df.drop(columns=["v_xid", "u_xid"], inplace=True, errors='ignore')

# ========= Export the result ==========
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    merged_df.to_excel(writer, sheet_name="合并信息", index=False)

print(f"✅ Merge complete, saved to: {output_path}")