Crawler/web/dailymotion_com/read_excel_.py

import os
import pandas as pd

folder_path = "out_put_CNTW"
output_path = "out_put_CNTW.xlsx"

def convert_to_east8(dt):
    try:
        dt = pd.to_datetime(dt, errors='coerce', utc=True)
        return dt.tz_convert("Asia/Shanghai") if pd.notna(dt) else None
    except Exception:
        return None

def safe_format_datetime(val):
    if pd.isna(val):
        return ""
    try:
        return val.tz_localize(None).strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        return ""

def format_duration(seconds):
    try:
        seconds = int(seconds)
        minutes = seconds // 60
        remain = seconds % 60
        return f"{minutes}:{remain:02d}"
    except Exception:
        return ""

merged_list = []

for file in os.listdir(folder_path):
    if file.endswith(".xlsx"):
        path = os.path.join(folder_path, file)
        try:
            video_df = pd.read_excel(path, sheet_name="视频信息")
            user_df = pd.read_excel(path, sheet_name="用户信息")

            # 字段改名 & 添加来源 & 类型标记
            video_df = video_df.rename(columns={"xid": "v_xid"})
            user_df = user_df.rename(columns={"xid": "u_xid"})

            video_df["xid"] = video_df["v_xid"]
            user_df["xid"] = user_df["u_xid"]

            video_df["来源文件"] = file
            user_df["来源文件"] = file

            video_df["数据类型"] = "视频"
            user_df["数据类型"] = "用户"

            # 视频专属字段处理
            video_df["添加时间"] = video_df["添加时间"].apply(convert_to_east8)
            video_df["添加时间"] = video_df["添加时间"].apply(safe_format_datetime)
            video_df["时长 (秒)"] = video_df["时长 (秒)"].apply(format_duration)

            # 统一添加空字段
            for col in ["操作员", "是否盗版", "投诉时间", "历史状态", "是否重复", "重复对象"]:
                video_df[col] = ""
                user_df[col] = ""

            # 合并当前文件的视频+用户，并按 Index 排序
            combined = pd.concat([video_df, user_df], ignore_index=True, sort=False)
            if "Index" in combined.columns:
                combined = combined.sort_values(by="Index", ignore_index=True)

            merged_list.append(combined)

        except Exception as e:
            print(f"❌ 读取失败: {file} 错误: {e}")

merged_df = pd.concat(merged_list, ignore_index=True, sort=False)

# 添加序号
merged_df.insert(0, "序号", range(1, len(merged_df) + 1))

# ========= 重复识别 ==========
if "xid" in merged_df.columns:
    xid_rows = merged_df[merged_df["xid"].notna()]
    for xid_val, group in xid_rows.groupby("xid"):
        if len(group) > 1:
            serials = group["序号"].tolist()
            for idx, row in group.iterrows():
                others = [str(s) for s in serials if s != row["序号"]]
                merged_df.at[idx, "是否重复"] = "√"
                merged_df.at[idx, "重复对象"] = ",".join(others)

# 删除 v_xid/u_xid 列（可选）
merged_df.drop(columns=["v_xid", "u_xid"], inplace=True, errors='ignore')

# ========= 导出结果 ==========
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    merged_df.to_excel(writer, sheet_name="合并信息", index=False)

print(f"✅ 合并完成，保存为：{output_path}")