Crawler/web/dailymotion_com/read_excel_.py

97 lines
3.3 KiB
Python

import os
import pandas as pd
folder_path = "out_put_CNTW"
output_path = "out_put_CNTW.xlsx"
def convert_to_east8(dt):
try:
dt = pd.to_datetime(dt, errors='coerce', utc=True)
return dt.tz_convert("Asia/Shanghai") if pd.notna(dt) else None
except Exception:
return None
def safe_format_datetime(val):
if pd.isna(val):
return ""
try:
return val.tz_localize(None).strftime("%Y-%m-%d %H:%M:%S")
except Exception:
return ""
def format_duration(seconds):
try:
seconds = int(seconds)
minutes = seconds // 60
remain = seconds % 60
return f"{minutes}:{remain:02d}"
except Exception:
return ""
merged_list = []
for file in os.listdir(folder_path):
if file.endswith(".xlsx"):
path = os.path.join(folder_path, file)
try:
video_df = pd.read_excel(path, sheet_name="视频信息")
user_df = pd.read_excel(path, sheet_name="用户信息")
# 字段改名 & 添加来源 & 类型标记
video_df = video_df.rename(columns={"xid": "v_xid"})
user_df = user_df.rename(columns={"xid": "u_xid"})
video_df["xid"] = video_df["v_xid"]
user_df["xid"] = user_df["u_xid"]
video_df["来源文件"] = file
user_df["来源文件"] = file
video_df["数据类型"] = "视频"
user_df["数据类型"] = "用户"
# 视频专属字段处理
video_df["添加时间"] = video_df["添加时间"].apply(convert_to_east8)
video_df["添加时间"] = video_df["添加时间"].apply(safe_format_datetime)
video_df["时长 (秒)"] = video_df["时长 (秒)"].apply(format_duration)
# 统一添加空字段
for col in ["操作员", "是否盗版", "投诉时间", "历史状态", "是否重复", "重复对象"]:
video_df[col] = ""
user_df[col] = ""
# 合并当前文件的视频+用户,并按 Index 排序
combined = pd.concat([video_df, user_df], ignore_index=True, sort=False)
if "Index" in combined.columns:
combined = combined.sort_values(by="Index", ignore_index=True)
merged_list.append(combined)
except Exception as e:
print(f"❌ 读取失败: {file} 错误: {e}")
merged_df = pd.concat(merged_list, ignore_index=True, sort=False)
# 添加序号
merged_df.insert(0, "序号", range(1, len(merged_df) + 1))
# ========= 重复识别 ==========
if "xid" in merged_df.columns:
xid_rows = merged_df[merged_df["xid"].notna()]
for xid_val, group in xid_rows.groupby("xid"):
if len(group) > 1:
serials = group["序号"].tolist()
for idx, row in group.iterrows():
others = [str(s) for s in serials if s != row["序号"]]
merged_df.at[idx, "是否重复"] = ""
merged_df.at[idx, "重复对象"] = ",".join(others)
# 删除 v_xid/u_xid 列(可选)
merged_df.drop(columns=["v_xid", "u_xid"], inplace=True, errors='ignore')
# ========= 导出结果 ==========
with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
merged_df.to_excel(writer, sheet_name="合并信息", index=False)
print(f"✅ 合并完成,保存为:{output_path}")