# Crawler/web/dailymotion_com/deduplicateby_xid.py
#
# 21 lines
# 731 B
# Python

import pandas as pd
# Source workbook and the destination for its deduplicated copy.
input_path = "merge.xlsx"
output_path = "xid_dedup.xlsx"

# Sheets to process (video info, user info); each is deduplicated on "xid".
SHEET_NAMES = ("视频信息", "用户信息")


def dedup_by_xid(df: pd.DataFrame, key: str = "xid") -> pd.DataFrame:
    """Return a copy of *df* keeping only the first row for each *key* value.

    Args:
        df: Table to deduplicate; it is not modified.
        key: Column whose repeated values mark a row as a duplicate.

    Returns:
        A new DataFrame with later duplicates removed, original row order
        preserved.

    Raises:
        KeyError: If *key* is not a column of *df*.
    """
    return df.drop_duplicates(subset=key, keep="first")


def main() -> None:
    """Deduplicate every sheet in SHEET_NAMES and write them to output_path."""
    # Passing a list of sheet names parses the workbook once and returns a
    # dict keyed by sheet name.
    sheets = pd.read_excel(input_path, sheet_name=list(SHEET_NAMES))

    # Deduplicate everything before opening the writer, so a failure here
    # cannot leave a half-written output file behind.
    deduped = {name: dedup_by_xid(sheets[name]) for name in SHEET_NAMES}

    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for name in SHEET_NAMES:
            deduped[name].to_excel(writer, sheet_name=name, index=False)

    # Report success only after the writer context has exited, i.e. once the
    # workbook has actually been saved.
    print(f"去重完成,结果保存为:{output_path}")


if __name__ == "__main__":
    main()