Crawler/web/cfdzp/main.py

from datetime import datetime

import pandas as pd

from cfd_zp import ResumeAPI


def fetch_multiple_pages(keyword, total_pages=10, page_size=10):
    api = ResumeAPI()
    all_resumes = []
    # Build a unique CSV filename for this run
    filename = f'resumes_{keyword}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    print(f"Starting collection for keyword '{keyword}'; saving to file: {filename}")
    for page in range(1, total_pages + 1):
        print(f"\nFetching page {page}/{total_pages}")
        resumes = api.fetch_resumes(
            keyword=keyword,
            page_size=page_size,
            page_index=page,
            save_csv=False  # don't write a separate CSV for every page
        )
        if resumes:
            all_resumes.extend(resumes)
            # Persist everything collected so far, so an interrupted run loses nothing
            df = pd.DataFrame(all_resumes)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Saved {len(all_resumes)} records to {filename}")
        else:
            print(f"Page {page} failed or returned no data")
    print(f"\nCollection finished; fetched {len(all_resumes)} records in total")
    return all_resumes
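
# A minimal sketch of a throttled, retrying page fetch, assuming the same
# ResumeAPI interface used above. The helper name, the `retries` count, and
# `delay_seconds` are illustrative assumptions, not part of the original
# script: pausing between attempts avoids hammering the server, and a short
# retry loop smooths over transient failures.
import time  # would normally sit with the imports at the top of the file


def fetch_page_with_retry(api, keyword, page, page_size, retries=3, delay_seconds=1.0):
    # Try a single page up to `retries` times, pausing between attempts.
    for attempt in range(1, retries + 1):
        resumes = api.fetch_resumes(
            keyword=keyword,
            page_size=page_size,
            page_index=page,
            save_csv=False
        )
        if resumes:
            return resumes
        if attempt < retries:
            print(f"Page {page}: attempt {attempt}/{retries} returned no data, retrying")
            time.sleep(delay_seconds)
    return []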

def main():
    # Search keyword and paging settings; '护工' means "caregiver"
    keyword = '护工'
    total_pages = 10
    page_size = 10
    # Start the batch collection
    fetch_multiple_pages(keyword, total_pages, page_size)


if __name__ == '__main__':
    main()