Crawler/web/cfdzp/cfd_zp.py


import requests
import pandas as pd
from datetime import datetime
import time
import random
import urllib3
from concurrent.futures import ThreadPoolExecutor, as_completed

# Disable SSL warnings (all requests below use verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
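
# ResumeAPI wraps three www.qj050.com endpoints: the /resumes listing,
# /resume/{id} detail view, and /resume/{id}/contact lookup. The hard-coded
# Authorization/Cookie tokens below carry an expiry and were presumably
# captured from a logged-in session, so they will eventually need to be
# replaced for requests to keep succeeding.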

class ResumeAPI:
    def __init__(self):
        self.base_url = 'https://www.qj050.com/api/v1'
        self.headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQyODgzNzU3LCJleHAiOjE3NzQ0MTk3NTd9.sLsOLcTnxoB0iWbks7_9IVp9OmDPlo0cKOwL6qHcID8',
            'Connection': 'keep-alive',
            'Cookie': 'token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQyODgzNzU3LCJleHAiOjE3NzQ0MTk3NTd9.sLsOLcTnxoB0iWbks7_9IVp9OmDPlo0cKOwL6qHcID8;token.sig=SiletSGnwThzp8gd2-IEaawgh0aMNhG8ZduDjcH5syA;x-trace-id=5cbbd6e2d49347e2893925bbf110eb37',
            'Host': 'www.qj050.com',
            'User-Agent': 'PostmanRuntime-ApipostRuntime/1.1.0',
            'x-platform': '1'
        }
        self.max_retries = 3
        self.retry_delay = 2
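
    # _make_request sleeps 1-3 seconds before every call and, on a
    # RequestException, retries up to max_retries times with an increasing
    # backoff of retry_delay * (attempt + 1) seconds.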

    def _make_request(self, url, params=None, method='get'):
        for attempt in range(self.max_retries):
            try:
                # Add a random delay to avoid hitting the API too frequently
                time.sleep(random.uniform(1, 3))
                # SSL verification is disabled for these requests
                response = requests.get(url, headers=self.headers, params=params, verify=False) if method == 'get' else \
                    requests.post(url, headers=self.headers, json=params, verify=False)
                response.raise_for_status()
                return response.json()
            except requests.exceptions.RequestException as e:
                if attempt == self.max_retries - 1:
                    print(f"Request failed ({url}): {str(e)}")
                    raise
                print(f"Retrying request {attempt + 1}/{self.max_retries}")
                time.sleep(self.retry_delay * (attempt + 1))

    def get_name(self, resume_id, blurred_name):
        url = f'{self.base_url}/resume/{resume_id}'
        params = {'_': int(time.time() * 1000)}
        try:
            data = self._make_request(url, params)  # full resume details can also be pulled from this response
            return data.get('data', {}).get('name', '')
        except Exception:
            # Fall back to the blurred name from the listing if the detail request fails
            return blurred_name

    def get_contact_info(self, resume_id):
        url = f'{self.base_url}/resume/{resume_id}/contact'
        params = {'_': int(time.time() * 1000)}
        try:
            data = self._make_request(url, params)
            return {
                'phone': data.get('data', {}).get('phone', ''),
                'real_name': data.get('data', {}).get('real_name', '')
            }
        except Exception:
            return {'phone': '', 'real_name': ''}

    def fetch_resumes(self, keyword='护工', page_size=10, page_index=1, save_csv=True):
        # Default keyword '护工' means "caregiver / nursing aide"
        url = f'{self.base_url}/resumes'
        params = {
            '_': int(time.time() * 1000),
            'tab': 'resume',
            'keyword': keyword,
            't': int(time.time() * 1000),
            'info_subarea': '',
            'info_category': '',
            'pageSize': page_size,
            'pageIndex': page_index,
            'showStatus': 'true'
        }
        try:
            data = self._make_request(url, params)
            items = data.get('data', {}).get('items', [])
            if not items:
                return []

            def process_resume(item):
                try:
                    resume_id = item.get('id')
                    blurred_name = item.get('name_value', '')
                    contact_info = self.get_contact_info(resume_id)
                    name_value = self.get_name(resume_id, blurred_name)
                    category_names = [c.get('name', '') for c in item.get('infoCateforyArrObj', [])]
                    categories_str = ','.join(category_names)
                    return {
                        'name_value': blurred_name,
                        'age': item.get('age', ''),
                        'edu_value': item.get('edu_value', ''),
                        'job_instant_value': item.get('job_instant_value', ''),
                        'job_salary_from': item.get('job_salary_from', ''),
                        'job_salary_to': item.get('job_salary_to', ''),
                        'categories': categories_str,
                        'phone': contact_info['phone'],
                        'real_name': name_value
                    }
                except Exception as e:
                    print(f"Failed to process resume: {str(e)}")
                    return None

            # Fetch detail and contact info for each listing concurrently
            resumes = []
            with ThreadPoolExecutor(max_workers=10) as executor:
                future_to_resume = {executor.submit(process_resume, item): item for item in items}
                for idx, future in enumerate(as_completed(future_to_resume), 1):
                    result = future.result()
                    print(f"Processed: {idx}/{len(items)}")
                    if result:
                        resumes.append(result)

            # Save to CSV (optional)
            if resumes and save_csv:
                df = pd.DataFrame(resumes)
                filename = f'resumes_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
                df.to_csv(filename, index=False, encoding='utf-8-sig')
                print(f'Data saved to {filename}')
            return resumes
        except Exception as e:
            print(f"Failed to fetch resume data: {str(e)}")
            return []


if __name__ == '__main__':
    api = ResumeAPI()
    api.fetch_resumes()
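
    # Example (sketch, not part of the original script): page through several
    # result pages without writing one CSV per page, then save a combined file.
    # The page range and output filename here are illustrative.
    #
    # all_rows = []
    # for page in range(1, 4):
    #     all_rows.extend(api.fetch_resumes(keyword='护工', page_size=20,
    #                                       page_index=page, save_csv=False))
    # if all_rows:
    #     pd.DataFrame(all_rows).to_csv('resumes_combined.csv',
    #                                   index=False, encoding='utf-8-sig')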