137 lines
5.8 KiB
Python
137 lines
5.8 KiB
Python
import os
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import pandas as pd
import requests
import urllib3
# Silence urllib3's InsecureRequestWarning; without this the warning is
# emitted for each request that is sent with certificate verification off.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)
|
||
class ResumeAPI:
    """Client for the qj050.com resume API.

    Wraps the list (``/resumes``), detail (``/resume/<id>``) and contact
    (``/resume/<id>/contact``) endpoints, merges their results into one row
    per resume and can export those rows to a timestamped CSV file.
    """

    # SECURITY NOTE(review): these credentials are committed to source. They
    # are kept only so that a bare ``ResumeAPI()`` behaves as before; rotate
    # them and supply replacements through the constructor arguments or the
    # RESUME_API_AUTHORIZATION / RESUME_API_COOKIE environment variables.
    _LEGACY_AUTHORIZATION = 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQyODgzNzU3LCJleHAiOjE3NzQ0MTk3NTd9.sLsOLcTnxoB0iWbks7_9IVp9OmDPlo0cKOwL6qHcID8'
    _LEGACY_COOKIE = 'token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQyODgzNzU3LCJleHAiOjE3NzQ0MTk3NTd9.sLsOLcTnxoB0iWbks7_9IVp9OmDPlo0cKOwL6qHcID8;token.sig=SiletSGnwThzp8gd2-IEaawgh0aMNhG8ZduDjcH5syA;x-trace-id=5cbbd6e2d49347e2893925bbf110eb37'

    def __init__(self, *, authorization=None, cookie=None, timeout=30,
                 verify_ssl=False, request_delay=(1, 3)):
        """Set up the endpoint, request headers and retry/throttle settings.

        Args:
            authorization: Value for the ``Authorization`` header. When
                omitted, ``$RESUME_API_AUTHORIZATION`` is used, then the
                legacy built-in value.
            cookie: Value for the ``Cookie`` header. When omitted,
                ``$RESUME_API_COOKIE`` is used, then the legacy built-in
                value.
            timeout: Timeout in seconds handed to ``requests`` for every
                call, so a stalled connection cannot block forever.
            verify_ssl: Whether TLS certificates are verified. Defaults to
                ``False`` to keep the previous behaviour.
                NOTE(review): with verification off the credentials above
                are exposed to interception; enable it if the server's
                certificate allows.
            request_delay: ``(low, high)`` bounds, in seconds, of the random
                pause taken before each request.
        """
        self.base_url = 'https://www.qj050.com/api/v1'
        self.headers = {
            'Accept': '*/*',
            # NOTE(review): advertising 'br' presumably requires a brotli
            # decoder to be installed for responses to decode; confirm.
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Authorization': (
                authorization
                or os.environ.get('RESUME_API_AUTHORIZATION')
                or self._LEGACY_AUTHORIZATION
            ),
            'Connection': 'keep-alive',
            'Cookie': (
                cookie
                or os.environ.get('RESUME_API_COOKIE')
                or self._LEGACY_COOKIE
            ),
            'Host': 'www.qj050.com',
            'User-Agent': 'PostmanRuntime-ApipostRuntime/1.1.0',
            'x-platform': '1',
        }
        self.max_retries = 3
        self.retry_delay = 2
        self.timeout = timeout
        self.verify_ssl = verify_ssl
        self.request_delay = request_delay

    def _make_request(self, url, params=None, method='get'):
        """Send one throttled request with retries and return its JSON body.

        Args:
            url: Absolute URL to call.
            params: Sent as query parameters for GET, as the JSON body
                otherwise.
            method: ``'get'`` (case-insensitive) issues a GET; any other
                value issues a POST.

        Returns:
            The decoded JSON payload of the response.

        Raises:
            requests.exceptions.RequestException: If the last attempt fails
                (connection error, timeout or non-2xx status).
        """
        use_get = str(method).lower() == 'get'
        # Always try at least once, even if max_retries was set to 0 or less.
        attempts = max(1, self.max_retries)
        low, high = self.request_delay

        for attempt in range(attempts):
            try:
                # Random pause so consecutive calls are spread out.
                time.sleep(random.uniform(low, high))

                if use_get:
                    response = requests.get(
                        url,
                        headers=self.headers,
                        params=params,
                        timeout=self.timeout,
                        verify=self.verify_ssl,
                    )
                else:
                    response = requests.post(
                        url,
                        headers=self.headers,
                        json=params,
                        timeout=self.timeout,
                        verify=self.verify_ssl,
                    )
                response.raise_for_status()
                return response.json()
            except requests.exceptions.RequestException as e:
                if attempt == attempts - 1:
                    print(f"请求失败 ({url}): {str(e)}")
                    raise
                print(f"重试请求 {attempt + 1}/{attempts}")
                # Linear back-off: wait longer after each failed attempt.
                time.sleep(self.retry_delay * (attempt + 1))

    def get_name(self, resume_id, blurred_name):
        """Return the name from the resume detail endpoint.

        Args:
            resume_id: Identifier used to build the detail URL.
            blurred_name: Fallback returned when the request fails or the
                response carries no name.

        Returns:
            The ``name`` field of the detail response, or ``blurred_name``.
        """
        url = f'{self.base_url}/resume/{resume_id}'
        params = {'_': int(time.time() * 1000)}
        try:
            # Further detail fields can be read from this response later.
            data = self._make_request(url, params)
            detail = data.get('data') or {}
            return detail.get('name') or blurred_name
        except Exception:
            # Best effort: any failure degrades to the blurred name.
            return blurred_name

    def get_contact_info(self, resume_id):
        """Return the phone number and real name from the contact endpoint.

        Args:
            resume_id: Identifier used to build the contact URL.

        Returns:
            A dict with ``phone`` and ``real_name`` keys; both are empty
            strings when the request fails or the fields are absent.
        """
        url = f'{self.base_url}/resume/{resume_id}/contact'
        params = {'_': int(time.time() * 1000)}
        try:
            data = self._make_request(url, params)
            contact = data.get('data') or {}
            return {
                'phone': contact.get('phone', ''),
                'real_name': contact.get('real_name', ''),
            }
        except Exception:
            # Best effort: a failed lookup yields empty contact fields.
            return {'phone': '', 'real_name': ''}

    def _process_resume(self, item):
        """Build one output row from a list item, or None if that fails."""
        try:
            resume_id = item.get('id')
            blurred_name = item.get('name_value', '')
            contact_info = self.get_contact_info(resume_id)
            full_name = self.get_name(resume_id, blurred_name)

            # NOTE(review): 'infoCateforyArrObj' looks misspelled but is
            # presumably the key the API actually returns; confirm.
            categories = item.get('infoCateforyArrObj') or []
            categories_str = ','.join(
                str(category.get('name') or '') for category in categories
            )

            return {
                'name_value': blurred_name,
                'age': item.get('age', ''),
                'edu_value': item.get('edu_value', ''),
                'job_instant_value': item.get('job_instant_value', ''),
                'job_salary_from': item.get('job_salary_from', ''),
                'job_salary_to': item.get('job_salary_to', ''),
                'categories': categories_str,
                'phone': contact_info['phone'],
                'real_name': full_name,
            }
        except Exception as e:
            print(f"处理简历失败: {str(e)}")
            return None

    @staticmethod
    def _save_csv(resumes):
        """Write the rows to ``resumes_<timestamp>.csv`` in the working directory."""
        df = pd.DataFrame(resumes)
        filename = f'resumes_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        # utf-8-sig writes a BOM at the start of the file.
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f'数据已保存到 {filename}')

    def fetch_resumes(self, keyword='护工', page_size=10, page_index=1, save_csv=True):
        """Fetch one page of resumes and enrich each with name and phone.

        Args:
            keyword: Search keyword sent to the list endpoint.
            page_size: Number of items requested per page.
            page_index: Page number to request.
            save_csv: When true and at least one row was built, also write
                the rows to a timestamped CSV file.

        Returns:
            A list of row dicts in the same order as the list response;
            an empty list when nothing was found or the list request failed.
        """
        url = f'{self.base_url}/resumes'
        timestamp = int(time.time() * 1000)
        params = {
            '_': timestamp,
            'tab': 'resume',
            'keyword': keyword,
            't': timestamp,
            'info_subarea': '',
            'info_category': '',
            'pageSize': page_size,
            'pageIndex': page_index,
            'showStatus': 'true',
        }

        try:
            data = self._make_request(url, params)
            items = (data.get('data') or {}).get('items') or []
            if not items:
                return []

            with ThreadPoolExecutor(max_workers=10) as executor:
                futures = [
                    executor.submit(self._process_resume, item) for item in items
                ]
                # as_completed is used only to report progress as work finishes.
                for idx, _ in enumerate(as_completed(futures), 1):
                    print(f"已处理: {idx}/{len(items)}")

            # Collect in submission order so the output order is deterministic.
            results = (future.result() for future in futures)
            resumes = [row for row in results if row]
        except Exception as e:
            print(f"获取简历数据失败: {str(e)}")
            return []

        # Optional CSV export; a write failure must not discard fetched rows.
        if resumes and save_csv:
            try:
                self._save_csv(resumes)
            except Exception as e:
                print(f"保存 CSV 失败: {str(e)}")

        return resumes
|
||
if __name__ == '__main__':
    # Script entry point: run one fetch with the default search settings.
    ResumeAPI().fetch_resumes()