import requests
import pymysql
import re
from datetime import datetime
# ==== Database configuration ====
MYSQL_CONFIG = {
    'host': '39.101.135.56',
    'user': 'tsreshub_prod',
    'password': 'Tr5h$Prod!92@TsRH',
    'database': 'tsreshub_db',
    'port': 3306,
    'charset': 'utf8mb4',
}
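# NOTE: credentials are hard-coded here for brevity. In a deployed version they
# would more likely be read from the environment, e.g. (the variable name below
# is an assumption, not part of the original script):
#   import os
#   MYSQL_CONFIG['password'] = os.environ['TSRESHUB_DB_PASSWORD']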
# ==== Request configuration ====
HEADERS = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cache-control': 'no-cache',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://www.fnrc.vip',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
    'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
COOKIES = {
    'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
    'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
    'company_sign': '',
    'company_nonce': '',
    'cuid': '',
}
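# Both cookies are session-bound: auth-token is a JWT whose payload carries an
# `exp` (expiry) claim, so once it lapses the API will presumably start
# rejecting requests and these values have to be refreshed from a logged-in
# browser session.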
# ==== Field-cleaning helpers ====
def extract_int(s):
    """Return the first run of digits in s as an int, or None if there is none."""
    try:
        return int(re.search(r'\d+', str(s)).group())
    except (AttributeError, ValueError):
        return None
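# Expected behaviour, with illustrative inputs (the '25岁'-style age strings
# are an assumption about the API's formatting):
#   extract_int('25岁') -> 25
#   extract_int(None)   -> None   (no digits, so re.search returns None)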
def parse_datetime(s):
    try:
        return datetime.fromisoformat(s)
    except (TypeError, ValueError):
        # Fallback date used when the timestamp is missing or malformed.
        return datetime(2019, 12, 12)
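# datetime.fromisoformat accepts the 'YYYY-MM-DD HH:MM:SS' form, so assuming
# the API's last_edit_time field uses it:
#   parse_datetime('2024-05-01 10:30:00') -> datetime(2024, 5, 1, 10, 30)
#   parse_datetime('')                    -> datetime(2019, 12, 12)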
def clean_item(item):
    # Maps API field names (keys) to local database column names (values).
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }
    # Flatten the first four work-experience entries into fixed columns,
    # padding with empty strings so every row has the same shape.
    experience = item.get("experience", [])
    for j in range(4):
        if j < len(experience):
            company = experience[j].get("company", "")
            time_line = experience[j].get("time_line", "")
            content = experience[j].get("content", "")
        else:
            company = ''
            time_line = ''
            content = ''
        item[f"work_{j + 1}_experience"] = company
        item[f"work_{j + 1}_time"] = time_line
        item[f"work_{j + 1}_description"] = content
    cleaned = {
        reverse_field_map[k]: v
        for k, v in item.items()
        if k in reverse_field_map
    }
    if "age" in cleaned:
        cleaned["age"] = extract_int(cleaned["age"])
    # height/weight only appear if reverse_field_map is extended to map them.
    if "height" in cleaned:
        cleaned["height"] = extract_int(cleaned["height"])
    if "weight" in cleaned:
        cleaned["weight"] = extract_int(cleaned["weight"])
    if "update_time" in cleaned:
        cleaned["update_time"] = parse_datetime(cleaned["update_time"])
    cleaned["source_id"] = 3  # constant tag identifying this scrape source
    return cleaned
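# Sketch of the resulting shape (field values are made up for illustration):
#   clean_item({'user_name': 'Zhang San', 'user_age': '28岁', 'experience': []})
#   -> {'name': 'Zhang San', 'age': 28,
#       'work_1_experience': '', 'work_1_time': '', 'work_1_description': '',
#       ..., 'source_id': 3}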
# ==== Main logic ====
def main():
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(COOKIES)
    connection = pymysql.connect(**MYSQL_CONFIG)
    cursor = connection.cursor()
    url = "https://www.fnrc.vip/job/company/v1/resume/page"
    all_items = []
    # Pages 6 and 7 only (range's upper bound is exclusive); widen as needed.
    for page in range(6, 8):
        payload = {
            'step': 1000,  # page size requested from the API
            'page': page,
            'education_level': [],
            'arrival_time': [],
            'work_time': [],
            'area_id': [],
            'keywords': '',
            'work_status': '',
            'work_status_show': '求职状态',  # label: "job-seeking status"
            'category_id': '',
            'work_type': '',
            'work_type_show': '是否兼职',  # label: "part-time or not"
            'sex': '',
            'sex_show': '性别',  # label: "gender"
            'is_head': '',
            'is_head_show': '有无照片',  # label: "has photo or not"
            'job_id': '',
            'age': [],
            'age_show': '年龄',  # label: "age"
            'refresh_time': 0,
            'site_id': '',
            'site_id2': '',
            'province': '',
            'city': '',
            'county': '',
            'provinceArr': [],
            'cityArr': [],
            'countyArr': [],
            'only_job_category': 0,
        }
        try:
            resp = session.post(url, json=payload, timeout=10)
            resp.raise_for_status()
            data = resp.json().get('data', [])
            print(f"📖 Page {page}: fetched {len(data)} records")
            for item in data:
                all_items.append(clean_item(item))
        except Exception as e:
            print(f"❌ Request for page {page} failed: {e}")
    if all_items:
        keys = all_items[0].keys()
        columns = ', '.join(keys)
        placeholders = ', '.join(['%s'] * len(keys))
        # Upsert: on a duplicate key, refresh every column except resume_id.
        update_clause = ', '.join([f"{key}=VALUES({key})" for key in keys if key != 'resume_id'])
        sql = f"""
            INSERT INTO resumes_resumebasic ({columns})
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """
        try:
            # Build each row in the same column order as `keys`; .get() guards
            # against records where an optional field never appeared.
            values = [tuple(item.get(k) for k in keys) for item in all_items]
            cursor.executemany(sql, values)
            connection.commit()
            print(f"✅ Inserted/updated {len(all_items)} records")
        except Exception as e:
            print(f"❌ Batch insert failed: {e}")
            connection.rollback()
    cursor.close()
    connection.close()
if __name__ == "__main__":
    main()