import re
from datetime import datetime

import pymysql
import requests

# ==== Database configuration ====
MYSQL_CONFIG = {
    'host': '39.101.135.56',
    'user': 'tsreshub_prod',
    'password': 'Tr5h$Prod!92@TsRH',
    'database': 'tsreshub_db',
    'port': 3306,
    'charset': 'utf8mb4',
}
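
# Hardcoding credentials in source is risky. A minimal alternative is to read
# them from the environment, falling back to the values above (a sketch; the
# TSRH_DB_* variable names are illustrative assumptions, not from the original):
import os

MYSQL_CONFIG['host'] = os.environ.get('TSRH_DB_HOST', MYSQL_CONFIG['host'])
MYSQL_CONFIG['user'] = os.environ.get('TSRH_DB_USER', MYSQL_CONFIG['user'])
MYSQL_CONFIG['password'] = os.environ.get('TSRH_DB_PASSWORD', MYSQL_CONFIG['password'])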

# ==== Request configuration ====
HEADERS = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cache-control': 'no-cache',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://www.fnrc.vip',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
    'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
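
# These headers mirror the web UI's own XHR (Chrome 135 on Windows), so the
# endpoint receives a request that looks like it came from the browser.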

COOKIES = {
    'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
    'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
    'company_sign': '',
    'company_nonce': '',
    'cuid': '',
}
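
# PHPSESSID and auth-token are session credentials captured from a logged-in
# browser; the auth-token JWT carries an exp claim, so both need to be
# refreshed periodically or requests will start failing with auth errors.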


# ==== Field-cleaning helpers ====
def extract_int(s):
    # Pull the first run of digits out of a value like '25岁' or '170cm';
    # return None when there are no digits at all.
    match = re.search(r'\d+', str(s))
    return int(match.group()) if match else None


def parse_datetime(s):
    # Parse an ISO-8601 timestamp; fall back to a fixed sentinel date when the
    # value is missing or malformed (the 2019-12-12 default comes from the
    # original script).
    try:
        return datetime.fromisoformat(s)
    except (TypeError, ValueError):
        return datetime(2019, 12, 12)
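
# Example behavior of the two helpers:
#   extract_int('25岁')                   -> 25
#   extract_int('n/a')                    -> None
#   parse_datetime('2024-05-16 09:30:00') -> datetime(2024, 5, 16, 9, 30)
#   parse_datetime(None)                  -> datetime(2019, 12, 12)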


def clean_item(item):
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    # Flatten up to four work-experience entries into fixed columns so they
    # line up with the work_N_* keys in reverse_field_map.
    experience = item.get("experience", [])
    for j in range(4):
        if j < len(experience):
            company = experience[j].get("company", "")
            time_line = experience[j].get("time_line", "")
            content = experience[j].get("content", "")
        else:
            company = ''
            time_line = ''
            content = ''
        item[f"work_{j + 1}_experience"] = company
        item[f"work_{j + 1}_time"] = time_line
        item[f"work_{j + 1}_description"] = content

    # Keep only mapped fields, renaming them to local column names.
    cleaned = {
        reverse_field_map[k]: v
        for k, v in item.items()
        if k in reverse_field_map
    }

    if "age" in cleaned:
        cleaned["age"] = extract_int(cleaned["age"])

    # Note: reverse_field_map never produces 'height' or 'weight', so the two
    # branches below are currently dead code; they only take effect if those
    # fields are added to the map.
    if "height" in cleaned:
        cleaned["height"] = extract_int(cleaned["height"])

    if "weight" in cleaned:
        cleaned["weight"] = extract_int(cleaned["weight"])

    if "update_time" in cleaned:
        cleaned["update_time"] = parse_datetime(cleaned["update_time"])

    cleaned["source_id"] = 3  # fixed identifier for this scrape source
    return cleaned
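
# Shape of the transformation, on a hypothetical input (values invented for
# illustration, not taken from the live API):
#   clean_item({'user_name': 'Zhang San', 'user_age': '25岁',
#               'last_edit_time': '2024-01-01 08:00:00', 'experience': []})
#   -> {'name': 'Zhang San', 'age': 25, 'update_time': datetime(2024, 1, 1, 8, 0),
#       'work_1_experience': '', ..., 'source_id': 3}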


# ==== Main logic ====
def main():
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(COOKIES)

    connection = pymysql.connect(**MYSQL_CONFIG)
    cursor = connection.cursor()

    url = "https://www.fnrc.vip/job/company/v1/resume/page"
    all_items = []

    for page in range(6, 8):  # pages 6 and 7 only; 1000 records per page
        payload = {
            'step': 1000,
            'page': page,
            'education_level': [],
            'arrival_time': [],
            'work_time': [],
            'area_id': [],
            'keywords': '',
            'work_status': '',
            # The *_show values appear to be the site's untouched UI filter
            # labels (求职状态 "job status", 是否兼职 "part-time or not",
            # 性别 "gender", 有无照片 "has photo", 年龄 "age"); they are sent
            # verbatim, exactly as the browser does.
            'work_status_show': '求职状态',
            'category_id': '',
            'work_type': '',
            'work_type_show': '是否兼职',
            'sex': '',
            'sex_show': '性别',
            'is_head': '',
            'is_head_show': '有无照片',
            'job_id': '',
            'age': [],
            'age_show': '年龄',
            'refresh_time': 0,
            'site_id': '',
            'site_id2': '',
            'province': '',
            'city': '',
            'county': '',
            'provinceArr': [],
            'cityArr': [],
            'countyArr': [],
            'only_job_category': 0,
        }

        try:
            resp = session.post(url, json=payload, timeout=10)
            resp.raise_for_status()
            data = resp.json().get('data', [])
            print(f"📖 Page {page}: fetched {len(data)} records")
            for item in data:
                all_items.append(clean_item(item))
        except Exception as e:
            print(f"❌ Request for page {page} failed: {e}")

    if all_items:
        keys = list(all_items[0].keys())
        columns = ', '.join(keys)
        placeholders = ', '.join(['%s'] * len(keys))
        update_clause = ', '.join(f"{key}=VALUES({key})" for key in keys if key != 'resume_id')

        sql = f"""
            INSERT INTO resumes_resumebasic ({columns})
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """

        try:
            # Build every row in the same column order as `keys`; dict key
            # order can differ between items when the API omits fields, so
            # relying on item.values() directly risks misaligned columns.
            values = [tuple(item.get(k) for k in keys) for item in all_items]
            cursor.executemany(sql, values)
            connection.commit()
            print(f"✅ Inserted {len(all_items)} records")
        except Exception as e:
            print(f"❌ Bulk insert failed: {e}")
            connection.rollback()

    cursor.close()
    connection.close()


if __name__ == "__main__":
    main()