import re
from datetime import datetime

import pymysql
import requests

# ==== Database configuration ====
MYSQL_CONFIG = {
    'host': '39.101.135.56',
    'user': 'tsreshub_prod',
    'password': 'Tr5h$Prod!92@TsRH',
    'database': 'tsreshub_db',
    'port': 3306,
    'charset': 'utf8mb4',
}

# ==== Request configuration ====
HEADERS = {
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cache-control': 'no-cache',
    'content-type': 'application/json;charset=UTF-8',
    'origin': 'https://www.fnrc.vip',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
    'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}

COOKIES = {
    'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
    'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
    'company_sign': '',
    'company_nonce': '',
    'cuid': '',
}

# ==== Field-cleaning helpers ====
def extract_int(s):
    """Return the first integer found in s (e.g. '25岁' -> 25), or None."""
    match = re.search(r'\d+', str(s))
    return int(match.group()) if match else None


def parse_datetime(s):
    """Parse an ISO-format timestamp; fall back to a fixed sentinel date."""
    try:
        return datetime.fromisoformat(s)
    except (TypeError, ValueError):
        return datetime(2019, 12, 12)


def clean_item(item):
    """Rename raw API fields to resumes_resumebasic columns and normalise values."""
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    # Flatten up to four work-experience entries into fixed work_N_* fields;
    # missing slots become empty strings.
    experience = item.get("experience", [])
    for j in range(4):
        if j < len(experience):
            company = experience[j].get("company", "")
            time_line = experience[j].get("time_line", "")
            content = experience[j].get("content", "")
        else:
            company = time_line = content = ''
        item[f"work_{j + 1}_experience"] = company
        item[f"work_{j + 1}_time"] = time_line
        item[f"work_{j + 1}_description"] = content

    cleaned = {reverse_field_map[k]: v for k, v in item.items() if k in reverse_field_map}

    if "age" in cleaned:
        cleaned["age"] = extract_int(cleaned["age"])
    if "height" in cleaned:
        cleaned["height"] = extract_int(cleaned["height"])
    if "weight" in cleaned:
        cleaned["weight"] = extract_int(cleaned["weight"])
    if "update_time" in cleaned:
        cleaned["update_time"] = parse_datetime(cleaned["update_time"])

    cleaned["source_id"] = 3  # Fixed identifier for this data source
    return cleaned
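# Illustration only: a made-up record (not real API output) showing how
# clean_item flattens the "experience" list and renames raw keys:
#
#   sample = {
#       'resume_id': 'abc123',
#       'user_name': 'Zhang San',
#       'user_age': '25岁',
#       'last_edit_time': '2024-05-01 10:00:00',
#       'experience': [
#           {'company': 'ACME', 'time_line': '2020-2022', 'content': 'ops'},
#       ],
#   }
#   clean_item(sample)
#   # -> {'resume_id': 'abc123', 'name': 'Zhang San', 'age': 25,
#   #     'update_time': datetime(2024, 5, 1, 10, 0),
#   #     'work_1_experience': 'ACME', 'work_1_time': '2020-2022',
#   #     'work_1_description': 'ops', ..., 'source_id': 3}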
# ==== Main logic ====
def main():
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(COOKIES)

    connection = pymysql.connect(**MYSQL_CONFIG)
    cursor = connection.cursor()

    url = "https://www.fnrc.vip/job/company/v1/resume/page"
    all_items = []

    for page in range(6, 8):
        payload = {
            'step': 1000,
            'page': page,
            'education_level': [],
            'arrival_time': [],
            'work_time': [],
            'area_id': [],
            'keywords': '',
            'work_status': '',
            'work_status_show': '求职状态',
            'category_id': '',
            'work_type': '',
            'work_type_show': '是否兼职',
            'sex': '',
            'sex_show': '性别',
            'is_head': '',
            'is_head_show': '有无照片',
            'job_id': '',
            'age': [],
            'age_show': '年龄',
            'refresh_time': 0,
            'site_id': '',
            'site_id2': '',
            'province': '',
            'city': '',
            'county': '',
            'provinceArr': [],
            'cityArr': [],
            'countyArr': [],
            'only_job_category': 0,
        }
        try:
            resp = session.post(url, json=payload, timeout=10)
            resp.raise_for_status()
            data = resp.json().get('data', [])
            print(f"📖 Page {page}: fetched {len(data)} records")
            for item in data:
                all_items.append(clean_item(item))
        except Exception as e:
            print(f"❌ Request for page {page} failed: {e}")

    if all_items:
        # Build one upsert statement; resume_id is excluded from the UPDATE
        # clause so it stays the stable dedupe key.
        keys = list(all_items[0].keys())
        columns = ', '.join(keys)
        placeholders = ', '.join(['%s'] * len(keys))
        update_clause = ', '.join(f"{key}=VALUES({key})" for key in keys if key != 'resume_id')
        sql = f"""
            INSERT INTO resumes_resumebasic ({columns})
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """
        try:
            # Read values by the shared key list rather than item.values():
            # dict order can differ between records, and a missing field
            # becomes NULL instead of silently shifting columns.
            values = [tuple(item.get(k) for k in keys) for item in all_items]
            cursor.executemany(sql, values)
            connection.commit()
            print(f"✅ Upserted {len(all_items)} records")
        except Exception as e:
            print(f"❌ Batch insert failed: {e}")
            connection.rollback()

    cursor.close()
    connection.close()


if __name__ == "__main__":
    main()
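
# ---- Schema assumption (sketch) ----
# The ON DUPLICATE KEY UPDATE upsert in main() only deduplicates if
# resumes_resumebasic has a unique index on resume_id; the table's DDL is
# not part of this script, so that index is an assumption. A pre-flight
# check along these lines (hypothetical helper, not called by main()) can
# confirm it via information_schema:
def has_unique_resume_id(cursor):
    """Return True if resumes_resumebasic has a unique index on resume_id."""
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.statistics
        WHERE table_schema = DATABASE()
          AND table_name = 'resumes_resumebasic'
          AND column_name = 'resume_id'
          AND non_unique = 0
        """
    )
    return cursor.fetchone()[0] > 0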