"""Scrape resume data from www.zhrczp.com and export it to an Excel file.

Workflow (see ``data_integration``):

1. Page through the resume search results for every category ID in
   ``jobclass_search_list`` and collect resume IDs.
2. If nothing was found, log in, install the returned cookies and retry.
3. Fetch and parse every resume page into a flat dict.
4. Append the values to the module-level ``pd_data`` columns and write
   them to a timestamped ``.xlsx`` file.
"""
from datetime import datetime
import re

import pandas as pd

# NOTE(review): star import from a project-local module; it provides at
# least ``MR`` (the HTTP client wrapper used below).
from Requests_Except import *

base_url = 'www.zhrczp.com'
protocol = 'https'
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Job-category IDs to crawl. Known IDs noted by the original author:
# 76 caregiver; 46 hospital nursing (apparently a search-only category --
# TODO confirm); 405 accountant; 43 finance; 19 driver / taxi driver;
# 249 electrician.
jobclass_search_list = [53]

Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)

# Column-oriented accumulator: one list per output column, one entry per
# resume. The (Chinese) keys become the Excel column headers.
pd_data = {
    'resume_id': [],
    '姓名': [],
    '年龄': [],
    '身高': [],
    '体重': [],
    '工作经验': [],
    '最高学历': [],
    '婚姻状态': [],
    '民族': [],
    '现居住地': [],
    '更新时间': [],
    '工作职能': [],
    '意向岗位': [],
    '从事行业': [],
    '期望薪资': [],
    '到岗时间': [],
    '工作性质': [],
    '求职状态': [],
    '工作地点': [],
}


def get_search_page(keyword_id, page):
    """Fetch one page of resume search results for a job category.

    Args:
        keyword_id: Job-category ID; sent as both ``jobin`` and
            ``jobclass_search``.
        page: 1-based result page number.

    Returns:
        A ``(html_text, status_code)`` tuple taken from the response.
    """
    params = {
        'c': 'resume',
        'jobin': str(keyword_id),
        'jobclass_search': str(keyword_id),
        'cityin': '',
        'cityclass_search': '',
        'keyword': '',
        'minsalary': '',
        'maxsalary': '',
        'minage': '',
        'maxage': '',
        'exp': '',
        'edu': '',
        'uptime': '',
        'sex': '',
        'type': '',
        'page': str(page),
    }
    url = '/member/index.php'
    resp = Requests.get(url, params=params)
    return resp.text, resp.status_code


def get_resume_list():
    """Collect resume IDs from search pages 1-5 of every configured category.

    Paging for a category stops early when a page's HTML is identical to
    the previously fetched page, which is treated as "no more results".

    Returns:
        A list of resume-ID strings, de-duplicated across all pages and
        categories, in first-seen order.
    """
    resume_id_list = []
    seen_ids = set()
    previous_page_html = ''
    for keyword_id in jobclass_search_list:
        for page in range(1, 6):
            html, rest_code = get_search_page(keyword_id, page)
            if rest_code != 200:
                # Diagnostic only; the body is still inspected below.
                print(rest_code, type(rest_code))
            if html == previous_page_html:
                print('切换下一类型')
                break
            previous_page_html = html
            found = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
            # De-duplicate globally (not just per page) so a resume is
            # never fetched twice and never yields duplicate rows; a
            # seen-set keeps the order deterministic.
            for resume_id in found:
                if resume_id not in seen_ids:
                    seen_ids.add(resume_id)
                    resume_id_list.append(resume_id)
    return resume_id_list


def get_resumeInfo(resume_id):
    """Fetch the detail page of one resume.

    Args:
        resume_id: Resume ID as returned by ``get_resume_list``.

    Returns:
        Whatever ``resp.xpath()`` returns -- presumably a parsed document
        supporting ``.xpath(expr)`` (that is how ``info_to_dict`` uses it).
    """
    url = '/member/index.php'
    params = {
        'c': 'hr',
        'act': 'resumeInfo',
        'eid': str(resume_id),
        'state': '',
        'from': 'rck',
    }
    resp = Requests.get(url, params=params)
    return resp.xpath()


def _first_group(pattern, text, flags=0):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    match = re.search(pattern, text, flags)
    return match.group(1) if match else ''


def extract_info(item):
    """Pull basic personal attributes out of free-text resume fragments.

    Args:
        item: Iterable of strings; they are joined with spaces and then
            searched with regular expressions.

    Returns:
        Dict with keys ``age``, ``height``, ``weight``, ``experience``,
        ``education``, ``marital`` and ``ethnic``. Each value is the
        matched text, or ``''`` when nothing matched.
    """
    text = " ".join(item)
    return {
        # Age: two digits followed by "岁".
        'age': _first_group(r'(\d{2})岁', text),
        # Height: 2-3 digits followed by "cm" (case-insensitive).
        'height': _first_group(r'(\d{2,3})\s*cm', text, re.IGNORECASE),
        # Weight: 2-3 digits followed by "kg" or "公斤".
        'weight': _first_group(r'(\d{2,3})\s*(kg|公斤)', text, re.IGNORECASE),
        # Work experience.
        'experience': _first_group(
            r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)',
            text,
        ),
        # Highest education level.
        'education': _first_group(r'(初中|高中|中专|大专|本科|硕士|博士)', text),
        # Marital status.
        'marital': _first_group(r'(已婚|未婚)', text),
        # Ethnicity.
        'ethnic': _first_group(r'(汉|满|回|壮|蒙古)', text),
    }


def _first_stripped(nodes):
    """Return the first element of *nodes* stripped, or '' when empty."""
    return nodes[0].strip() if nodes else ''


def info_to_dict(info):
    """Convert a parsed resume page into a flat dict of output columns.

    Args:
        info: Object exposing ``.xpath(expr)`` that returns a list of
            strings for ``text()`` queries (as produced by
            ``get_resumeInfo``).

    Returns:
        Dict keyed by the column names of ``pd_data`` (all except
        ``resume_id``); missing fields are empty strings.
    """
    name = info.xpath('//span[@class="hr_resume_username"]/text()')

    # Basic info line (age, height, weight, experience, education, marital
    # status, ethnicity, current residence), separated by "·".
    parts_raw = info.xpath('//div[@class="hr_resume_info"]/text()')
    # The ethnicity (e.g. "汉") may live in a nested <span> instead.
    extra_span = info.xpath('//div[@class="hr_resume_info"]/span/text()')

    # Always build a list so the append below cannot hit a str.
    parts = []
    if parts_raw:
        cleaned = re.sub(r'\s+', ' ', parts_raw[0]).strip()
        parts = [p.strip() for p in cleaned.split('·') if p.strip()]
    if extra_span:
        parts.append(extra_span[0].strip())  # add the ethnicity segment

    # Current residence: usually the last segment, but the span appended
    # above can push it off the end, so scan backwards for the marker.
    current_location = ''
    for idx in range(len(parts) - 1, -1, -1):
        if '现居' in parts[idx]:
            current_location = parts.pop(idx)
            break

    # Last-updated timestamp.
    update_time = info.xpath('//span[@class="hr_resume_time_l "]/text()')

    parts_dict = extract_info(parts)

    # Job-intention section.
    job_funcs = info.xpath('//span[@class="yun_newedition_yx_job"]/text()')
    job_titles = info.xpath('//li[span[contains(text(),"意向岗位")]]/text()')
    industry = info.xpath('//li[span[contains(text(),"从事行业")]]/text()')
    salary = info.xpath('//li[span[contains(text(),"期望薪资")]]/text()')
    report_time = info.xpath('//li[span[contains(text(),"到岗时间")]]/text()')
    job_type = info.xpath('//li[span[contains(text(),"工作性质")]]/text()')
    job_status = info.xpath('//li[span[contains(text(),"求职状态")]]/text()')
    location = info.xpath('//li[span[contains(text(),"工作地点")]]/text()')

    # Assemble the output row.
    data = {
        '姓名': _first_stripped(name),
        '年龄': parts_dict.get('age', ''),
        '身高': parts_dict.get('height', ''),
        '体重': parts_dict.get('weight', ''),
        '工作经验': parts_dict.get('experience', ''),
        '最高学历': parts_dict.get('education', ''),
        '婚姻状态': parts_dict.get('marital', ''),
        '民族': parts_dict.get('ethnic', ''),
        '现居住地': current_location.replace('现居', '').strip(),
        # The first three characters are dropped -- presumably a fixed
        # label prefix in front of the timestamp (TODO confirm).
        '更新时间': update_time[0][3:].strip() if update_time else '',
        # Job intention.
        '工作职能': ', '.join([j.strip() for j in job_funcs]),
        '意向岗位': _first_stripped(job_titles),
        '从事行业': _first_stripped(industry),
        '期望薪资': _first_stripped(salary),
        '到岗时间': _first_stripped(report_time),
        '工作性质': _first_stripped(job_type),
        '求职状态': _first_stripped(job_status),
        '工作地点': _first_stripped(location),
    }
    return data


def get_cookies():
    """Log in to the site and return the session cookies.

    Returns:
        The result of ``resp.cookies_dict()`` for the login response.
    """
    url = '/login/c_loginsave.html'
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    data = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': '18713831026',
        'password': '18713831026',
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post(url, data=data)
    return resp.cookies_dict()


def data_integration():
    """Run the full crawl and write the results to an Excel file.

    Collects resume IDs (logging in and retrying once if none are found),
    parses every resume, appends the values to the module-level
    ``pd_data`` columns and saves them as
    ``遵化_<YYYYmmddHHMMSS>_服务员.xlsx`` in the working directory.
    """
    resume_list = get_resume_list()
    if not resume_list:
        # An empty result is taken to mean "not logged in": authenticate
        # and try once more.
        cookies = get_cookies()
        Requests.set_default_cookies(cookies)
        resume_list = get_resume_list()

    for resume_id in resume_list:
        data = info_to_dict(get_resumeInfo(resume_id))
        for key, value in data.items():
            print(key, value)
            pd_data[key].append(value)
        pd_data['resume_id'].append(resume_id)

    df = pd.DataFrame(pd_data)
    df.to_excel(f'遵化_{datetime.now():%Y%m%d%H%M%S}_服务员.xlsx', index=False)


if __name__ == '__main__':
    data_integration()