# File-viewer header captured with the paste: 250 lines, 8.4 KiB, Python

from datetime import datetime
import re
import pandas as pd
from Requests_Except import *
# --- Target site -----------------------------------------------------------
base_url = 'www.zhrczp.com'
protocol = 'https'

# --- Browser-like request headers applied to every request ------------------
# Long values are split with adjacent-literal concatenation; the resulting
# strings are unchanged.
default_headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    # The Referer is a fixed search URL for category 76, independent of
    # jobclass_search_list below.
    'Referer': (
        'https://www.zhrczp.com/member/index.php'
        '?c=resume&jobin=76&jobclass_search=76'
        '&cityin=&cityclass_search=&keyword='
        '&minsalary=&maxsalary=&minage=&maxage='
        '&exp=&edu=&uptime=&sex=&type='
    ),
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/134.0.0.0 Safari/537.36'
    ),
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# --- Job categories to crawl -------------------------------------------------
# Known category ids (from the original note): 76 = care worker; 46 = hospital
# nursing (noted as a search-only category type); 405 = accountant;
# 43 = finance; 19 = driver / taxi driver; 249 = electrician.
# NOTE(review): 53 is not in that list; presumably the waiter category given
# the export file name used by data_integration() - confirm.
jobclass_search_list = [53]

# --- Shared HTTP client ------------------------------------------------------
# MR is presumably provided by the star import from Requests_Except; every
# request in this module goes through this one instance.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)
# Column-oriented accumulator for the exported spreadsheet: one empty list per
# column, filled row by row while resumes are processed. Key order is the
# column order of the resulting DataFrame.
pd_data = {
    column: []
    for column in (
        'resume_id',
        '姓名',      # name
        '年龄',      # age
        '身高',      # height
        '体重',      # weight
        '工作经验',  # work experience
        '最高学历',  # highest education
        '婚姻状态',  # marital status
        '民族',      # ethnicity
        '现居住地',  # current residence
        '更新时间',  # resume update time
        '工作职能',  # job functions
        '意向岗位',  # target position
        '从事行业',  # industry
        '期望薪资',  # expected salary
        '到岗时间',  # availability date
        '工作性质',  # job type
        '求职状态',  # job-seeking status
        '工作地点',  # preferred work location
    )
}
def get_search_page(keyword_id, page):
    """Fetch one page of the resume search listing for a job category.

    Args:
        keyword_id: Job-category id; sent as both ``jobin`` and
            ``jobclass_search``.
        page: Page number of the listing to request.

    Returns:
        A ``(text, status_code)`` tuple read from the response object.
    """
    category = str(keyword_id)
    # Every other search filter is submitted blank.
    blank_filters = (
        'cityin', 'cityclass_search', 'keyword',
        'minsalary', 'maxsalary', 'minage', 'maxage',
        'exp', 'edu', 'uptime', 'sex', 'type',
    )

    query = {'c': 'resume', 'jobin': category, 'jobclass_search': category}
    query.update(dict.fromkeys(blank_filters, ''))
    query['page'] = str(page)

    response = Requests.get('/member/index.php', params=query)
    return response.text, response.status_code
def get_resume_list():
    """Collect the resume IDs listed on the search pages of every category.

    For each id in ``jobclass_search_list`` up to five result pages are
    fetched. Paging for a category stops early when a page's HTML is identical
    to the previously fetched page. IDs are taken from the
    ``com_lookresume_check('<id>','1')`` calls embedded in the page.

    Returns:
        list[str]: Unique resume IDs in first-seen order. IDs are
        de-duplicated across all pages and categories, so a resume that shows
        up more than once yields a single entry and the order is stable
        between runs.
    """
    # A dict is used as an insertion-ordered set (a plain set would lose the
    # page order and make the result order vary from run to run).
    seen_ids = {}
    previous_page_html = ''
    for keyword_id in jobclass_search_list:
        for page in range(1, 6):
            html, rest_code = get_search_page(keyword_id, page)
            if rest_code != 200:
                # Diagnostic only: the body is still inspected below.
                print(rest_code, type(rest_code))
            if html == previous_page_html:
                # Same HTML as the previous request: no further pages for
                # this category, move on to the next one.
                print('切换下一类型')
                break
            previous_page_html = html
            for resume_id in re.findall(r"com_lookresume_check\('(.+?)','1'\)", html):
                seen_ids.setdefault(resume_id, None)
    return list(seen_ids)
def get_resumeInfo(resume_id):
    """Request the detail page of a single resume.

    Args:
        resume_id: Resume identifier; sent as the ``eid`` query parameter.

    Returns:
        The result of calling ``xpath()`` on the response object
        (presumably a parsed HTML tree that supports XPath queries - confirm
        against the Requests_Except wrapper).
    """
    detail_query = {
        'c': 'hr',
        'act': 'resumeInfo',
        'eid': str(resume_id),
        'state': '',
        'from': 'rck',
    }
    response = Requests.get('/member/index.php', params=detail_query)
    return response.xpath()
def extract_info(item):
    """Pull the basic-profile fields out of a sequence of text fragments.

    The fragments are joined with single spaces and each field is located
    with its own regular expression on the joined text.

    Args:
        item: Iterable of strings (the pieces of the profile summary line).

    Returns:
        dict: Keys ``age``, ``height``, ``weight``, ``experience``,
        ``education``, ``marital`` and ``ethnic``. Each value is capture
        group 1 of the first match, or ``''`` when nothing matches.
    """
    joined = " ".join(item)

    # (result key, pattern, regex flags) - evaluated in this order.
    field_patterns = (
        # age: two digits followed by the "years old" character
        ('age', r'(\d{2})岁', 0),
        # height in cm
        ('height', r'(\d{2,3})\s*cm', re.IGNORECASE),
        # weight in kg
        ('weight', r'(\d{2,3})\s*(kg|公斤)', re.IGNORECASE),
        # work experience: none / under 1 year / a-b years / n+ years / n years
        ('experience',
         r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)',
         0),
        # highest education level
        ('education', r'(初中|高中|中专|大专|本科|硕士|博士)', 0),
        # marital status: married / unmarried
        ('marital', r'(已婚|未婚)', 0),
        # ethnic group (only these five are recognised)
        ('ethnic', r'(汉|满|回|壮|蒙古)', 0),
    )

    extracted = {}
    for key, pattern, flags in field_patterns:
        found = re.search(pattern, joined, flags)
        extracted[key] = found.group(1) if found else ''
    return extracted
def _first_text(nodes):
    """Return the first non-blank string in *nodes*, stripped, or '' if none."""
    for node in nodes:
        stripped = node.strip()
        if stripped:
            return stripped
    return ''


def info_to_dict(info):
    """Convert a resume detail page into a flat dict of spreadsheet fields.

    Args:
        info: Object with an ``xpath(expr)`` method returning a list of
            strings for ``text()`` queries (the value returned by
            ``get_resumeInfo``; presumably an lxml tree - confirm).

    Returns:
        dict: One entry per spreadsheet column except ``resume_id`` (name,
        basic profile fields, current residence, update time and the
        job-intention fields). Missing values are ``''``.
    """
    name = info.xpath('//span[@class="hr_resume_username"]/text()')

    # Basic profile line: age, height, weight, experience, education,
    # marital status, ethnicity, current residence - separated by '·'.
    parts_raw = info.xpath('//div[@class="hr_resume_info"]/text()')
    # Part of the line (e.g. the ethnicity) may live in a child <span>.
    extra_span = info.xpath('//div[@class="hr_resume_info"]/span/text()')

    # Always work with a list: with no text in the div, the former
    # empty-string placeholder made the append below raise AttributeError.
    parts = []
    if parts_raw:
        cleaned = re.sub(r'\s+', ' ', parts_raw[0]).strip()
        parts = [p.strip() for p in cleaned.split('·') if p.strip()]
    if extra_span:
        parts.append(extra_span[0].strip())

    # Current residence: look through every part (last match wins) instead of
    # only the final one. The span text appended above may come after it, and
    # a residence left in `parts` would be fed to the regex extraction, where
    # a place name can be mistaken for another field.
    current_location = ''
    for idx in range(len(parts) - 1, -1, -1):
        if '现居' in parts[idx]:
            current_location = parts.pop(idx)
            break

    # Resume update time (the class attribute really ends with a space here).
    update_time = info.xpath('//span[@class="hr_resume_time_l "]/text()')

    parts_dict = extract_info(parts)

    # Job-intention section: each value is the text of the <li> whose <span>
    # label contains the given caption.
    job_funcs = info.xpath('//span[@class="yun_newedition_yx_job"]/text()')
    job_titles = info.xpath('//li[span[contains(text(),"意向岗位")]]/text()')
    industry = info.xpath('//li[span[contains(text(),"从事行业")]]/text()')
    salary = info.xpath('//li[span[contains(text(),"期望薪资")]]/text()')
    report_time = info.xpath('//li[span[contains(text(),"到岗时间")]]/text()')
    job_type = info.xpath('//li[span[contains(text(),"工作性质")]]/text()')
    job_status = info.xpath('//li[span[contains(text(),"求职状态")]]/text()')
    location = info.xpath('//li[span[contains(text(),"工作地点")]]/text()')

    # _first_text skips whitespace-only text nodes, so a value that follows
    # leading whitespace inside the element is not lost.
    data = {
        '姓名': _first_text(name),
        '年龄': parts_dict.get('age', ''),
        '身高': parts_dict.get('height', ''),
        '体重': parts_dict.get('weight', ''),
        '工作经验': parts_dict.get('experience', ''),
        '最高学历': parts_dict.get('education', ''),
        '婚姻状态': parts_dict.get('marital', ''),
        '民族': parts_dict.get('ethnic', ''),
        '现居住地': current_location.replace('现居', '').strip(),
        # Drops the first three characters of the text - presumably a fixed
        # label prefix; confirm against the page markup.
        '更新时间': update_time[0][3:].strip() if update_time else '',
        # Job intention
        '工作职能': ', '.join(j.strip() for j in job_funcs),
        '意向岗位': _first_text(job_titles),
        '从事行业': _first_text(industry),
        '期望薪资': _first_text(salary),
        '到岗时间': _first_text(report_time),
        '工作性质': _first_text(job_type),
        '求职状态': _first_text(job_status),
        '工作地点': _first_text(location),
    }
    return data
def get_cookies(username='18713831026', password='18713831026'):
    """Log in through the site's login endpoint and return the session cookies.

    Args:
        username: Account name submitted in the login form. Defaults to the
            account that was previously hard-coded, so existing
            zero-argument calls behave as before.
        password: Password submitted in the login form. Same default rule.

    Returns:
        The result of ``cookies_dict()`` on the login response
        (``data_integration`` hands it to ``Requests.set_default_cookies``).
    """
    # NOTE(review): the default credentials are committed to source control.
    # They are kept only for backward compatibility; callers should pass
    # credentials explicitly and the defaults should be rotated/removed.
    login_form = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': username,
        'password': password,
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post('/login/c_loginsave.html', data=login_form)
    return resp.cookies_dict()
def data_integration():
    """Run the whole pipeline: list resumes, scrape each one, export to Excel.

    If the first search yields no IDs, the function logs in via
    ``get_cookies()``, installs the cookies on the shared ``Requests`` client
    and searches once more. Every scraped field is printed and appended to
    the module-level ``pd_data`` columns, which are then written to a
    timestamped ``.xlsx`` file in the current working directory.
    """
    resume_ids = get_resume_list()
    if not resume_ids:
        # Nothing found: authenticate and retry the search once.
        Requests.set_default_cookies(get_cookies())
        resume_ids = get_resume_list()

    for resume_id in resume_ids:
        record = info_to_dict(get_resumeInfo(resume_id))
        for column, cell in record.items():
            print(column, cell)
            pd_data[column].append(cell)
        # The id column is not part of the scraped record; add it per row.
        pd_data['resume_id'].append(resume_id)

    frame = pd.DataFrame(pd_data)
    frame.to_excel(f'遵化_{datetime.now():%Y%m%d%H%M%S}_服务员.xlsx', index=False)
# Script entry point: run the full scrape-and-export pipeline when the file
# is executed directly (not when it is imported).
if __name__ == '__main__':
    data_integration()
    # Manual debugging helpers, left disabled:
    # print(get_resumeInfo('34735'))
    # get_cookies()