250 lines
8.4 KiB
Python
250 lines
8.4 KiB
Python
from datetime import datetime
|
|
import re
|
|
|
|
import pandas as pd
|
|
|
|
from Requests_Except import *
|
|
|
|
# Scheme and host of the target site; both are handed to the shared HTTP client.
protocol = 'https'
base_url = 'www.zhrczp.com'

# Browser-like request headers installed as the client's defaults.
# Long values are split across adjacent literals; the concatenated strings are
# exactly the values the site is sent.
default_headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': (
        'https://www.zhrczp.com/member/index.php'
        '?c=resume&jobin=76&jobclass_search=76'
        '&cityin=&cityclass_search=&keyword='
        '&minsalary=&maxsalary=&minage=&maxage='
        '&exp=&edu=&uptime=&sex=&type='
    ),
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/134.0.0.0 Safari/537.36'
    ),
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
|
|
# Job-category ids whose resume search results are crawled.
# Ids noted by the original author: 76 caregiver; 46 hospital nursing
# (search-only category); 405 accountant; 43 finance; 19 driver / taxi driver;
# 249 electrician.
# NOTE(review): the meaning of 53 is not recorded here; the export filename
# suggests "waiter" - confirm against the site.
jobclass_search_list = [53]

# Shared HTTP client used by every request in this script.
# NOTE(review): MR is presumably provided by the star import from
# Requests_Except - confirm.
Requests = MR(base_url, protocol)
# Install the browser-like headers as the defaults for this client.
Requests.set_default_headers(default_headers)
|
|
|
|
# Column-oriented accumulator for the export: one independent, initially empty
# list per output column. Key order fixes the column order of the DataFrame
# built from it.
pd_data = {
    column: []
    for column in (
        'resume_id',
        '姓名',
        '年龄',
        '身高',
        '体重',
        '工作经验',
        '最高学历',
        '婚姻状态',
        '民族',
        '现居住地',
        '更新时间',
        '工作职能',
        '意向岗位',
        '从事行业',
        '期望薪资',
        '到岗时间',
        '工作性质',
        '求职状态',
        '工作地点',
    )
}
|
|
|
|
|
|
def get_search_page(keyword_id, page):
    """Request one page of the resume search for a job category.

    Args:
        keyword_id: Job-category id; sent as both ``jobin`` and
            ``jobclass_search``.
        page: Page number of the result list.

    Returns:
        Tuple ``(text, status_code)`` read from the response object.
    """
    category = str(keyword_id)
    # Every other search filter is submitted blank.
    blank_filters = dict.fromkeys(
        (
            'cityin', 'cityclass_search', 'keyword',
            'minsalary', 'maxsalary', 'minage', 'maxage',
            'exp', 'edu', 'uptime', 'sex', 'type',
        ),
        '',
    )
    query = {
        'c': 'resume',
        'jobin': category,
        'jobclass_search': category,
        **blank_filters,
        'page': str(page),
    }
    response = Requests.get('/member/index.php', params=query)
    return response.text, response.status_code
|
|
|
|
|
|
def get_resume_list():
    """Collect resume ids from the search pages of every configured category.

    For each id in ``jobclass_search_list`` pages 1 through 5 are requested.
    Paging for a category stops early as soon as a page's HTML equals the
    previously fetched page (presumably the site serves the same page again
    once the results run out - confirm).

    Returns:
        List of resume-id strings. Each id appears once, in the order it was
        first encountered, even when it is listed on several pages or under
        several categories.
    """
    # A dict is used as an insertion-ordered set so duplicates across pages
    # and categories are dropped without shuffling the ids.
    seen_ids = {}
    previous_page_html = ''
    for keyword_id in jobclass_search_list:
        for page in range(1, 6):
            html, status_code = get_search_page(keyword_id, page)
            if status_code != 200:
                # Diagnostic only; the page body is still examined below.
                print(status_code, type(status_code))
            if html == previous_page_html:
                print('切换下一类型')
                break
            previous_page_html = html

            for resume_id in re.findall(r"com_lookresume_check\('(.+?)','1'\)", html):
                seen_ids.setdefault(resume_id, None)
    return list(seen_ids)
|
|
|
|
|
|
def get_resumeInfo(resume_id):
    """Fetch the detail page of a single resume.

    Args:
        resume_id: Resume id; sent as the ``eid`` query parameter.

    Returns:
        The result of calling ``xpath()`` with no arguments on the response.
        NOTE(review): presumably a parsed document exposing ``.xpath(expr)``,
        since that is how ``info_to_dict`` consumes it - confirm against
        Requests_Except.
    """
    detail_response = Requests.get(
        '/member/index.php',
        params={
            'c': 'hr',
            'act': 'resumeInfo',
            'eid': str(resume_id),
            'state': '',
            'from': 'rck',
        },
    )
    return detail_response.xpath()
|
|
|
|
|
|
def extract_info(item):
    """Extract basic personal attributes from a resume's summary fragments.

    Args:
        item: Iterable of text fragments, e.g. ``['25岁', '165cm', '已婚']``.
            A single string is treated as the whole text, and ``None`` or an
            empty value yields all-empty results.

    Returns:
        dict with the keys ``age``, ``height``, ``weight``, ``experience``,
        ``education``, ``marital`` and ``ethnic``. Each value is the text
        captured by the first match of the field's pattern, or ``''`` when
        the pattern does not occur.
    """
    if not item:
        text = ''
    elif isinstance(item, str):
        # Joining a bare string would put a space between every character and
        # break all multi-character patterns, so use it as-is.
        text = item
    else:
        text = " ".join(item)

    # (result key, pattern, regex flags); group 1 is the reported value.
    field_patterns = (
        # Age: two digits followed by the "years old" suffix.
        ('age', r'(\d{2})岁', 0),
        # Height in centimetres.
        ('height', r'(\d{2,3})\s*cm', re.IGNORECASE),
        # Weight in kilograms (either unit spelling).
        ('weight', r'(\d{2,3})\s*(kg|公斤)', re.IGNORECASE),
        # Work experience phrases (none / under 1 year / ranges / "N+ years").
        (
            'experience',
            r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)',
            0,
        ),
        # Highest education level.
        ('education', r'(初中|高中|中专|大专|本科|硕士|博士)', 0),
        # Marital status (married / unmarried).
        ('marital', r'(已婚|未婚)', 0),
        # Ethnic group, matched by its leading character(s).
        ('ethnic', r'(汉|满|回|壮|蒙古)', 0),
    )

    extracted = {}
    for field, pattern, flags in field_patterns:
        match = re.search(pattern, text, flags)
        extracted[field] = match.group(1) if match else ''
    return extracted
|
|
|
|
def _first_stripped(nodes):
    """Return the first entry of *nodes* stripped of whitespace, or ``''``."""
    return nodes[0].strip() if nodes else ''


def info_to_dict(info):
    """Convert a parsed resume detail page into one flat export record.

    Args:
        info: Object exposing ``xpath(expression)`` that returns a list of
            text nodes (the value produced by ``get_resumeInfo``).

    Returns:
        dict keyed by the Chinese column names used for the export (name,
        age, height, weight, experience, education, marital status, ethnic
        group, current residence, update time and the job-intention fields).
        Fields that are absent from the page are ``''``.
    """

    def labelled_item(label):
        # Text of the <li> whose <span> caption contains *label*.
        return _first_stripped(
            info.xpath(f'//li[span[contains(text(),"{label}")]]/text()')
        )

    name = info.xpath('//span[@class="hr_resume_username"]/text()')

    # Basic info line: age, height, weight, experience, education, marital
    # status, ethnic group, current residence - separated by "·".
    parts_raw = info.xpath('//div[@class="hr_resume_info"]/text()')
    # The ethnic group (e.g. "汉") may sit in a nested <span> instead.
    extra_span = info.xpath('//div[@class="hr_resume_info"]/span/text()')

    # Always work with a list so the append below cannot hit a str.
    parts = []
    if parts_raw:
        cleaned = re.sub(r'\s+', ' ', parts_raw[0]).strip()
        parts = [p.strip() for p in cleaned.split('·') if p.strip()]
    if extra_span:
        parts.append(extra_span[0].strip())

    # Current residence: the fragment carrying the "现居" marker. It is
    # usually the last text fragment, but the <span> text appended above may
    # follow it, so scan from the end instead of checking only parts[-1].
    current_location = ''
    for index in range(len(parts) - 1, -1, -1):
        if '现居' in parts[index]:
            current_location = parts.pop(index)
            break

    # Update time.
    update_time = info.xpath('//span[@class="hr_resume_time_l "]/text()')
    parts_dict = extract_info(parts)

    # Job-intention section.
    job_funcs = info.xpath('//span[@class="yun_newedition_yx_job"]/text()')

    return {
        '姓名': _first_stripped(name),
        '年龄': parts_dict.get('age', ''),
        '身高': parts_dict.get('height', ''),
        '体重': parts_dict.get('weight', ''),
        '工作经验': parts_dict.get('experience', ''),
        '最高学历': parts_dict.get('education', ''),
        '婚姻状态': parts_dict.get('marital', ''),
        '民族': parts_dict.get('ethnic', ''),
        '现居住地': current_location.replace('现居', '').strip(),
        # The first three characters are dropped before stripping.
        # NOTE(review): presumably a fixed label prefix - confirm on the page.
        '更新时间': update_time[0][3:].strip() if update_time else '',
        # Job intention
        '工作职能': ', '.join([j.strip() for j in job_funcs]),
        '意向岗位': labelled_item('意向岗位'),
        '从事行业': labelled_item('从事行业'),
        '期望薪资': labelled_item('期望薪资'),
        '到岗时间': labelled_item('到岗时间'),
        '工作性质': labelled_item('工作性质'),
        '求职状态': labelled_item('求职状态'),
        '工作地点': labelled_item('工作地点'),
    }
|
|
|
|
|
|
def get_cookies(username='18713831026', password='18713831026'):
    """Submit the login form and return the resulting session cookies.

    Args:
        username: Account name sent in the login form. Defaults to the
            account this script has always used.
        password: Password sent in the login form. Defaults to the value
            this script has always used.

    Returns:
        The value of ``cookies_dict()`` on the login response; the caller
        passes it to ``Requests.set_default_cookies``.
    """
    # NOTE(review): the defaults are live credentials committed to source.
    # They are kept only so existing no-argument callers keep working; pass
    # them in from configuration and rotate the account when possible.
    login_form = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': username,
        'password': password,
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post('/login/c_loginsave.html', data=login_form)
    return resp.cookies_dict()
|
|
|
|
|
|
def data_integration():
    """Crawl every listed resume and write the collected rows to an Excel file.

    If the first search yields no resume ids, the script logs in, installs the
    returned cookies on the shared client and searches once more. Each resume's
    fields are printed and appended to the module-level ``pd_data`` columns,
    which are then exported to a timestamped ``.xlsx`` file.
    """
    resume_ids = get_resume_list()
    if not resume_ids:
        # Nothing found: log in and retry the search once.
        Requests.set_default_cookies(get_cookies())
        resume_ids = get_resume_list()

    for resume_id in resume_ids:
        record = info_to_dict(get_resumeInfo(resume_id))
        for column, cell in record.items():
            print(column, cell)
            pd_data[column].append(cell)
        pd_data['resume_id'].append(resume_id)

    pd.DataFrame(pd_data).to_excel(
        f'遵化_{datetime.now():%Y%m%d%H%M%S}_服务员.xlsx',
        index=False,
    )
|
|
|
|
|
|
# Script entry point: run the full crawl-and-export pipeline when executed
# directly (not when imported).
if __name__ == '__main__':
    data_integration()
    # Manual debugging calls, left disabled:
    # print(get_resumeInfo('34735'))
    # get_cookies()
|