# Crawler/web/zhrczp_com/main.py

from datetime import datetime
import re

import pandas as pd

from Requests_Except import *

# Host and URL scheme handed to the MR client constructed below.
base_url = 'www.zhrczp.com'
protocol = 'https'
# Browser-like request headers (the User-Agent identifies as Chrome 134 on
# Windows); installed as the client's default headers further down. The Referer
# is a resume-search URL for job class 76.
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Job-category ids to crawl; get_resume_list() iterates over this list.
# Ids noted by the original author: 76 = caregiver; 46 = hospital nursing
# (noted as a search-only type); 405 = accountant; 43 = finance;
# 19 = driver / taxi driver; 249 = electrician.
jobclass_search_list = [53]

# Shared HTTP client used by every request in this module. MR is not defined in
# this file — presumably it comes from the Requests_Except star import (confirm).
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)

# Column-oriented accumulator for the exported spreadsheet: one independent,
# initially empty list per output column, in export order. data_integration()
# appends one value to every column for each resume it processes.
pd_data = {
    column: []
    for column in (
        'resume_id',
        '姓名',
        '年龄',
        '身高',
        '体重',
        '工作经验',
        '最高学历',
        '婚姻状态',
        '民族',
        '现居住地',
        '更新时间',
        '工作职能',
        '意向岗位',
        '从事行业',
        '期望薪资',
        '到岗时间',
        '工作性质',
        '求职状态',
        '工作地点',
    )
}


def get_search_page(keyword_id, page):
    """Fetch one page of resume search results for a job category.

    Args:
        keyword_id: job-category id; sent as both ``jobin`` and
            ``jobclass_search``.
        page: page number to request.

    Returns:
        tuple: ``(resp.text, resp.status_code)`` of the response.
    """
    category = str(keyword_id)
    # Filters the site expects in the query string but which are left blank.
    blank_filters = (
        'cityin', 'cityclass_search', 'keyword', 'minsalary', 'maxsalary',
        'minage', 'maxage', 'exp', 'edu', 'uptime', 'sex', 'type',
    )
    query = {'c': 'resume', 'jobin': category, 'jobclass_search': category}
    query.update(dict.fromkeys(blank_filters, ''))
    query['page'] = str(page)

    response = Requests.get('/member/index.php', params=query)
    return response.text, response.status_code


def get_resume_list(max_pages=5):
    """Collect resume ids from the search results of every configured category.

    For each id in ``jobclass_search_list`` the result pages 1..max_pages are
    fetched. Paging for a category stops early as soon as a page's HTML is
    identical to the previously fetched page.

    Ids are de-duplicated across all pages and categories and returned in
    first-seen order, so a resume that shows up on several pages is fetched
    and exported only once and the output order is deterministic.

    Args:
        max_pages: highest page number requested per category (default 5,
            the previously hard-coded limit).

    Returns:
        list: unique resume id strings in the order they were first seen.
    """
    # A dict is used as an insertion-ordered set of ids.
    unique_ids = {}
    previous_page_html = ''
    for keyword_id in jobclass_search_list:
        for page in range(1, max_pages + 1):
            html, rest_code = get_search_page(keyword_id, page)
            if rest_code != 200:
                # Diagnostic only: the page is still processed below.
                print(rest_code, type(rest_code))
            if html == previous_page_html:
                # Same HTML as the last fetched page: move to the next category.
                print('切换下一类型')
                break
            previous_page_html = html

            for resume_id in re.findall(r"com_lookresume_check\('(.+?)','1'\)", html):
                unique_ids.setdefault(resume_id, None)
    return list(unique_ids)


def get_resumeInfo(resume_id):
    """Request the detail page of one resume.

    Args:
        resume_id: resume identifier; sent as the ``eid`` query parameter.

    Returns:
        The result of calling ``xpath()`` with no arguments on the response —
        presumably a parsed document that supports XPath queries (confirm
        against the Requests_Except response type).
    """
    response = Requests.get(
        '/member/index.php',
        params={
            'c': 'hr',
            'act': 'resumeInfo',
            'eid': str(resume_id),
            'state': '',
            'from': 'rck',
        },
    )
    return response.xpath()


def extract_info(item):
    """Pull basic personal attributes out of a resume's summary fragments.

    The fragments in *item* are joined with single spaces and each attribute
    is taken from the first match of its pattern in that text. Patterns are
    unanchored, so a match may come from anywhere in the joined text.

    Args:
        item: iterable of strings (an empty string or empty list is accepted).

    Returns:
        dict: keys ``age``, ``height``, ``weight``, ``experience``,
        ``education``, ``marital`` and ``ethnic``; each value is the matched
        text of the pattern's first group, or ``''`` when nothing matched.
    """
    joined = " ".join(item)

    # (result key, pattern, regex flags) in output order.
    rules = (
        ('age', r'(\d{2})岁', 0),
        ('height', r'(\d{2,3})\s*cm', re.IGNORECASE),
        ('weight', r'(\d{2,3})\s*(kg|公斤)', re.IGNORECASE),
        (
            'experience',
            r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)',
            0,
        ),
        ('education', r'(初中|高中|中专|大专|本科|硕士|博士)', 0),
        ('marital', r'(已婚|未婚)', 0),
        ('ethnic', r'(汉|满|回|壮|蒙古)', 0),
    )

    extracted = {}
    for field, pattern, flags in rules:
        found = re.search(pattern, joined, flags)
        extracted[field] = '' if found is None else found.group(1)
    return extracted

def _first_text(nodes):
    """Return the first non-blank string in *nodes*, stripped, or '' if none."""
    for node in nodes:
        stripped = node.strip()
        if stripped:
            return stripped
    return ''


def info_to_dict(info):
    """Convert a parsed resume detail page into a flat dict of export columns.

    Args:
        info: object exposing ``xpath(expr)``; every query used here is
            expected to return a list of strings (text nodes). Presumably the
            value returned by ``get_resumeInfo`` — confirm against caller.

    Returns:
        dict: one entry per spreadsheet column except ``resume_id``; values
        are strings and default to ``''`` when the page lacks the field.
    """
    name = info.xpath('//span[@class="hr_resume_username"]/text()')

    # Basic info line: fragments separated by '·' (age, height, weight,
    # experience, education, marital status, ...). Only the first text node
    # of the block is used.
    parts_raw = info.xpath('//div[@class="hr_resume_info"]/text()')
    # The original author notes that the ethnicity (e.g. "汉") may live in a <span>.
    extra_span = info.xpath('//div[@class="hr_resume_info"]/span/text()')
    parts = []
    if parts_raw and parts_raw[0]:
        cleaned = re.sub(r'\s+', ' ', parts_raw[0]).strip()
        parts = [p.strip() for p in cleaned.split('·') if p.strip()]
        if extra_span:
            parts.append(extra_span[0].strip())

    # Current residence: the fragment containing '现居'. It is usually the last
    # fragment of the text node, but the span text appended above may follow
    # it, so every fragment is checked (from the end) instead of only the last.
    current_location = ''
    for index in range(len(parts) - 1, -1, -1):
        if '现居' in parts[index]:
            current_location = parts.pop(index)
            break

    # Last-updated time. The trailing space in the class value is intentional
    # (kept exactly as in the original query).
    update_time = info.xpath('//span[@class="hr_resume_time_l "]/text()')
    parts_dict = extract_info(parts)

    # Job-intention section.
    job_funcs = info.xpath('//span[@class="yun_newedition_yx_job"]/text()')
    job_titles = info.xpath('//li[span[contains(text(),"意向岗位")]]/text()')
    industry = info.xpath('//li[span[contains(text(),"从事行业")]]/text()')
    salary = info.xpath('//li[span[contains(text(),"期望薪资")]]/text()')
    report_time = info.xpath('//li[span[contains(text(),"到岗时间")]]/text()')
    job_type = info.xpath('//li[span[contains(text(),"工作性质")]]/text()')
    job_status = info.xpath('//li[span[contains(text(),"求职状态")]]/text()')
    location = info.xpath('//li[span[contains(text(),"工作地点")]]/text()')

    # _first_text skips whitespace-only text nodes (e.g. indentation before the
    # label <span>), which would otherwise yield an empty value.
    return {
        '姓名': _first_text(name),
        '年龄': parts_dict.get('age', ''),
        '身高': parts_dict.get('height', ''),
        '体重': parts_dict.get('weight', ''),
        '工作经验': parts_dict.get('experience', ''),
        '最高学历': parts_dict.get('education', ''),
        '婚姻状态': parts_dict.get('marital', ''),
        '民族': parts_dict.get('ethnic', ''),
        '现居住地': current_location.replace('现居', '').strip(),
        # Drops the first three characters — presumably a fixed label prefix
        # in front of the timestamp (confirm against the page markup).
        '更新时间': update_time[0][3:].strip() if update_time else '',
        # Job intention
        '工作职能': ', '.join([j.strip() for j in job_funcs]),
        '意向岗位': _first_text(job_titles),
        '从事行业': _first_text(industry),
        '期望薪资': _first_text(salary),
        '到岗时间': _first_text(report_time),
        '工作性质': _first_text(job_type),
        '求职状态': _first_text(job_status),
        '工作地点': _first_text(location),
    }


def get_cookies(username='18713831026', password='18713831026'):
    """Submit the site's login form and return the resulting cookies.

    Args:
        username: account name posted as ``username``. Defaults to the account
            that was previously hard-coded, so existing callers are unaffected.
        password: password posted as ``password``; same default behaviour.

    Returns:
        Whatever ``cookies_dict()`` of the login response returns — presumably
        a cookie name -> value mapping (confirm against Requests_Except).
    """
    # NOTE(review): the default credentials are committed in source. Prefer
    # passing them in from configuration or the environment, and rotate them.
    form = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': username,
        'password': password,
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post('/login/c_loginsave.html', data=form)
    return resp.cookies_dict()


def data_integration():
    """Crawl all resumes and export them to a timestamped Excel file.

    Collects resume ids; if none are found, logs in via ``get_cookies``,
    installs the cookies as the client's defaults and collects again. Every
    resume is then fetched and parsed, its fields are printed and appended to
    the module-level ``pd_data`` columns, and the result is written with
    pandas ``to_excel`` (no index column).
    """
    resume_ids = get_resume_list()
    if not resume_ids:
        # Empty result: log in, then retry once.
        Requests.set_default_cookies(get_cookies())
        resume_ids = get_resume_list()

    for resume_id in resume_ids:
        record = info_to_dict(get_resumeInfo(resume_id))
        for column, cell in record.items():
            print(column, cell)
            pd_data[column].append(cell)
        pd_data['resume_id'].append(resume_id)

    frame = pd.DataFrame(pd_data)
    output_name = f'遵化_{datetime.now():%Y%m%d%H%M%S}_服务员.xlsx'
    frame.to_excel(output_name, index=False)


# Script entry point: run the full crawl-and-export pipeline when executed directly.
if __name__ == '__main__':
    data_integration()
    # Manual debugging calls, left disabled:
    # print(get_resumeInfo('34735'))
    # get_cookies()