# Crawler/web/zhrczp_com/Download.py

import datetime

import pandas as pd
from lxml import etree
from pathlib import Path
from Requests_Except import *

# Host and scheme handed to the shared HTTP client constructed below.
base_url = 'www.zhrczp.com'
protocol = 'https'
# Browser-like headers installed as the client's defaults, so every request
# issued through `Requests` carries them.
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Module-wide HTTP client used by every request helper in this file.
# NOTE(review): `MR` is not defined here; presumably it is provided by the
# star import from `Requests_Except` — confirm.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)

# Input workbook loaded by `data_integration`; it must contain a `resume_id`
# column. NOTE(review): absolute, machine-specific path.
excelfilepath = r"C:\Users\Franklin_Kali\Documents\WeChat Files\w19960619\FileStorage\File\2025-06\遵化电话.xlsx"
# File name without extension; embedded in the name of the output workbook.
xlsx_name = str(Path(excelfilepath).stem)

def get_search_page(keyword_id, page):
    """Fetch one resume-search result page and extract the resume ids on it.

    Args:
        keyword_id: Job-class id, sent as both ``jobin`` and
            ``jobclass_search``.
        page: Page number of the result listing.

    Returns:
        list[str]: The first argument of every
        ``com_lookresume_check('<id>','1')`` call found in the response text.
    """
    job_class = str(keyword_id)
    query = {'c': 'resume', 'jobin': job_class, 'jobclass_search': job_class}
    # The remaining search filters are sent blank, in the same order the
    # site's own search form uses.
    blank_filters = (
        'cityin', 'cityclass_search', 'keyword', 'minsalary', 'maxsalary',
        'minage', 'maxage', 'exp', 'edu', 'uptime', 'sex', 'type',
    )
    query.update(dict.fromkeys(blank_filters, ''))
    query['page'] = str(page)

    listing = Requests.get('/member/index.php', params=query)
    return re.findall(r"com_lookresume_check\('(.+?)','1'\)", listing.text)


def read_excel2df(file_path):
    """Load the Excel workbook at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the workbook, passed straight to
            ``pandas.read_excel`` with default options.

    Returns:
        pandas.DataFrame: The parsed sheet.
    """
    return pd.read_excel(file_path)


def get_resume_list(df):
    """Return the values of the ``resume_id`` column as a plain list.

    The column is read directly instead of via ``DataFrame.iterrows()``:
    ``iterrows`` builds one Series per row and therefore upcasts mixed
    numeric rows, turning an integer id such as ``101`` into ``101.0`` (and
    later into the request parameter ``'101.0'``).

    Args:
        df: DataFrame holding a ``resume_id`` column.

    Returns:
        list: One entry per row, in row order, with the column's own dtype
        preserved (numeric values come back as Python scalars).

    Raises:
        KeyError: If ``df`` has rows but no ``resume_id`` column.
    """
    # A frame without rows yields an empty list even when the column is
    # missing, matching the behaviour of the previous row-wise loop.
    if len(df.index) == 0:
        return []
    return df['resume_id'].tolist()


def get_cookies(username='18713831026', password='18713831026'):
    """Log in through the site's login endpoint and return the session cookies.

    Args:
        username: Account name submitted in the login form. Defaults to the
            account this script has always used.
        password: Password submitted in the login form. Defaults to the
            password this script has always used.

    Returns:
        Whatever ``cookies_dict()`` of the login response returns
        (presumably a name -> value mapping of cookies; confirm against the
        ``Requests_Except`` wrapper).
    """
    # NOTE(review): the default credentials live in source control; callers
    # should prefer passing them in from configuration or the environment.
    data = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': username,
        'password': password,
        'loginname': '0',
        # Captcha/verification fields are submitted empty.
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post('/login/c_loginsave.html', data=data)
    return resp.cookies_dict()


def get_resumeInfo(resume_id):
    """Request the resume detail page for ``resume_id``.

    Args:
        resume_id: Resume identifier, sent as the ``eid`` query parameter.

    Returns:
        tuple: ``(resp.xpath(), resp.text)`` — the response's ``xpath()``
        result (used by callers as a queryable document) and the raw body
        text.
    """
    query = dict(c='hr', act='resumeInfo', eid=str(resume_id), state='')
    # `from` is a Python keyword, so it cannot be given as a keyword argument.
    query['from'] = 'rck'
    page = Requests.get('/member/index.php', params=query)
    return page.xpath(), page.text


def get_phone(xpath):
    """Extract the phone number from a parsed resume document.

    Args:
        xpath: Object exposing an ``xpath(expr)`` method, e.g. an lxml
            element tree.

    Returns:
        str: The stripped text of the first ``<span>`` found under a
        ``<div>`` whose text contains "手机", or ``''`` when there is none.
    """
    matches = xpath.xpath('//div[contains(text(), "手机")]/span/text()')
    if not matches:
        return ''
    return matches[0].strip()


def get_email(xpath):
    """Extract an e-mail address from a parsed resume document.

    Every text node of the ``<div>`` elements whose text contains "邮箱" is
    searched in document order and the first address-looking match wins.
    Previously only the first text node was searched and a node without an
    address raised ``AttributeError`` (``re.search`` returned ``None``).

    Args:
        xpath: Object exposing an ``xpath(expr)`` method, e.g. an lxml
            element tree.

    Returns:
        str: The first matched e-mail address, or ``''`` when no text node
        contains one.
    """
    nodes = xpath.xpath('//div[contains(text(), "邮箱")]/text()')
    for node in nodes:
        found = re.search(r'[\w\.-]+@[\w\.-]+', node)
        if found:
            return found.group()
    return ''


def post_phone(resume_id):
    """Call the site's ``for_link`` ajax endpoint for one resume.

    Args:
        resume_id: Resume identifier, posted as the ``eid`` form field.

    Returns:
        The decoded JSON body of the response. The caller reads its
        ``msg`` and ``html`` keys.
    """
    reply = Requests.post(
        '/index.php',
        params={'m': 'ajax', 'c': 'for_link'},
        data={'eid': str(resume_id)},
    )
    return reply.json()

def data_integration():
    """Add phone and e-mail columns to the input workbook and save a copy.

    For every ``resume_id`` in the workbook at ``excelfilepath`` the resume
    detail page is fetched and scanned for contact details. When neither a
    phone nor an e-mail is found there, the ``for_link`` ajax endpoint is
    queried and its ``html`` fragment is scanned instead. If that endpoint
    answers that login is required, the session cookies are refreshed and
    the request is repeated once, so the resume that triggered the login is
    not silently left blank.

    The result is written to a new, timestamped ``.xlsx`` file in the current
    working directory; the input workbook is not modified.
    """
    df = read_excel2df(excelfilepath)
    phones = []
    emails = []
    for resume_id in get_resume_list(df):
        # Only the parsed document is needed; the raw page text is unused.
        xobj, _ = get_resumeInfo(resume_id)
        phone = get_phone(xobj)
        email = get_email(xobj)

        if phone == '' and email == '':
            data = post_phone(resume_id)
            if data.get('msg') == '请先登录！':
                # Session missing or expired: log in, then retry once so this
                # resume also gets its contact details.
                Requests.set_default_cookies(get_cookies())
                data = post_phone(resume_id)
            if data.get('html'):
                fragment = etree.HTML(data.get('html'))
                phone = get_phone(fragment)
                email = get_email(fragment)

        phones.append(phone)
        emails.append(email)

    df['phone'] = phones
    df['email'] = emails
    df.to_excel(f'遵化_{datetime.datetime.now():%Y%m%d%H%M%S}_{xlsx_name}_p.xlsx', index=False)

# Run the enrichment pass only when this file is executed as a script.
if __name__ == '__main__':
    data_integration()