# Crawler/web/zhrczp_com/get_position.py

import re
import sys, os

from lxml.html.diff import href_token

# Resolve the directory two levels above this file's directory and put it at
# the front of sys.path (only if it is not already listed), presumably so the
# `web.Requests_Except` import below resolves when this file is run directly
# as a script.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from web.Requests_Except import MR

# Host name and URL scheme handed to the project-local MR request helper.
base_url = 'www.zhrczp.com'
protocol = 'https'
# Browser-like request headers; the User-Agent and sec-ch-ua values identify
# the client as Chrome 134 on Windows.
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
# Module-wide request client shared by every function in this file.
# NOTE(review): MR is defined in web.Requests_Except; presumably
# set_default_headers() applies these headers to every request — confirm there.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)


def get_position_page(page: int):
    """Download one page of the job listing and return its XPath-queryable form.

    Args:
        page: Page number substituted into the last numeric slot of the
            listing URL.

    Returns:
        Whatever ``.xpath()`` on the response of ``Requests.get`` yields.
        NOTE(review): presumably a parsed HTML tree exposing ``.xpath(expr)``,
        since that is how ``get_position`` uses it — confirm in
        ``web.Requests_Except``.
    """
    listing_url = f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html"
    return Requests.get(listing_url, timeout=10).xpath()


def _first_text(page, expression: str) -> str:
    """Return the first match of *expression* on *page*, stripped, or "" if none."""
    matches = page.xpath(expression)
    return matches[0].strip() if matches else ""


def _split_region_info(page) -> tuple:
    """Return ``(region, experience, education)`` read from the job info line.

    Every field falls back to "" when the info block is missing or has an
    unexpected number of text fragments, so a record can always be built.
    """
    fragments = [text.strip() for text in page.xpath('//div[@class="job_details_info"]/text()')]
    if not fragments:
        return "", "", ""
    if len(fragments) == 3:
        return fragments[0], fragments[1], fragments[2]
    # A fourth fragment mentioning "应届" is folded into the education field.
    if len(fragments) == 4 and "应届" in fragments[3]:
        return fragments[0], fragments[1], fragments[2] + " " + fragments[3]
    # Unknown layout: dump it for inspection and leave the fields blank.
    print(fragments)
    return "", "", ""


def _parse_openings(page) -> int:
    """Return the number of openings stated on the page, defaulting to 1."""
    raw = _first_text(page, '//span[@class="job_details_describe_yq"]/text()')
    numbers = re.findall(r"(\d+)", raw)
    # No digits at all (or no such element) means the count is unspecified.
    return int(numbers[0]) if numbers else 1


def _parse_position_detail(page) -> dict:
    """Build one position record from a parsed job-detail page."""
    region, experience, education = _split_region_info(page)

    description_parts = page.xpath('//div[@class="job_details_describe"]/text()')
    description = "\r\n".join(part.strip() for part in description_parts if part.strip())

    # The trailing space inside the class value is kept exactly as written;
    # presumably it mirrors the site's markup — TODO confirm.
    benefit_parts = page.xpath('//div[@class="job_details_welfare "]/span/text()')
    benefits = " | ".join(part.strip() for part in benefit_parts if part.strip())

    return {
        "title": _first_text(page, '//h1[@class="job_details_name"]/text()'),
        "nature": "全职",
        "category": "",
        "region": region,
        "experience": experience,
        "education": education,
        "salary": _first_text(page, '//span[@class="job_details_salary_n"]/text()'),
        "position_status": 1,
        "description": description,
        "contact_name": _first_text(page, '//span[@class="job_details_touch_username"]/text()'),
        "contact_info": _first_text(page, '//span[@class="job_details_touch_tel_n"]/text()'),
        "benefits": benefits,
        "openings": _parse_openings(page),  # defaults to 1 when not stated
        "website_id": 1,
        "company_name": _first_text(page, '//div[@class="Compply_right_name"]/a/text()'),
    }


def get_position(xpathobj) -> list:
    """Fetch and parse every job detail page linked from a listing page.

    Args:
        xpathobj: Parsed listing page exposing ``.xpath(expr)``, as returned
            by ``get_position_page``.

    Returns:
        A list with one dict per job link, in page order. Each dict has the
        keys ``title``, ``nature``, ``category``, ``region``, ``experience``,
        ``education``, ``salary``, ``position_status``, ``description``,
        ``contact_name``, ``contact_info``, ``benefits``, ``openings``,
        ``website_id`` and ``company_name``. The list is empty when the
        listing contains no usable links.

    NOTE(review): errors raised by ``Requests.get`` are not handled here and
    propagate to the caller — confirm that MR's own retry/except behavior is
    sufficient.
    """
    positions = []
    for href in xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href"):
        href = href.strip()
        if not href:
            continue
        # Keep the detail page in its own name so the listing page passed in
        # by the caller is never rebound mid-loop.
        detail_page = Requests.get(href, timeout=10).xpath()
        positions.append(_parse_position_detail(detail_page))
    return positions


if __name__ == '__main__':
    # Script entry point: fetch the first listing page, then walk its job links.
    first_listing = get_position_page(1)
    get_position(first_listing)