import re

import scrapy
from lxml import etree
def first_or_empty(xpobj, path):
    """Return the first xpath match for *path*, stripped, or "" when absent.

    Args:
        xpobj: any object exposing an ``xpath(path)`` method (lxml element).
        path: the XPath expression to evaluate.
    """
    matches = xpobj.xpath(path)
    if not matches:
        return ""
    return matches[0].strip()
def extract_position_data(xpathobj):
    """Extract one job-position record from a parsed job-detail page.

    Args:
        xpathobj: lxml element for the detail page (result of ``etree.HTML``).

    Returns:
        dict of position fields, or ``None`` when the page has no title or
        no company name (e.g. an expired or malformed listing).
    """
    title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
    if not title:
        return None

    nature = "全职"  # site section only carries full-time listings
    category = first_or_empty(xpathobj, '//div[@class="job_details_category"]/text()')

    # Info line is either [region, experience, education] or, for
    # fresh-graduate posts, a 4th tag that we fold into education.
    region_info = [t.strip() for t in xpathobj.xpath('//div[@class="job_details_info"]/text()') if t.strip()]
    region = ""
    experience = ""
    education = ""
    if len(region_info) == 3:
        region, experience, education = region_info
    elif len(region_info) == 4 and "应届" in region_info[3]:
        region = region_info[0]
        experience = region_info[1]
        education = region_info[2] + " " + region_info[3]

    salary = first_or_empty(xpathobj, '//span[@class="job_details_salary_n"]/text()')
    position_status = 1  # records scraped from live pages are marked active

    description_list = xpathobj.xpath('//div[@class="job_details_describe"]/text()')
    description = "\r\n".join(d.strip() for d in description_list if d.strip())

    contact_name = first_or_empty(xpathobj, '//span[@class="job_details_touch_username"]/text()')
    contact_info = first_or_empty(xpathobj, '//span[@class="job_details_touch_tel_n"]/text()')

    # NOTE: the trailing space inside the class value matches the site's markup.
    benefits_list = xpathobj.xpath('//div[@class="job_details_welfare "]/span/text()')
    benefits = " | ".join(b.strip() for b in benefits_list if b.strip())

    # Number of openings defaults to 1 unless the page states a figure.
    openings = 1
    openings_str = first_or_empty(xpathobj, '//span[@class="job_details_describe_yq"]/text()')
    if openings_str:
        nums = re.findall(r"(\d+)", openings_str)
        if nums:
            openings = int(nums[0])

    company_name = first_or_empty(xpathobj, '//div[@class="Compply_right_name"]/a/text()')
    if not company_name:
        # Title was already validated above; only company name can fail here.
        return None

    return {
        "title": title,
        "nature": nature,
        "category": category,
        "region": region,
        "experience": experience,
        "education": education,
        "salary": salary,
        "position_status": position_status,
        "description": description,
        "contact_name": contact_name,
        "contact_info": contact_info,
        "benefits": benefits,
        "openings": openings,
        "website_id": 1,
        "company_name": company_name,
    }
def get_position_href(xpathobj):
    """Return stripped, non-empty detail-page hrefs from a listing page.

    Args:
        xpathobj: lxml element for the listing page (result of ``etree.HTML``).

    Returns:
        list[str] of href values with surrounding whitespace removed;
        blank entries are dropped.
    """
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    return [href.strip() for href in hrefs if href.strip()]
class ZunHuaComSpider(scrapy.Spider):
    """Spider for job listings on zhrczp.com (遵化人才网).

    Crawls the paginated listing pages, follows each position's detail
    link, and yields one dict per position (see extract_position_data).
    """

    name = 'zhrczp_com_position'
    allowed_domains = ['zhrczp.com', 'www.zhrczp.com']

    # Browser-like headers; the site appears to expect a regular Chrome UA.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self):
        """Emit listing-page requests. NOTE(review): range(1, 2) crawls only
        page 1 — presumably intentional for testing; widen for a full crawl."""
        for page in range(1, 2):
            yield scrapy.Request(
                url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
                headers=self.headers,
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Parse a listing page and schedule a request per position link."""
        self.logger.info("Parsing page: %s", response.url)
        xpathobj = etree.HTML(response.text)
        for href in get_position_href(xpathobj):
            yield scrapy.Request(
                # urljoin is a no-op for absolute hrefs and fixes relative ones.
                url=response.urljoin(href),
                headers=self.headers,
                callback=self.parse_position,
                dont_filter=True,
            )

    def parse_position(self, response):
        """Parse a position detail page and yield its extracted record."""
        self.logger.info("Parsing position: %s", response.url)
        xpath_object = etree.HTML(response.text)
        position_data = extract_position_data(xpath_object)
        if position_data:
            self.logger.info(position_data)
            yield position_data