from typing import Iterable
|
|
import scrapy
|
|
from lxml import etree
|
|
|
|
|
|
|
|
def extract_company_data(xpathobj):
    """Extract company information from an etree.HTML tree.

    Returns a dict of company fields, or None when the page carries
    neither a company name nor an introduction.
    """

    def first_text(path):
        # First matching text node, stripped; "" when nothing matches.
        hits = xpathobj.xpath(path)
        if not hits:
            return ""
        return hits[0].strip()

    def clean_texts(path):
        # All matching text nodes, stripped, with blank entries dropped.
        return [text.strip() for text in xpathobj.xpath(path) if text.strip()]

    name = first_text('//h1/a/text()')

    # Introduction paragraphs, joined with CRLF to keep line structure.
    introduction = "\r\n".join(
        clean_texts('//div[@class="company_img_auto"]/p/text()')
    )

    # Pages lacking either a name or an introduction are ignored.
    if not name or not introduction:
        return None

    # Company detail fields sit at fixed positions in the info column;
    # any missing slot degrades to "".
    info = clean_texts('//div[@class="com_details_info"]/text()')

    def info_at(index):
        return info[index] if index < len(info) else ""

    # Company benefits (note: the trailing space in "com_welfare " matches
    # the site's actual class attribute).
    benefits_str = " | ".join(
        clean_texts('//div[@class="com_welfare "]/span/text()')
    )

    address = first_text('//div[@class="com_details_tel_me"]/div/text()')

    return {
        "name": name,
        "category": info_at(1),
        "size": info_at(3),
        "company_type": info_at(2),
        "founded_date": info_at(4),
        "introduction": introduction,
        "address": address,
        "benefits": benefits_str,
        "website": 1,
    }
|
|
|
|
|
|
|
|
class ZunHuaComSpider(scrapy.Spider):
    """Crawl company profile pages on zhrczp.com and yield company dicts.

    Pages are enumerated by sequential numeric id; each response is parsed
    with lxml and handed to :func:`extract_company_data`.
    """

    name = 'zhrczp_com_compary'
    allowed_domains = ['zhrczp.com']
    # Browser-like request headers so the site serves the normal HTML page.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[scrapy.Request]:
        """Yield one request per numeric company id.

        The upper bound 100000 is a hard cap on how many ids are probed;
        ids beyond the site's real range will simply 404.
        """
        for page in range(1, 100000):
            url = f"https://www.zhrczp.com/company/{page}.html"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                callback=self.parse,
            )

    def parse(self, response):
        """Parse one company page and yield its data dict, if any.

        :param response: the scrapy HTTP response for a company page.
        """
        # Parse with lxml. etree.HTML returns None for empty or
        # unparsable input, which would crash extract_company_data with
        # an AttributeError — guard against it and skip such pages.
        xpathobj = etree.HTML(response.text)
        if xpathobj is None:
            return
        # Shared extraction helper; returns None for pages without
        # a usable name/introduction.
        company_data = extract_company_data(xpathobj)
        if company_data:
            yield company_data