Add ZunHuaComSpider and CompanySavePipeline to implement company data extraction and database insertion logic
parent f153c6d250
commit 45b281e2d7
@@ -117,3 +117,17 @@ class YTSavePipeline:
         return item
 
 
+class CompanySavePipeline:
+    def process_item(self, item, spider):
+        if spider.name not in ['zhrczp_com_compary']:
+            return item
+        company_name = item.get("company_name")
+        if not company_name:
+            raise DropItem("⚠️ company_name missing, item dropped")
+
+        try:
+            DB.insert_company(item)
+        except Exception as e:
+            spider.logger.warning(f"❌ insert failed: company_name={company_name}, error={e}")
+
+        return item
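
A quick way to sanity-check the drop path in CompanySavePipeline without touching the database is to call process_item directly. The stub spider below is hypothetical, and DropItem is assumed to be imported near the top of pipelines.py (outside this hunk):

    import logging

    from scrapy.exceptions import DropItem

    from TS_resume_spider.pipelines import CompanySavePipeline

    class StubSpider:
        # Hypothetical stand-in exposing only what process_item touches.
        name = 'zhrczp_com_compary'
        logger = logging.getLogger('zhrczp_com_compary')

    try:
        CompanySavePipeline().process_item({}, StubSpider())
    except DropItem as exc:
        print(exc)  # items without company_name never reach DB.insert_company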
@@ -57,6 +57,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
+    'TS_resume_spider.pipelines.CompanySavePipeline': 600,
 }
 
 # Set the output file encoding to avoid garbled Chinese characters
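
Note on ordering: Scrapy invokes item pipelines in ascending priority value, so CompanySavePipeline (600) runs last, after YTSpiderPipeline (300) and YTSavePipeline (500). Because it returns items unchanged for every spider other than zhrczp_com_compary, the existing resume flow is unaffected by the new entry.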
TS_resume_spider/spiders/zhrczp_com_compary.py (new file, 86 lines)
@@ -0,0 +1,86 @@
+from typing import Iterable
+
+import scrapy
+from lxml import etree
+
+
+def extract_company_data(xpathobj):
+    """Extract company information from an etree.HTML object; returns a dict or None."""
+    def first_or_empty(path):
+        lst = xpathobj.xpath(path)
+        return lst[0].strip() if lst else ""
+
+    name = first_or_empty('//h1/a/text()')
+    # Company introduction paragraphs
+    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
+    introduction = "\r\n".join(intro_list)
+
+    # Skip the page if the name or the introduction is missing
+    if not (name and introduction):
+        return None
+
+    # Company detail fields
+    info = [t.strip() for t in xpathobj.xpath('//div[@class="com_details_info"]/text()') if t.strip()]
+    category = info[1] if len(info) > 1 else ""
+    company_type = info[2] if len(info) > 2 else ""
+    size = info[3] if len(info) > 3 else ""
+    founded_date = info[4] if len(info) > 4 else ""
+
+    # Company benefits
+    benefits = [b.strip() for b in xpathobj.xpath('//div[@class="com_welfare "]/span/text()') if b.strip()]
+    benefits_str = " | ".join(benefits)
+
+    address = first_or_empty('//div[@class="com_details_tel_me"]/div/text()')
+
+    return {
+        "name": name,
+        "category": category,
+        "size": size,
+        "company_type": company_type,
+        "founded_date": founded_date,
+        "introduction": introduction,
+        "address": address,
+        "benefits": benefits_str,
+        "website": 1,
+    }
+
+
+class ZunHuaComSpider(scrapy.Spider):
+    name = 'zhrczp_com_compary'
+    allowed_domains = ['zhrczp.com']
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+    }
+
+    def start_requests(self) -> Iterable[scrapy.Request]:
+        for page in range(1, 100000):
+            url = f"https://www.zhrczp.com/company/{page}.html"
+            yield scrapy.Request(
+                url=url,
+                method='GET',
+                headers=self.headers,
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        # Parse the page with lxml
+        xpathobj = etree.HTML(response.text)
+        # Shared extraction helper
+        company_data = extract_company_data(xpathobj)
+        if company_data:
+            yield company_data
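
Because extract_company_data takes a bare etree.HTML document rather than a Scrapy response, it can be exercised offline; the spider itself runs with the usual `scrapy crawl zhrczp_com_compary`. A minimal sketch against invented markup (the HTML below is hypothetical, written only to match the helper's selectors):

    from lxml import etree

    from TS_resume_spider.spiders.zhrczp_com_compary import extract_company_data

    # Invented markup mirroring the selectors the helper expects.
    html = (
        '<h1><a>Example Co.</a></h1>'
        '<div class="company_img_auto"><p>About the company.</p></div>'
    )
    data = extract_company_data(etree.HTML(html))
    print(data["name"])          # -> Example Co.
    print(data["introduction"])  # -> About the company.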
@@ -71,3 +71,22 @@ class DB:
         """
 
         cls._client.execute(sql, list(safe_data.values()))
+
+    @classmethod
+    def insert_company(cls, data: dict):
+        if cls._client is None:
+            raise RuntimeError("Database not initialized. Call DB.init() first.")
+
+        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
+        if 'name' not in safe_data or 'website' not in safe_data:
+            return
+
+        table = 'companies_company'
+        keys = ', '.join(safe_data.keys())
+        holders = ', '.join(['%s'] * len(safe_data))
+        updates = ', '.join([f"{k}=VALUES({k})" for k in safe_data if k not in ('name', 'website')])
+        sql = (
+            f"INSERT INTO {table} ({keys}) VALUES ({holders}) "
+            f"ON DUPLICATE KEY UPDATE {updates}"
+        )
+        cls._client.execute(sql, list(safe_data.values()))
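
For illustration, with a hypothetical item the upsert insert_company builds looks like this (assuming companies_company has a unique key covering name and website, which would explain why those two columns are excluded from the update list):

    # Hypothetical input:
    data = {"name": "Example Co.", "website": 1, "address": "Zunhua"}
    # keys    -> "name, website, address"
    # holders -> "%s, %s, %s"
    # updates -> "address=VALUES(address)"
    # sql     -> INSERT INTO companies_company (name, website, address)
    #            VALUES (%s, %s, %s)
    #            ON DUPLICATE KEY UPDATE address=VALUES(address)

MySQL still accepts the VALUES() form inside ON DUPLICATE KEY UPDATE, though it has been deprecated since 8.0.20 in favor of row aliases.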