diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py
index ea9b5f5..b7b9a3a 100644
--- a/TS_resume_spider/pipelines.py
+++ b/TS_resume_spider/pipelines.py
@@ -117,3 +117,27 @@ class YTSavePipeline:
         return item


+class CompanySavePipeline:
+    """Persist company items scraped by the ``zhrczp_com_compary`` spider."""
+
+    def process_item(self, item, spider):
+        """Write one company item to the database.
+
+        Items from any other spider are passed through untouched. An item
+        without a company name is dropped. A failed database write is
+        logged as a warning and the item still continues down the pipeline.
+        """
+        if spider.name not in ['zhrczp_com_compary']:
+            return item
+        # The spider emits the company name under "name" (the key
+        # DB.insert_company requires); "company_name" is kept as a fallback.
+        company_name = item.get("name") or item.get("company_name")
+        if not company_name:
+            raise DropItem("⚠️ company_name 缺失,已丢弃")
+
+        try:
+            DB.insert_company(item)
+        except Exception as e:
+            spider.logger.warning(f"❌ 写入失败:company_name={company_name}, 错误={e}")
+
+        return item
\ No newline at end of file
diff --git a/TS_resume_spider/settings.py b/TS_resume_spider/settings.py
index 0d9e979..19e6cad 100644
--- a/TS_resume_spider/settings.py
+++ b/TS_resume_spider/settings.py
@@ -57,6 +57,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
+    'TS_resume_spider.pipelines.CompanySavePipeline': 600,
 }

 # 设置输出文件编码,防止中文乱码
diff --git a/TS_resume_spider/spiders/zhrczp_com_compary.py b/TS_resume_spider/spiders/zhrczp_com_compary.py
new file mode 100644
index 0000000..040acd4
--- /dev/null
+++ b/TS_resume_spider/spiders/zhrczp_com_compary.py
@@ -0,0 +1,96 @@
+from typing import Iterable
+
+import scrapy
+from lxml import etree
+
+
+def extract_company_data(xpathobj):
+    """Extract company fields from a parsed page.
+
+    ``xpathobj`` must expose ``xpath(path)`` returning a list of strings
+    (e.g. the tree returned by ``etree.HTML``). Returns a dict of company
+    fields, or ``None`` when the page has no company name or introduction.
+    """
+    def first_or_empty(path):
+        """Return the first stripped match for ``path``, or "" if none."""
+        lst = xpathobj.xpath(path)
+        return lst[0].strip() if lst else ""
+
+    name = first_or_empty('//h1/a/text()')
+    # Company introduction paragraphs.
+    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
+    introduction = "\r\n".join(intro_list)
+
+    # Skip pages that lack a name or an introduction.
+    if not (name and introduction):
+        return None
+
+    # Company detail fields are read by position; index 0 is not used.
+    info = [t.strip() for t in xpathobj.xpath('//div[@class="com_details_info"]/text()') if t.strip()]
+    category = info[1] if len(info) > 1 else ""
+    company_type = info[2] if len(info) > 2 else ""
+    size = info[3] if len(info) > 3 else ""
+    founded_date = info[4] if len(info) > 4 else ""
+
+    # Company benefits, joined into a single string.
+    benefits = [b.strip() for b in xpathobj.xpath('//div[@class="com_welfare "]/span/text()') if b.strip()]
+    benefits_str = " | ".join(benefits)
+
+    address = first_or_empty('//div[@class="com_details_tel_me"]/div/text()')
+
+    return {
+        "name": name,
+        "category": category,
+        "size": size,
+        "company_type": company_type,
+        "founded_date": founded_date,
+        "introduction": introduction,
+        "address": address,
+        "benefits": benefits_str,
+        "website": 1,
+    }
+
+
+class ZunHuaComSpider(scrapy.Spider):
+    """Request company pages on zhrczp.com by sequential page number."""
+
+    name = 'zhrczp_com_compary'
+    allowed_domains = ['zhrczp.com']
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+    }
+
+    def start_requests(self) -> Iterable[scrapy.Request]:
+        """Yield one GET request per page number in ``range(1, 100000)``."""
+        for page in range(1, 100000):
+            url = f"https://www.zhrczp.com/company/{page}.html"
+            yield scrapy.Request(
+                url=url,
+                method='GET',
+                headers=self.headers,
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        """Parse one company page and yield its data dict, if any."""
+        # Parse with lxml; the parser may return None for an empty document.
+        xpathobj = etree.HTML(response.text)
+        if xpathobj is None:
+            return
+        company_data = extract_company_data(xpathobj)
+        if company_data:
+            yield company_data
\ No newline at end of file
diff --git a/TS_resume_spider/utils/db.py b/TS_resume_spider/utils/db.py
index c84b7b6..df8b41d 100644
--- a/TS_resume_spider/utils/db.py
+++ b/TS_resume_spider/utils/db.py
@@ -71,3 +71,34 @@ class DB:
         """

         cls._client.execute(sql, list(safe_data.values()))
+
+    @classmethod
+    def insert_company(cls, data: dict):
+        """Insert a company row, updating the other columns on duplicate key.
+
+        Only values of type str, int, float, None or datetime are written.
+        Nothing is written when ``name`` or ``website`` is missing.
+
+        Raises:
+            RuntimeError: if the database client has not been initialised.
+        """
+        if cls._client is None:
+            raise RuntimeError("数据库未初始化。首先调用DB.init()。")
+
+        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
+        if 'name' not in safe_data or 'website' not in safe_data:
+            return
+
+        table = 'companies_company'
+        keys = ', '.join(safe_data.keys())
+        holders = ', '.join(['%s'] * len(safe_data))
+        updates = ', '.join([f"{k}=VALUES({k})" for k in safe_data if k not in ('name', 'website')])
+        # With only name/website present the update list would be empty and
+        # the statement invalid; fall back to a no-op self-assignment.
+        if not updates:
+            updates = "name=name"
+        sql = (
+            f"INSERT INTO {table} ({keys}) VALUES ({holders}) "
+            f"ON DUPLICATE KEY UPDATE {updates}"
+        )
+        cls._client.execute(sql, list(safe_data.values()))
\ No newline at end of file