Add ZunHuaComSpider and CompanySavePipeline to implement company data extraction and database insertion

晓丰 2025-05-24 19:41:43 +08:00
parent f153c6d250
commit 45b281e2d7
4 changed files with 120 additions and 0 deletions

View File

@@ -117,3 +117,17 @@ class YTSavePipeline:
        return item


class CompanySavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['zhrczp_com_compary']:
            return item

        company_name = item.get("company_name")
        if not company_name:
            raise DropItem("⚠️ company_name missing, item dropped")

        try:
            DB.insert_company(item)
        except Exception as e:
            spider.logger.warning(f"❌ insert failed: company_name={company_name}, error={e}")

        return item
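A quick, illustrative check of the new pipeline's guard clauses (the stub spider classes below are invented for this sketch and are not part of the repository; CompanySavePipeline is assumed importable from TS_resume_spider.pipelines, the path registered in settings):

from scrapy.exceptions import DropItem
from TS_resume_spider.pipelines import CompanySavePipeline

class OtherSpider:
    name = "some_other_spider"  # not the whitelisted spider name

class FakeCompanySpider:
    name = "zhrczp_com_compary"

pipeline = CompanySavePipeline()

# Items from unrelated spiders pass through untouched
item = {"foo": "bar"}
assert pipeline.process_item(item, OtherSpider()) is item

# Items without company_name are dropped before any database call
try:
    pipeline.process_item({}, FakeCompanySpider())
except DropItem:
    print("item without company_name was dropped")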

View File

@@ -57,6 +57,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
    'TS_resume_spider.pipelines.CompanySavePipeline': 600,
}
# Set the output file encoding to avoid garbled Chinese text

View File

@@ -0,0 +1,86 @@
from typing import Iterable

import scrapy
from lxml import etree


def extract_company_data(xpathobj):
    """Extract company information from an etree.HTML object; returns a dict or None."""

    def first_or_empty(path):
        lst = xpathobj.xpath(path)
        return lst[0].strip() if lst else ""

    name = first_or_empty('//h1/a/text()')

    # Company introduction paragraphs
    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
    introduction = "\r\n".join(intro_list)

    # Skip the page entirely if either the name or the introduction is missing
    if not (name and introduction):
        return None

    # Company detail fields
    info = [t.strip() for t in xpathobj.xpath('//div[@class="com_details_info"]/text()') if t.strip()]
    category = info[1] if len(info) > 1 else ""
    company_type = info[2] if len(info) > 2 else ""
    size = info[3] if len(info) > 3 else ""
    founded_date = info[4] if len(info) > 4 else ""

    # Company benefits
    benefits = [b.strip() for b in xpathobj.xpath('//div[@class="com_welfare "]/span/text()') if b.strip()]
    benefits_str = " | ".join(benefits)

    address = first_or_empty('//div[@class="com_details_tel_me"]/div/text()')

    return {
        "name": name,
        "category": category,
        "size": size,
        "company_type": company_type,
        "founded_date": founded_date,
        "introduction": introduction,
        "address": address,
        "benefits": benefits_str,
        "website": 1,
    }


class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com_compary'
    allowed_domains = ['zhrczp.com']

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[scrapy.Request]:
        # Iterate over company detail pages by numeric id
        for page in range(1, 100000):
            url = f"https://www.zhrczp.com/company/{page}.html"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                callback=self.parse,
            )

    def parse(self, response):
        # Parse the page with lxml
        xpathobj = etree.HTML(response.text)
        # Call the shared extraction helper
        company_data = extract_company_data(xpathobj)
        if company_data:
            yield company_data
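As a minimal offline check, extract_company_data can be exercised against hand-written markup; the sample HTML below only mimics the selectors the function expects (including the trailing space in the com_welfare class) and is not the real zhrczp.com page:

from lxml import etree

sample_html = """
<html><body>
  <h1><a>Example Manufacturing Co.</a></h1>
  <div class="company_img_auto"><p>A sample company profile.</p></div>
  <div class="com_details_info">Contact<span>|</span>Manufacturing<span>|</span>Private<span>|</span>50-99<span>|</span>2010</div>
  <div class="com_welfare "><span>Insurance</span><span>Weekends off</span></div>
  <div class="com_details_tel_me"><div>Zunhua, Hebei</div></div>
</body></html>
"""

print(extract_company_data(etree.HTML(sample_html)))
# -> {'name': 'Example Manufacturing Co.', 'category': 'Manufacturing', 'size': '50-99',
#     'company_type': 'Private', 'founded_date': '2010', 'introduction': 'A sample company profile.',
#     'address': 'Zunhua, Hebei', 'benefits': 'Insurance | Weekends off', 'website': 1}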

View File

@@ -71,3 +71,22 @@ class DB:
        """
        cls._client.execute(sql, list(safe_data.values()))

    @classmethod
    def insert_company(cls, data: dict):
        if cls._client is None:
            raise RuntimeError("Database not initialized; call DB.init() first.")

        # Keep only values with types the DB driver can bind directly
        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
        if 'name' not in safe_data or 'website' not in safe_data:
            return

        table = 'companies_company'
        keys = ', '.join(safe_data.keys())
        holders = ', '.join(['%s'] * len(safe_data))
        updates = ', '.join([f"{k}=VALUES({k})" for k in safe_data if k not in ('name', 'website')])
        sql = (
            f"INSERT INTO {table} ({keys}) VALUES ({holders}) "
            f"ON DUPLICATE KEY UPDATE {updates}"
        )
        cls._client.execute(sql, list(safe_data.values()))
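For reference, the upsert that insert_company builds for a typical item looks like the statement below; this is a standalone reconstruction of the same string-building logic (column names taken from the spider's output dict), and the ON DUPLICATE KEY UPDATE clause presumably relies on a unique key covering name/website in companies_company:

data = {"name": "Example Co.", "website": 1, "category": "Manufacturing", "address": "Zunhua"}
keys = ', '.join(data.keys())
holders = ', '.join(['%s'] * len(data))
updates = ', '.join(f"{k}=VALUES({k})" for k in data if k not in ('name', 'website'))
print(f"INSERT INTO companies_company ({keys}) VALUES ({holders}) ON DUPLICATE KEY UPDATE {updates}")
# -> INSERT INTO companies_company (name, website, category, address) VALUES (%s, %s, %s, %s)
#    ON DUPLICATE KEY UPDATE category=VALUES(category), address=VALUES(address)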