Add ZunHuaComSpider and CompanySavePipeline to implement company data extraction and database insertion logic
parent f153c6d250
commit 45b281e2d7
@@ -117,3 +117,17 @@ class YTSavePipeline:
         return item
 
 
+class CompanySavePipeline:
+    def process_item(self, item, spider):
+        if spider.name not in ['zhrczp_com_compary']:
+            return item
+        company_name = item.get("company_name")
+        if not company_name:
+            raise DropItem("⚠️ company_name missing, item dropped")
+
+        try:
+            DB.insert_company(item)
+        except Exception as e:
+            spider.logger.warning(f"❌ insert failed: company_name={company_name}, error={e}")
+
+        return item
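
A quick way to sanity-check the drop path in CompanySavePipeline without touching the database is to call process_item directly. The stub spider below is hypothetical, and DropItem is assumed to be imported near the top of pipelines.py (outside this hunk):

    import logging

    from scrapy.exceptions import DropItem

    from TS_resume_spider.pipelines import CompanySavePipeline

    class StubSpider:
        # Hypothetical stand-in exposing only what process_item touches.
        name = 'zhrczp_com_compary'
        logger = logging.getLogger('zhrczp_com_compary')

    try:
        CompanySavePipeline().process_item({}, StubSpider())
    except DropItem as exc:
        print(exc)  # items without company_name never reach DB.insert_company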
@@ -57,6 +57,7 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
+    'TS_resume_spider.pipelines.CompanySavePipeline': 600,
 }
 
 # Set the output file encoding to avoid garbled Chinese characters
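
Note on ordering: Scrapy invokes item pipelines in ascending priority value, so CompanySavePipeline (600) runs last, after YTSpiderPipeline (300) and YTSavePipeline (500). Because it returns items unchanged for every spider other than zhrczp_com_compary, the existing resume flow is unaffected by the new entry.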
TS_resume_spider/spiders/zhrczp_com_compary.py (new file, 86 lines)
@@ -0,0 +1,86 @@
+from typing import Iterable
+
+import scrapy
+from lxml import etree
+
+
+def extract_company_data(xpathobj):
+    """Extract company information from an etree.HTML object; returns a dict or None."""
+    def first_or_empty(path):
+        lst = xpathobj.xpath(path)
+        return lst[0].strip() if lst else ""
+
+    name = first_or_empty('//h1/a/text()')
+    # Company introduction paragraphs
+    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
+    introduction = "\r\n".join(intro_list)
+
+    # Skip the page if the name or the introduction is missing
+    if not (name and introduction):
+        return None
+
+    # Company detail fields
+    info = [t.strip() for t in xpathobj.xpath('//div[@class="com_details_info"]/text()') if t.strip()]
+    category = info[1] if len(info) > 1 else ""
+    company_type = info[2] if len(info) > 2 else ""
+    size = info[3] if len(info) > 3 else ""
+    founded_date = info[4] if len(info) > 4 else ""
+
+    # Company benefits
+    benefits = [b.strip() for b in xpathobj.xpath('//div[@class="com_welfare "]/span/text()') if b.strip()]
+    benefits_str = " | ".join(benefits)
+
+    address = first_or_empty('//div[@class="com_details_tel_me"]/div/text()')
+
+    return {
+        "name": name,
+        "category": category,
+        "size": size,
+        "company_type": company_type,
+        "founded_date": founded_date,
+        "introduction": introduction,
+        "address": address,
+        "benefits": benefits_str,
+        "website": 1,
+    }
+
+
+class ZunHuaComSpider(scrapy.Spider):
+    name = 'zhrczp_com_compary'
+    allowed_domains = ['zhrczp.com']
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'same-origin',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+    }
+
+    def start_requests(self) -> Iterable[scrapy.Request]:
+        for page in range(1, 100000):
+            url = f"https://www.zhrczp.com/company/{page}.html"
+            yield scrapy.Request(
+                url=url,
+                method='GET',
+                headers=self.headers,
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        # Parse the page with lxml
+        xpathobj = etree.HTML(response.text)
+        # Shared extraction helper
+        company_data = extract_company_data(xpathobj)
+        if company_data:
+            yield company_data
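
Because extract_company_data takes a bare etree.HTML document rather than a Scrapy response, it can be exercised offline; the spider itself runs with the usual `scrapy crawl zhrczp_com_compary`. A minimal sketch against invented markup (the HTML below is hypothetical, written only to match the helper's selectors):

    from lxml import etree

    from TS_resume_spider.spiders.zhrczp_com_compary import extract_company_data

    # Invented markup mirroring the selectors the helper expects.
    html = (
        '<h1><a>Example Co.</a></h1>'
        '<div class="company_img_auto"><p>About the company.</p></div>'
    )
    data = extract_company_data(etree.HTML(html))
    print(data["name"])          # -> Example Co.
    print(data["introduction"])  # -> About the company.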
@@ -71,3 +71,22 @@ class DB:
         """
 
         cls._client.execute(sql, list(safe_data.values()))
+
+    @classmethod
+    def insert_company(cls, data: dict):
+        if cls._client is None:
+            raise RuntimeError("Database not initialized. Call DB.init() first.")
+
+        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
+        if 'name' not in safe_data or 'website' not in safe_data:
+            return
+
+        table = 'companies_company'
+        keys = ', '.join(safe_data.keys())
+        holders = ', '.join(['%s'] * len(safe_data))
+        updates = ', '.join([f"{k}=VALUES({k})" for k in safe_data if k not in ('name', 'website')])
+        sql = (
+            f"INSERT INTO {table} ({keys}) VALUES ({holders}) "
+            f"ON DUPLICATE KEY UPDATE {updates}"
+        )
+        cls._client.execute(sql, list(safe_data.values()))
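
For illustration, with a hypothetical item the upsert insert_company builds looks like this (assuming companies_company has a unique key covering name and website, which would explain why those two columns are excluded from the update list):

    # Hypothetical input:
    data = {"name": "Example Co.", "website": 1, "address": "Zunhua"}
    # keys    -> "name, website, address"
    # holders -> "%s, %s, %s"
    # updates -> "address=VALUES(address)"
    # sql     -> INSERT INTO companies_company (name, website, address)
    #            VALUES (%s, %s, %s)
    #            ON DUPLICATE KEY UPDATE address=VALUES(address)

MySQL still accepts the VALUES() form inside ON DUPLICATE KEY UPDATE, though it has been deprecated since 8.0.20 in favor of row aliases.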