from typing import Iterable
|
|
import scrapy
|
|
from lxml import etree
|
|
|
|
|
|
|
|
def extract_company_data(xpathobj):
    """Extract company information from an etree.HTML tree.

    Returns a dict of company fields, or None when the page carries
    neither a company name nor an introduction.
    """

    def first_text(path):
        # First matching text node, stripped; "" when nothing matches.
        hits = xpathobj.xpath(path)
        if not hits:
            return ""
        return hits[0].strip()

    def clean_texts(path):
        # All matching text nodes, stripped, with blank entries dropped.
        return [text.strip() for text in xpathobj.xpath(path) if text.strip()]

    name = first_text('//h1/a/text()')

    # Introduction paragraphs, joined with CRLF to keep line structure.
    introduction = "\r\n".join(
        clean_texts('//div[@class="company_img_auto"]/p/text()')
    )

    # Pages lacking either a name or an introduction are ignored.
    if not name or not introduction:
        return None

    # Company detail fields sit at fixed positions in the info column;
    # any missing slot degrades to "".
    info = clean_texts('//div[@class="com_details_info"]/text()')

    def info_at(index):
        return info[index] if index < len(info) else ""

    # Company benefits (note: the trailing space in "com_welfare " matches
    # the site's actual class attribute).
    benefits_str = " | ".join(
        clean_texts('//div[@class="com_welfare "]/span/text()')
    )

    address = first_text('//div[@class="com_details_tel_me"]/div/text()')

    return {
        "name": name,
        "category": info_at(1),
        "size": info_at(3),
        "company_type": info_at(2),
        "founded_date": info_at(4),
        "introduction": introduction,
        "address": address,
        "benefits": benefits_str,
        "website": 1,
    }
|
|
|
|
|
|
|
|
class ZunHuaComSpider(scrapy.Spider):
    """Crawl company profile pages on zhrczp.com and yield company dicts.

    Pages are enumerated by sequential numeric id; each response is parsed
    with lxml and handed to :func:`extract_company_data`.
    """

    name = 'zhrczp_com_compary'
    allowed_domains = ['zhrczp.com']
    # Browser-like request headers so the site serves the normal HTML page.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[scrapy.Request]:
        """Yield one request per numeric company id.

        The upper bound 100000 is a hard cap on how many ids are probed;
        ids beyond the site's real range will simply 404.
        """
        for page in range(1, 100000):
            url = f"https://www.zhrczp.com/company/{page}.html"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                callback=self.parse,
            )

    def parse(self, response):
        """Parse one company page and yield its data dict, if any.

        :param response: the scrapy HTTP response for a company page.
        """
        # Parse with lxml. etree.HTML returns None for empty or
        # unparsable input, which would crash extract_company_data with
        # an AttributeError — guard against it and skip such pages.
        xpathobj = etree.HTML(response.text)
        if xpathobj is None:
            return
        # Shared extraction helper; returns None for pages without
        # a usable name/introduction.
        company_data = extract_company_data(xpathobj)
        if company_data:
            yield company_data