# TS_resume_spider/TS_resume_spider/spiders/zhrczp_com_compary.py
from typing import Iterable
import scrapy
from lxml import etree
def extract_company_data(xpathobj):
    """Extract company information from a parsed lxml HTML document.

    Parameters
    ----------
    xpathobj : lxml.etree._Element or None
        Result of ``etree.HTML(...)``. ``etree.HTML`` returns ``None``
        for empty/unparseable input, so ``None`` is tolerated here.

    Returns
    -------
    dict or None
        Company record, or ``None`` when the page has no usable
        name/introduction (e.g. a 404 shell or an empty page).
    """
    # etree.HTML() yields None for blank input; without this guard the
    # .xpath calls below would raise AttributeError.
    if xpathobj is None:
        return None

    def first_or_empty(path):
        # First stripped match for an XPath expression, or "" if absent.
        lst = xpathobj.xpath(path)
        return lst[0].strip() if lst else ""

    name = first_or_empty('//h1/a/text()')

    # Company introduction: non-blank paragraph texts joined with CRLF.
    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
    introduction = "\r\n".join(intro_list)

    # A page without both a name and an introduction is not a real
    # company profile — skip it.
    if not (name and introduction):
        return None

    # Company detail fields. Indexing starts at 1: info[0] is presumably
    # a label/placeholder text node on the page — TODO confirm against
    # the live markup.
    info = [t.strip() for t in xpathobj.xpath('//div[@class="com_details_info"]/text()') if t.strip()]
    category = info[1] if len(info) > 1 else ""
    company_type = info[2] if len(info) > 2 else ""
    size = info[3] if len(info) > 3 else ""
    founded_date = info[4] if len(info) > 4 else ""

    # Benefits tags; note the trailing space in class "com_welfare " is
    # present in the site's markup — do not "fix" it.
    benefits = [b.strip() for b in xpathobj.xpath('//div[@class="com_welfare "]/span/text()') if b.strip()]
    benefits_str = " | ".join(benefits)

    address = first_or_empty('//div[@class="com_details_tel_me"]/div/text()')

    return {
        "name": name,
        "category": category,
        "size": size,
        "company_type": company_type,
        "founded_date": founded_date,
        "introduction": introduction,
        "address": address,
        "benefits": benefits_str,
        # NOTE(review): constant source marker — presumably an id for
        # zhrczp.com in a downstream "website" dimension table; verify.
        "website": 1,
    }
class ZunHuaComSpider(scrapy.Spider):
    """Crawl company profile pages on zhrczp.com by sequential page id.

    Enumerates ``/company/<id>.html`` URLs and yields one item dict per
    page that contains a usable company profile (see
    ``extract_company_data``).
    """

    # NOTE(review): 'compary' looks like a typo for 'company', but this
    # is the registered spider name — renaming would break `scrapy crawl`
    # invocations, so it is kept as-is.
    name = 'zhrczp_com_compary'
    allowed_domains = ['zhrczp.com']

    # Browser-like headers copied from a real Chrome session so the site
    # serves the normal HTML pages.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[scrapy.Request]:
        """Yield one GET request per sequential company page id.

        The upper bound 100000 is a hard-coded crawl ceiling — presumably
        chosen to exceed the site's highest company id; verify.
        """
        for page in range(1, 100000):
            url = f"https://www.zhrczp.com/company/{page}.html"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                callback=self.parse,
            )

    def parse(self, response):
        """Parse a company page and yield its item dict, if any."""
        xpathobj = etree.HTML(response.text)
        # etree.HTML returns None for an empty/unparseable body (e.g. an
        # empty 200 response); skip such pages instead of crashing.
        if xpathobj is None:
            return
        company_data = extract_company_data(xpathobj)
        if company_data:
            yield company_data