更新YutianTopSpider爬虫以解析公司信息,增加公司类别、规模及简介的提取逻辑

This commit is contained in:
晓丰 2025-06-08 14:53:36 +08:00
parent 15be08c866
commit 5e4ff46c80

View File

@ -2,8 +2,6 @@ import scrapy
import json
import re
from sympy.benchmarks.bench_meijerint import bench
def first_or_empty(xpobj, path):
lst = xpobj.xpath(path)
@ -33,7 +31,7 @@ class YutianTopSpider(scrapy.Spider):
}
def start_requests(self):
for i in range(1, 39):
for i in range(1, 40):
yield scrapy.Request(
url=f'https://zp.yutian.top/search?keywords=&page={i}',
method='GET',
@ -96,6 +94,8 @@ class YutianTopSpider(scrapy.Spider):
benefits = ""
if openings == "若干":
openings = 1
else:
openings = 1
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
meta = {
"title": title, # 职位名称
@ -124,5 +124,48 @@ class YutianTopSpider(scrapy.Spider):
)
def parse_company(self, response):
    """Parse a company detail page and yield it together with its position.

    Args:
        response: The company page response. ``response.meta`` is emitted
            unchanged as the ``"position"`` payload.

    Yields:
        dict: ``{"compary": <company fields>, "position": response.meta}``.
    """
    # first_or_empty calls ``.xpath(path)`` on its first argument, so it must
    # receive the response itself rather than the bound ``response.xpath``
    # method (a bound method has no ``.xpath`` attribute).
    name = first_or_empty(response, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')

    labels = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall()
    values = response.xpath(
        '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall()

    # Pair each header label with its value. Only trust the pairing when both
    # lists have the same length; otherwise a missing text node could shift
    # values onto the wrong label.
    header_fields = {}
    if len(labels) == len(values):
        header_fields = {
            label.strip(): value.strip() for label, value in zip(labels, values)
        }

    # The labels are stripped above, so they are compared against literals
    # without trailing whitespace (a stripped string can never equal a
    # literal that ends in a space).
    category = header_fields.get("公司类别", "")
    size = header_fields.get("公司规模", "")

    introduction = first_or_empty(response, '//div[@class="job-left-content-des"]/text()')
    address = first_or_empty(response, '//div[@class="job-left-content-address"]/text()')

    company_data = {
        "name": name,  # company name
        "category": category,  # company industry category
        "size": size,  # company size
        "company_type": "",  # company type (not extracted yet)
        "founded_date": "",  # founding date (not extracted yet)
        "introduction": introduction,  # company introduction
        "address": address,  # company address
        "benefits": "",  # employee benefits (not extracted yet)
        "website_id": 2,  # source website ID
    }
    # NOTE(review): "compary" looks like a typo for "company"; it is kept
    # as-is because consumers of this item may read the existing key.
    # Confirm before renaming.
    yield {
        "compary": company_data,
        "position": response.meta,
    }