更新YutianTopSpider爬虫以提取职位信息,增加公司ID的获取和公司信息解析逻辑

This commit is contained in:
晓丰 2025-05-27 23:09:47 +08:00
parent b13afeb51c
commit 15be08c866

View File

@ -2,8 +2,7 @@ import scrapy
import json import json
import re import re
from lxml.extensions import xpath_code from sympy.benchmarks.bench_meijerint import bench
from openpyxl.styles.builtins import title
def first_or_empty(xpobj, path): def first_or_empty(xpobj, path):
@ -36,7 +35,7 @@ class YutianTopSpider(scrapy.Spider):
def start_requests(self): def start_requests(self):
for i in range(1, 39): for i in range(1, 39):
yield scrapy.Request( yield scrapy.Request(
url=self.start_urls[0], url=f'https://zp.yutian.top/search?keywords=&page={i}',
method='GET', method='GET',
headers=self.headers, headers=self.headers,
dont_filter=True, dont_filter=True,
@ -88,9 +87,17 @@ class YutianTopSpider(scrapy.Spider):
experience = infovalue[index].strip() experience = infovalue[index].strip()
if "学历要求" in key: if "学历要求" in key:
education = infovalue[index].strip() education = infovalue[index].strip()
# TODO: 未完继续 salary = first_or_empty(response.xpath, "//div[@class='salary']/text()")
description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()")
yield { contact_name = first_or_empty(response.xpath,
"//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
contact_info = first_or_empty(response.xpath,
"(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
benefits = ""
if openings == "若干":
openings = 1
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
meta = {
"title": title, # 职位名称 "title": title, # 职位名称
"nature": nature, # 职位性质 "nature": nature, # 职位性质
"category": category, # 职位类别 "category": category, # 职位类别
@ -98,7 +105,7 @@ class YutianTopSpider(scrapy.Spider):
"experience": experience, # 工作经历要求 "experience": experience, # 工作经历要求
"education": education, # 学历要求 "education": education, # 学历要求
"salary": salary, # 职位薪资 "salary": salary, # 职位薪资
"position_status": position_status, # 职位状态 "position_status": 1, # 职位状态
"description": description, # 职位描述(详情) "description": description, # 职位描述(详情)
"contact_name": contact_name, # 联系人姓名 "contact_name": contact_name, # 联系人姓名
"contact_info": contact_info, # 联系方式 "contact_info": contact_info, # 联系方式
@ -107,3 +114,15 @@ class YutianTopSpider(scrapy.Spider):
"website_id": 2, # 网站ID "website_id": 2, # 网站ID
"company_name": company_name, # 所属企业名称 "company_name": company_name, # 所属企业名称
} }
company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id")
yield scrapy.Request(
url=f"https://zp.yutian.top/company/{company_id}.html",
headers=self.headers,
callback=self.parse_company,
meta=meta,
dont_filter=True,
)
def parse_company(self, response):
pass
# TODO: 解析公司信息