更新公司介绍提取逻辑以支持职位信息的判断;调整爬虫请求页数至96

This commit is contained in:
晓丰 2025-05-26 22:10:45 +08:00
parent 05591129b9
commit dc80fb6c72
2 changed files with 4 additions and 2 deletions

View File

@@ -15,11 +15,13 @@ def extract_company_data(xpathobj):
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()] intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
if not intro_list: if not intro_list:
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()] intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()]
# 判断一下有无职位
job_list = xpathobj.xpath('//div[@class="comshow_job"]/div[@class="firm_post"]')
introduction = "\r\n".join(intro_list) introduction = "\r\n".join(intro_list)
# 如果没有名称或介绍,直接忽略 # 如果没有名称或介绍,直接忽略
if not (name and introduction): if not (name and (introduction or job_list)):
return None return None
# 公司详情信息 # 公司详情信息

View File

@@ -86,7 +86,7 @@ class ZunHuaComSpider(scrapy.Spider):
} }
def start_requests(self): def start_requests(self):
for page in range(1, 2): for page in range(1, 97):
yield scrapy.Request( yield scrapy.Request(
url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html", url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
headers=self.headers, headers=self.headers,