更新公司介绍提取逻辑以支持职位信息的判断;调整爬虫请求页数至96

This commit is contained in:
晓丰 2025-05-26 22:10:45 +08:00
parent 05591129b9
commit dc80fb6c72
2 changed files with 4 additions and 2 deletions

View File

@@ -15,11 +15,13 @@ def extract_company_data(xpathobj):
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
if not intro_list:
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()]
+# 判断一下有无职位
+job_list = xpathobj.xpath('//div[@class="comshow_job"]/div[@class="firm_post"]')
introduction = "\r\n".join(intro_list)
# 如果没有名称或介绍,直接忽略
-if not (name and introduction):
+if not (name and (introduction or job_list)):
return None
# 公司详情信息

View File

@@ -86,7 +86,7 @@ class ZunHuaComSpider(scrapy.Spider):
}
def start_requests(self):
-for page in range(1, 2):
+for page in range(1, 97):
yield scrapy.Request(
url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
headers=self.headers,