更新公司介绍提取逻辑以支持职位信息的判断;调整爬虫请求页数至 96
This commit is contained in:
parent
05591129b9
commit
dc80fb6c72
@ -15,11 +15,13 @@ def extract_company_data(xpathobj):
|
||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
|
||||
if not intro_list:
|
||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()]
|
||||
# 判断一下有无职位
|
||||
|
||||
job_list = xpathobj.xpath('//div[@class="comshow_job"]/div[@class="firm_post"]')
|
||||
introduction = "\r\n".join(intro_list)
|
||||
|
||||
# 如果没有名称或介绍,直接忽略
|
||||
if not (name and introduction):
|
||||
if not (name and (introduction or job_list)):
|
||||
return None
|
||||
|
||||
# 公司详情信息
|
||||
|
@ -86,7 +86,7 @@ class ZunHuaComSpider(scrapy.Spider):
|
||||
}
|
||||
|
||||
def start_requests(self):
|
||||
for page in range(1, 2):
|
||||
for page in range(1, 97):
|
||||
yield scrapy.Request(
|
||||
url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
|
||||
headers=self.headers,
|
||||
|
Loading…
x
Reference in New Issue
Block a user