更新公司介绍提取逻辑以支持职位信息的判断;调整爬虫请求页数至96
This commit is contained in:
parent
05591129b9
commit
dc80fb6c72
@ -15,11 +15,13 @@ def extract_company_data(xpathobj):
|
|||||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
|
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
|
||||||
if not intro_list:
|
if not intro_list:
|
||||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()]
|
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()]
|
||||||
|
# 判断一下有无职位
|
||||||
|
|
||||||
|
job_list = xpathobj.xpath('//div[@class="comshow_job"]/div[@class="firm_post"]')
|
||||||
introduction = "\r\n".join(intro_list)
|
introduction = "\r\n".join(intro_list)
|
||||||
|
|
||||||
# 如果没有名称或介绍,直接忽略
|
# 如果没有名称,或者既没有介绍也没有职位,直接忽略
|
||||||
if not (name and introduction):
|
if not (name and (introduction or job_list)):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# 公司详情信息
|
# 公司详情信息
|
||||||
|
@ -86,7 +86,7 @@ class ZunHuaComSpider(scrapy.Spider):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
for page in range(1, 2):
|
for page in range(1, 97):
|
||||||
yield scrapy.Request(
|
yield scrapy.Request(
|
||||||
url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
|
url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user