diff --git a/TS_resume_spider/spiders/zhrczp_com_compary.py b/TS_resume_spider/spiders/zhrczp_com_compary.py index 8c31bfa..2b6ffcd 100644 --- a/TS_resume_spider/spiders/zhrczp_com_compary.py +++ b/TS_resume_spider/spiders/zhrczp_com_compary.py @@ -15,11 +15,13 @@ def extract_company_data(xpathobj): intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()] if not intro_list: intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()] + # 判断一下有无职位 + job_list = xpathobj.xpath('//div[@class="comshow_job"]/div[@class="firm_post"]') introduction = "\r\n".join(intro_list) # 如果没有名称或介绍,直接忽略 - if not (name and introduction): + if not (name and (introduction or job_list)): return None # 公司详情信息 diff --git a/TS_resume_spider/spiders/zhrczp_com_position.py b/TS_resume_spider/spiders/zhrczp_com_position.py index 561c518..23f73ed 100644 --- a/TS_resume_spider/spiders/zhrczp_com_position.py +++ b/TS_resume_spider/spiders/zhrczp_com_position.py @@ -86,7 +86,7 @@ class ZunHuaComSpider(scrapy.Spider): } def start_requests(self): - for page in range(1, 2): + for page in range(1, 97): yield scrapy.Request( url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html", headers=self.headers,