更新公司介绍提取逻辑以支持职位信息的判断；调整爬虫请求页数至96

2025-05-26 22:10:45 +08:00 · 2025-05-26 22:10:45 +08:00 · dc80fb6c72
commit dc80fb6c72
parent 05591129b9
2 changed files with 4 additions and 2 deletions
--- a/TS_resume_spider/spiders/zhrczp_com_compary.py
+++ b/TS_resume_spider/spiders/zhrczp_com_compary.py
@@ -15,11 +15,13 @@ def extract_company_data(xpathobj):
        intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
    if not intro_list:
        intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()') if t.strip()]
+    # Collect the job postings: a company with no introduction text is still kept if it lists positions

+    job_list = xpathobj.xpath('//div[@class="comshow_job"]/div[@class="firm_post"]')
    introduction = "\r\n".join(intro_list)

    # 如果没有名称或介绍，直接忽略
-    if not (name and introduction):
+    if not (name and (introduction or job_list)):
        return None

    # 公司详情信息
--- a/TS_resume_spider/spiders/zhrczp_com_position.py
+++ b/TS_resume_spider/spiders/zhrczp_com_position.py
@@ -86,7 +86,7 @@ class ZunHuaComSpider(scrapy.Spider):
    }

    def start_requests(self):
-        for page in range(1, 2):
+        for page in range(1, 97):
            yield scrapy.Request(
                url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
                headers=self.headers,