更新YutianTopSpider爬虫以修正公司和职位信息的提取逻辑，优化XPath调用并增强代码可读性

2025-06-08 17:12:45 +08:00 · 2025-06-08 17:12:45 +08:00 · f41404e1fd
commit f41404e1fd
parent f864ea6bb3
2 changed files with 17 additions and 16 deletions
--- a/TS_resume_spider/pipelines.py
+++ b/TS_resume_spider/pipelines.py
@@ -121,7 +121,8 @@ class YTSavePipeline:
 class CompanySavePipeline:
    def process_item(self, item, spider):
        if spider.name == 'yutian_top_compary':
-            company = item.get("company")
+            company = item.get("compary")
+            # print(f"Processing company: {company}")
            if not company:
                return item
            if 'website' in company:
@@ -158,6 +159,7 @@ class PositionSavePipeline:
    def process_item(self, item, spider):
        if spider.name == 'yutian_top_compary':
            position = item.get("position")
+            # print(f"Processing position: {position}")
            if not position:
                return item
            title = position.get("title")
--- a/TS_resume_spider/spiders/yutian_top_compary.py
+++ b/TS_resume_spider/spiders/yutian_top_compary.py
@@ -4,8 +4,8 @@ import re


 def first_or_empty(xpobj, path):
-    lst = xpobj.xpath(path)
-    return lst[0].strip() if lst else ""
+    text = xpobj.xpath(path).get()
+    return text.strip() if text else ""


 class YutianTopSpider(scrapy.Spider):
@@ -63,14 +63,14 @@ class YutianTopSpider(scrapy.Spider):
            self.logger.error(f"Not 200 page: {response.url}")
            return

-        title = first_or_empty(response.xpath, "//div[@class='top-title']/span/text()")
+        title = first_or_empty(response, "//div[@class='top-title']/span/text()")
        nature = ""
        category = ""
        region = ""
        experience = ""
        education = ""
-        infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()")
-        infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()")
+        infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()").getall()
+        infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()").getall()
        if infokey and infovalue:
            for index, key in enumerate(infokey):
                if '工作性质' in key:
@@ -85,18 +85,18 @@ class YutianTopSpider(scrapy.Spider):
                    experience = infovalue[index].strip()
                if "学历要求" in key:
                    education = infovalue[index].strip()
-        salary = first_or_empty(response.xpath, "//div[@class='salary']/text()")
-        description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()")
-        contact_name = first_or_empty(response.xpath,
+        salary = first_or_empty(response, "//div[@class='salary']/text()")
+        description = first_or_empty(response, "//div[@class='job-describe']/text()")
+        contact_name = first_or_empty(response,
                                      "//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
-        contact_info = first_or_empty(response.xpath,
+        contact_info = first_or_empty(response,
                                      "(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
        benefits = ""
        if openings == "若干":
            openings = 1
        else:
            openings = 1
-        company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
+        company_name = first_or_empty(response, "//div[@class='company-name']/a/text()")
        meta = {
            "title": title,  # 职位名称
            "nature": nature,  # 职位性质
@@ -114,7 +114,7 @@ class YutianTopSpider(scrapy.Spider):
            "website_id": 2,  # 网站ID
            "company_name": company_name,  # 所属企业名称
        }
-        company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id")
+        company_id = first_or_empty(response, "//div[@class='job-detail']/@data-io-company-id")
        yield scrapy.Request(
            url=f"https://zp.yutian.top/company/{company_id}.html",
            headers=self.headers,
@@ -124,7 +124,7 @@ class YutianTopSpider(scrapy.Spider):
        )

    def parse_company(self, response):
-        name = first_or_empty(response.xpath, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')
+        name = first_or_empty(response, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')
        category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall()
        category_and_size_value = response.xpath(
            '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall()
@@ -144,8 +144,8 @@ class YutianTopSpider(scrapy.Spider):
            elif key == "公司规模 ：":
                size = val

-        introduction = first_or_empty(response.xpath, '//div[@class="job-left-content-des"]/text()')
-        address = first_or_empty(response.xpath, '//div[@class="job-left-content-address"]/text()')
+        introduction = first_or_empty(response, '//div[@class="job-left-content-des"]/text()')
+        address = first_or_empty(response, '//div[@class="job-left-content-address"]/text()')

        company_type = ""
        founded_date = ""
@@ -162,7 +162,6 @@ class YutianTopSpider(scrapy.Spider):
            "benefits": benefits_str,  # 员工福利（暂未提取）
            "website_id": 2,  # 来源网站 ID
        }
-
        data = {
            "compary": company_data,
            "position": response.meta,