From f41404e1fdc6e2018245ecdea85148a3c9e99069 Mon Sep 17 00:00:00 2001
From: Franklin-F <dewujie64@gmail.com>
Date: Sun, 8 Jun 2025 17:12:45 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0YutianTopSpider=E7=88=AC?=
 =?UTF-8?q?=E8=99=AB=E4=BB=A5=E4=BF=AE=E6=AD=A3=E5=85=AC=E5=8F=B8=E5=92=8C?=
 =?UTF-8?q?=E8=81=8C=E4=BD=8D=E4=BF=A1=E6=81=AF=E7=9A=84=E6=8F=90=E5=8F=96?=
 =?UTF-8?q?=E9=80=BB=E8=BE=91=EF=BC=8C=E4=BC=98=E5=8C=96XPath=E8=B0=83?=
 =?UTF-8?q?=E7=94=A8=E5=B9=B6=E5=A2=9E=E5=BC=BA=E4=BB=A3=E7=A0=81=E5=8F=AF?=
 =?UTF-8?q?=E8=AF=BB=E6=80=A7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 TS_resume_spider/pipelines.py                 |  4 ++-
 .../spiders/yutian_top_compary.py             | 29 +++++++++----------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py
index 76ab4bb..7a72154 100644
--- a/TS_resume_spider/pipelines.py
+++ b/TS_resume_spider/pipelines.py
@@ -121,7 +121,8 @@ class YTSavePipeline:
 class CompanySavePipeline:
     def process_item(self, item, spider):
         if spider.name == 'yutian_top_compary':
-            company = item.get("company")
+            company = item.get("compary")
+            # print(f"Processing company: {company}")
             if not company:
                 return item
             if 'website' in company:
@@ -158,6 +159,7 @@ class PositionSavePipeline:
     def process_item(self, item, spider):
         if spider.name == 'yutian_top_compary':
             position = item.get("position")
+            # print(f"Processing position: {position}")
             if not position:
                 return item
             title = position.get("title")
diff --git a/TS_resume_spider/spiders/yutian_top_compary.py b/TS_resume_spider/spiders/yutian_top_compary.py
index dd21dea..a837261 100644
--- a/TS_resume_spider/spiders/yutian_top_compary.py
+++ b/TS_resume_spider/spiders/yutian_top_compary.py
@@ -4,8 +4,8 @@ import re
 
 
 def first_or_empty(xpobj, path):
-    lst = xpobj.xpath(path)
-    return lst[0].strip() if lst else ""
+    text = xpobj.xpath(path).get()
+    return text.strip() if text else ""
 
 
 class YutianTopSpider(scrapy.Spider):
@@ -63,14 +63,14 @@ class YutianTopSpider(scrapy.Spider):
             self.logger.error(f"Not 200 page: {response.url}")
             return
 
-        title = first_or_empty(response.xpath, "//div[@class='top-title']/span/text()")
+        title = first_or_empty(response, "//div[@class='top-title']/span/text()")
         nature = ""
         category = ""
         region = ""
         experience = ""
         education = ""
-        infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()")
-        infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()")
+        infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()").getall()
+        infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()").getall()
         if infokey and infovalue:
             for index, key in enumerate(infokey):
                 if '工作性质' in key:
@@ -85,18 +85,18 @@ class YutianTopSpider(scrapy.Spider):
                     experience = infovalue[index].strip()
                 if "学历要求" in key:
                     education = infovalue[index].strip()
-        salary = first_or_empty(response.xpath, "//div[@class='salary']/text()")
-        description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()")
-        contact_name = first_or_empty(response.xpath,
+        salary = first_or_empty(response, "//div[@class='salary']/text()")
+        description = first_or_empty(response, "//div[@class='job-describe']/text()")
+        contact_name = first_or_empty(response,
                                       "//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
-        contact_info = first_or_empty(response.xpath,
+        contact_info = first_or_empty(response,
                                       "(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
         benefits = ""
         if openings == "若干":
             openings = 1
         else:
             openings = 1
-        company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
+        company_name = first_or_empty(response, "//div[@class='company-name']/a/text()")
         meta = {
             "title": title,  # 职位名称
             "nature": nature,  # 职位性质
@@ -114,7 +114,7 @@ class YutianTopSpider(scrapy.Spider):
             "website_id": 2,  # 网站ID
             "company_name": company_name,  # 所属企业名称
         }
-        company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id")
+        company_id = first_or_empty(response, "//div[@class='job-detail']/@data-io-company-id")
         yield scrapy.Request(
             url=f"https://zp.yutian.top/company/{company_id}.html",
             headers=self.headers,
@@ -124,7 +124,7 @@ class YutianTopSpider(scrapy.Spider):
         )
 
     def parse_company(self, response):
-        name = first_or_empty(response.xpath, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')
+        name = first_or_empty(response, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')
         category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall()
         category_and_size_value = response.xpath(
             '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall()
@@ -144,8 +144,8 @@ class YutianTopSpider(scrapy.Spider):
             elif key == "公司规模 ：":
                 size = val
 
-        introduction = first_or_empty(response.xpath, '//div[@class="job-left-content-des"]/text()')
-        address = first_or_empty(response.xpath, '//div[@class="job-left-content-address"]/text()')
+        introduction = first_or_empty(response, '//div[@class="job-left-content-des"]/text()')
+        address = first_or_empty(response, '//div[@class="job-left-content-address"]/text()')
 
         company_type = ""
         founded_date = ""
@@ -162,7 +162,6 @@ class YutianTopSpider(scrapy.Spider):
             "benefits": benefits_str,  # 员工福利（暂未提取）
             "website_id": 2,  # 来源网站 ID
         }
-
         data = {
             "compary": company_data,
             "position": response.meta,