更新YutianTopSpider爬虫以提取职位信息，增加公司ID的获取和公司信息解析逻辑

2025-05-27 23:09:47 +08:00 · 2025-05-27 23:09:47 +08:00 · 15be08c866
commit 15be08c866
parent b13afeb51c
1 changed files with 40 additions and 21 deletions
--- a/TS_resume_spider/spiders/yutian_top_compary.py
+++ b/TS_resume_spider/spiders/yutian_top_compary.py
@@ -2,8 +2,7 @@ import scrapy
 import json
 import re

-from lxml.extensions import xpath_code
-from openpyxl.styles.builtins import title
+from sympy.benchmarks.bench_meijerint import bench


 def first_or_empty(xpobj, path):
@@ -36,7 +35,7 @@ class YutianTopSpider(scrapy.Spider):
    def start_requests(self):
        for i in range(1, 39):
            yield scrapy.Request(
-                url=self.start_urls[0],
+                url=f'https://zp.yutian.top/search?keywords=&page={i}',
                method='GET',
                headers=self.headers,
                dont_filter=True,
@@ -88,22 +87,42 @@ class YutianTopSpider(scrapy.Spider):
                    experience = infovalue[index].strip()
                if "学历要求" in key:
                    education = infovalue[index].strip()
-        # TODO: 未完继续
-
-        yield {
-            "title": title, # 职位名称
-            "nature": nature, # 职位性质
-            "category": category, # 职位类别
-            "region": region, # 职位区域
-            "experience": experience, # 工作经历要求
-            "education": education, # 学历要求
-            "salary": salary, # 职位薪资
-            "position_status": position_status, # 职位状态
-            "description": description, # 职位描述(详情)
-            "contact_name": contact_name, # 联系人姓名
-            "contact_info": contact_info, # 联系方式
-            "benefits": benefits, # 职位福利
-            "openings": openings, # 招聘人数
-            "website_id": 2, # 网站ID
-            "company_name": company_name, # 所属企业名称
+        salary = first_or_empty(response.xpath, "//div[@class='salary']/text()")
+        description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()")
+        contact_name = first_or_empty(response.xpath,
+                                      "//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
+        contact_info = first_or_empty(response.xpath,
+                                      "(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
+        benefits = ""
+        if openings == "若干":
+            openings = 1
+        company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
+        meta = {
+            "title": title,  # 职位名称
+            "nature": nature,  # 职位性质
+            "category": category,  # 职位类别
+            "region": region,  # 职位区域
+            "experience": experience,  # 工作经历要求
+            "education": education,  # 学历要求
+            "salary": salary,  # 职位薪资
+            "position_status": 1,  # 职位状态
+            "description": description,  # 职位描述(详情)
+            "contact_name": contact_name,  # 联系人姓名
+            "contact_info": contact_info,  # 联系方式
+            "benefits": benefits,  # 职位福利
+            "openings": openings,  # 招聘人数
+            "website_id": 2,  # 网站ID
+            "company_name": company_name,  # 所属企业名称
        }
+        company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id")
+        yield scrapy.Request(
+            url=f"https://zp.yutian.top/company/{company_id}.html",
+            headers=self.headers,
+            callback=self.parse_company,
+            meta=meta,
+            dont_filter=True,
+        )
+
+    def parse_company(self, response):
+        pass
+        # TODO: 解析公司信息