From f41404e1fdc6e2018245ecdea85148a3c9e99069 Mon Sep 17 00:00:00 2001 From: Franklin-F Date: Sun, 8 Jun 2025 17:12:45 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0YutianTopSpider=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E4=BB=A5=E4=BF=AE=E6=AD=A3=E5=85=AC=E5=8F=B8=E5=92=8C?= =?UTF-8?q?=E8=81=8C=E4=BD=8D=E4=BF=A1=E6=81=AF=E7=9A=84=E6=8F=90=E5=8F=96?= =?UTF-8?q?=E9=80=BB=E8=BE=91=EF=BC=8C=E4=BC=98=E5=8C=96XPath=E8=B0=83?= =?UTF-8?q?=E7=94=A8=E5=B9=B6=E5=A2=9E=E5=BC=BA=E4=BB=A3=E7=A0=81=E5=8F=AF?= =?UTF-8?q?=E8=AF=BB=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TS_resume_spider/pipelines.py | 4 ++- .../spiders/yutian_top_compary.py | 29 +++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py index 76ab4bb..7a72154 100644 --- a/TS_resume_spider/pipelines.py +++ b/TS_resume_spider/pipelines.py @@ -121,7 +121,8 @@ class YTSavePipeline: class CompanySavePipeline: def process_item(self, item, spider): if spider.name == 'yutian_top_compary': - company = item.get("company") + company = item.get("compary") + # print(f"Processing company: {company}") if not company: return item if 'website' in company: @@ -158,6 +159,7 @@ class PositionSavePipeline: def process_item(self, item, spider): if spider.name == 'yutian_top_compary': position = item.get("position") + # print(f"Processing position: {position}") if not position: return item title = position.get("title") diff --git a/TS_resume_spider/spiders/yutian_top_compary.py b/TS_resume_spider/spiders/yutian_top_compary.py index dd21dea..a837261 100644 --- a/TS_resume_spider/spiders/yutian_top_compary.py +++ b/TS_resume_spider/spiders/yutian_top_compary.py @@ -4,8 +4,8 @@ import re def first_or_empty(xpobj, path): - lst = xpobj.xpath(path) - return lst[0].strip() if lst else "" + text = xpobj.xpath(path).get() + return text.strip() if text else "" class YutianTopSpider(scrapy.Spider): @@ -63,14 +63,14 @@ class YutianTopSpider(scrapy.Spider): self.logger.error(f"Not 200 page: {response.url}") return - title = first_or_empty(response.xpath, "//div[@class='top-title']/span/text()") + title = first_or_empty(response, "//div[@class='top-title']/span/text()") nature = "" category = "" region = "" experience = "" education = "" - infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()") - infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()") + infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()").getall() + infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()").getall() if infokey and infovalue: for index, key in enumerate(infokey): if '工作性质' in key: @@ -85,18 +85,18 @@ class YutianTopSpider(scrapy.Spider): experience = infovalue[index].strip() if "学历要求" in key: education = infovalue[index].strip() - salary = first_or_empty(response.xpath, "//div[@class='salary']/text()") - description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()") - contact_name = first_or_empty(response.xpath, + salary = first_or_empty(response, "//div[@class='salary']/text()") + description = first_or_empty(response, "//div[@class='job-describe']/text()") + contact_name = first_or_empty(response, "//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()") - contact_info = first_or_empty(response.xpath, + contact_info = first_or_empty(response, "(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()") benefits = "" if openings == "若干": openings = 1 else: openings = 1 - company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()") + company_name = first_or_empty(response, "//div[@class='company-name']/a/text()") meta = { "title": title, # 职位名称 "nature": nature, # 职位性质 @@ -114,7 +114,7 @@ class YutianTopSpider(scrapy.Spider): "website_id": 2, # 网站ID "company_name": company_name, # 所属企业名称 } - company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id") + company_id = first_or_empty(response, "//div[@class='job-detail']/@data-io-company-id") yield scrapy.Request( url=f"https://zp.yutian.top/company/{company_id}.html", headers=self.headers, @@ -124,7 +124,7 @@ class YutianTopSpider(scrapy.Spider): ) def parse_company(self, response): - name = first_or_empty(response.xpath, '//h1[@class="company-header-top-detail-name hide-txt"]/text()') + name = first_or_empty(response, '//h1[@class="company-header-top-detail-name hide-txt"]/text()') category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall() category_and_size_value = response.xpath( '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall() @@ -144,8 +144,8 @@ class YutianTopSpider(scrapy.Spider): elif key == "公司规模 :": size = val - introduction = first_or_empty(response.xpath, '//div[@class="job-left-content-des"]/text()') - address = first_or_empty(response.xpath, '//div[@class="job-left-content-address"]/text()') + introduction = first_or_empty(response, '//div[@class="job-left-content-des"]/text()') + address = first_or_empty(response, '//div[@class="job-left-content-address"]/text()') company_type = "" founded_date = "" @@ -162,7 +162,6 @@ class YutianTopSpider(scrapy.Spider): "benefits": benefits_str, # 员工福利(暂未提取) "website_id": 2, # 来源网站 ID } - data = { "compary": company_data, "position": response.meta,