From 5e4ff46c80bcf2e8e89e934d16e151cb552eb482 Mon Sep 17 00:00:00 2001 From: Franklin-F Date: Sun, 8 Jun 2025 14:53:36 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0YutianTopSpider=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E4=BB=A5=E8=A7=A3=E6=9E=90=E5=85=AC=E5=8F=B8=E4=BF=A1?= =?UTF-8?q?=E6=81=AF=EF=BC=8C=E5=A2=9E=E5=8A=A0=E5=85=AC=E5=8F=B8=E7=B1=BB?= =?UTF-8?q?=E5=88=AB=E3=80=81=E8=A7=84=E6=A8=A1=E5=8F=8A=E7=AE=80=E4=BB=8B?= =?UTF-8?q?=E7=9A=84=E6=8F=90=E5=8F=96=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../spiders/yutian_top_compary.py | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/TS_resume_spider/spiders/yutian_top_compary.py b/TS_resume_spider/spiders/yutian_top_compary.py index abff592..dd21dea 100644 --- a/TS_resume_spider/spiders/yutian_top_compary.py +++ b/TS_resume_spider/spiders/yutian_top_compary.py @@ -2,8 +2,6 @@ import scrapy import json import re -from sympy.benchmarks.bench_meijerint import bench - def first_or_empty(xpobj, path): lst = xpobj.xpath(path) @@ -33,7 +31,7 @@ class YutianTopSpider(scrapy.Spider): } def start_requests(self): - for i in range(1, 39): + for i in range(1, 40): yield scrapy.Request( url=f'https://zp.yutian.top/search?keywords=&page={i}', method='GET', @@ -96,6 +94,8 @@ class YutianTopSpider(scrapy.Spider): benefits = "" if openings == "若干": openings = 1 + else: + openings = 1 company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()") meta = { "title": title, # 职位名称 @@ -124,5 +124,48 @@ class YutianTopSpider(scrapy.Spider): ) def parse_company(self, response): - pass - # TODO: 解析公司信息 + name = first_or_empty(response.xpath, '//h1[@class="company-header-top-detail-name hide-txt"]/text()') + category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall() + category_and_size_value = response.xpath( + '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall() + + category = "" + size = "" + if len(category_and_size_key) == 2 and len(category_and_size_value) == 2: + if category_and_size_key[0].strip() == "公司类别 :": + category = category_and_size_value[0].strip() + if category_and_size_key[1].strip() == "公司规模 :": + size = category_and_size_value[1].strip() + elif len(category_and_size_key) == 1 and len(category_and_size_value) == 1: + key = category_and_size_key[0].strip() + val = category_and_size_value[0].strip() + if key == "公司类别 :": + category = val + elif key == "公司规模 :": + size = val + + introduction = first_or_empty(response.xpath, '//div[@class="job-left-content-des"]/text()') + address = first_or_empty(response.xpath, '//div[@class="job-left-content-address"]/text()') + + company_type = "" + founded_date = "" + benefits_str = "" + + company_data = { + "name": name, # 公司名称 + "category": category, # 公司行业类别 + "size": size, # 公司规模 + "company_type": company_type, # 公司性质(暂未提取) + "founded_date": founded_date, # 成立时间(暂未提取) + "introduction": introduction, # 公司简介 + "address": address, # 公司地址 + "benefits": benefits_str, # 员工福利(暂未提取) + "website_id": 2, # 来源网站 ID + } + + data = { + "compary": company_data, + "position": response.meta, + } + + yield data