diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py index 76ab4bb..7a72154 100644 --- a/TS_resume_spider/pipelines.py +++ b/TS_resume_spider/pipelines.py @@ -121,7 +121,8 @@ class YTSavePipeline: class CompanySavePipeline: def process_item(self, item, spider): if spider.name == 'yutian_top_compary': - company = item.get("company") + company = item.get("compary") + # print(f"Processing company: {company}") if not company: return item if 'website' in company: @@ -158,6 +159,7 @@ class PositionSavePipeline: def process_item(self, item, spider): if spider.name == 'yutian_top_compary': position = item.get("position") + # print(f"Processing position: {position}") if not position: return item title = position.get("title") diff --git a/TS_resume_spider/spiders/yutian_top_compary.py b/TS_resume_spider/spiders/yutian_top_compary.py index dd21dea..a837261 100644 --- a/TS_resume_spider/spiders/yutian_top_compary.py +++ b/TS_resume_spider/spiders/yutian_top_compary.py @@ -4,8 +4,8 @@ import re def first_or_empty(xpobj, path): - lst = xpobj.xpath(path) - return lst[0].strip() if lst else "" + text = xpobj.xpath(path).get() + return text.strip() if text else "" class YutianTopSpider(scrapy.Spider): @@ -63,14 +63,14 @@ class YutianTopSpider(scrapy.Spider): self.logger.error(f"Not 200 page: {response.url}") return - title = first_or_empty(response.xpath, "//div[@class='top-title']/span/text()") + title = first_or_empty(response, "//div[@class='top-title']/span/text()") nature = "" category = "" region = "" experience = "" education = "" - infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()") - infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()") + infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()").getall() + infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()").getall() if infokey and infovalue: for index, key in enumerate(infokey): if '工作性质' in key: @@ -85,18 +85,18 @@ class YutianTopSpider(scrapy.Spider): experience = infovalue[index].strip() if "学历要求" in key: education = infovalue[index].strip() - salary = first_or_empty(response.xpath, "//div[@class='salary']/text()") - description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()") - contact_name = first_or_empty(response.xpath, + salary = first_or_empty(response, "//div[@class='salary']/text()") + description = first_or_empty(response, "//div[@class='job-describe']/text()") + contact_name = first_or_empty(response, "//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()") - contact_info = first_or_empty(response.xpath, + contact_info = first_or_empty(response, "(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()") benefits = "" if openings == "若干": openings = 1 else: openings = 1 - company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()") + company_name = first_or_empty(response, "//div[@class='company-name']/a/text()") meta = { "title": title, # 职位名称 "nature": nature, # 职位性质 @@ -114,7 +114,7 @@ class YutianTopSpider(scrapy.Spider): "website_id": 2, # 网站ID "company_name": company_name, # 所属企业名称 } - company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id") + company_id = first_or_empty(response, "//div[@class='job-detail']/@data-io-company-id") yield scrapy.Request( url=f"https://zp.yutian.top/company/{company_id}.html", headers=self.headers, @@ -124,7 +124,7 @@ class YutianTopSpider(scrapy.Spider): ) def parse_company(self, response): - name = first_or_empty(response.xpath, '//h1[@class="company-header-top-detail-name hide-txt"]/text()') + name = first_or_empty(response, '//h1[@class="company-header-top-detail-name hide-txt"]/text()') category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall() category_and_size_value = response.xpath( '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall() @@ -144,8 +144,8 @@ class YutianTopSpider(scrapy.Spider): elif key == "公司规模 :": size = val - introduction = first_or_empty(response.xpath, '//div[@class="job-left-content-des"]/text()') - address = first_or_empty(response.xpath, '//div[@class="job-left-content-address"]/text()') + introduction = first_or_empty(response, '//div[@class="job-left-content-des"]/text()') + address = first_or_empty(response, '//div[@class="job-left-content-address"]/text()') company_type = "" founded_date = "" @@ -162,7 +162,6 @@ class YutianTopSpider(scrapy.Spider): "benefits": benefits_str, # 员工福利(暂未提取) "website_id": 2, # 来源网站 ID } - data = { "compary": company_data, "position": response.meta,