更新YutianTopSpider爬虫以修正公司和职位信息的提取逻辑，优化XPath调用并增强代码可读性
This commit is contained in:
parent
f864ea6bb3
commit
f41404e1fd
@ -121,7 +121,8 @@ class YTSavePipeline:
|
|||||||
class CompanySavePipeline:
|
class CompanySavePipeline:
|
||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
if spider.name == 'yutian_top_compary':
|
if spider.name == 'yutian_top_compary':
|
||||||
company = item.get("company")
|
company = item.get("compary")
|
||||||
|
# print(f"Processing company: {company}")
|
||||||
if not company:
|
if not company:
|
||||||
return item
|
return item
|
||||||
if 'website' in company:
|
if 'website' in company:
|
||||||
@ -158,6 +159,7 @@ class PositionSavePipeline:
|
|||||||
def process_item(self, item, spider):
|
def process_item(self, item, spider):
|
||||||
if spider.name == 'yutian_top_compary':
|
if spider.name == 'yutian_top_compary':
|
||||||
position = item.get("position")
|
position = item.get("position")
|
||||||
|
# print(f"Processing position: {position}")
|
||||||
if not position:
|
if not position:
|
||||||
return item
|
return item
|
||||||
title = position.get("title")
|
title = position.get("title")
|
||||||
|
@ -4,8 +4,8 @@ import re
|
|||||||
|
|
||||||
|
|
||||||
def first_or_empty(xpobj, path):
|
def first_or_empty(xpobj, path):
|
||||||
lst = xpobj.xpath(path)
|
text = xpobj.xpath(path).get()
|
||||||
return lst[0].strip() if lst else ""
|
return text.strip() if text else ""
|
||||||
|
|
||||||
|
|
||||||
class YutianTopSpider(scrapy.Spider):
|
class YutianTopSpider(scrapy.Spider):
|
||||||
@ -63,14 +63,14 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
self.logger.error(f"Not 200 page: {response.url}")
|
self.logger.error(f"Not 200 page: {response.url}")
|
||||||
return
|
return
|
||||||
|
|
||||||
title = first_or_empty(response.xpath, "//div[@class='top-title']/span/text()")
|
title = first_or_empty(response, "//div[@class='top-title']/span/text()")
|
||||||
nature = ""
|
nature = ""
|
||||||
category = ""
|
category = ""
|
||||||
region = ""
|
region = ""
|
||||||
experience = ""
|
experience = ""
|
||||||
education = ""
|
education = ""
|
||||||
infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()")
|
infokey = response.xpath("//span[@class='job-info-item']/span[@class='label']/text()").getall()
|
||||||
infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()")
|
infovalue = response.xpath("//span[@class='job-info-item']/span[@class='value hide-text']/text()").getall()
|
||||||
if infokey and infovalue:
|
if infokey and infovalue:
|
||||||
for index, key in enumerate(infokey):
|
for index, key in enumerate(infokey):
|
||||||
if '工作性质' in key:
|
if '工作性质' in key:
|
||||||
@ -85,18 +85,18 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
experience = infovalue[index].strip()
|
experience = infovalue[index].strip()
|
||||||
if "学历要求" in key:
|
if "学历要求" in key:
|
||||||
education = infovalue[index].strip()
|
education = infovalue[index].strip()
|
||||||
salary = first_or_empty(response.xpath, "//div[@class='salary']/text()")
|
salary = first_or_empty(response, "//div[@class='salary']/text()")
|
||||||
description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()")
|
description = first_or_empty(response, "//div[@class='job-describe']/text()")
|
||||||
contact_name = first_or_empty(response.xpath,
|
contact_name = first_or_empty(response,
|
||||||
"//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
|
"//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
|
||||||
contact_info = first_or_empty(response.xpath,
|
contact_info = first_or_empty(response,
|
||||||
"(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
|
"(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
|
||||||
benefits = ""
|
benefits = ""
|
||||||
if openings == "若干":
|
if openings == "若干":
|
||||||
openings = 1
|
openings = 1
|
||||||
else:
|
else:
|
||||||
openings = 1
|
openings = 1
|
||||||
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
|
company_name = first_or_empty(response, "//div[@class='company-name']/a/text()")
|
||||||
meta = {
|
meta = {
|
||||||
"title": title, # 职位名称
|
"title": title, # 职位名称
|
||||||
"nature": nature, # 职位性质
|
"nature": nature, # 职位性质
|
||||||
@ -114,7 +114,7 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
"website_id": 2, # 网站ID
|
"website_id": 2, # 网站ID
|
||||||
"company_name": company_name, # 所属企业名称
|
"company_name": company_name, # 所属企业名称
|
||||||
}
|
}
|
||||||
company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id")
|
company_id = first_or_empty(response, "//div[@class='job-detail']/@data-io-company-id")
|
||||||
yield scrapy.Request(
|
yield scrapy.Request(
|
||||||
url=f"https://zp.yutian.top/company/{company_id}.html",
|
url=f"https://zp.yutian.top/company/{company_id}.html",
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
@ -124,7 +124,7 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def parse_company(self, response):
|
def parse_company(self, response):
|
||||||
name = first_or_empty(response.xpath, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')
|
name = first_or_empty(response, '//h1[@class="company-header-top-detail-name hide-txt"]/text()')
|
||||||
category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall()
|
category_and_size_key = response.xpath('//div[@class="company-header-bottom-item-label"]/text()').getall()
|
||||||
category_and_size_value = response.xpath(
|
category_and_size_value = response.xpath(
|
||||||
'//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall()
|
'//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall()
|
||||||
@ -144,8 +144,8 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
elif key == "公司规模 :":
|
elif key == "公司规模 :":
|
||||||
size = val
|
size = val
|
||||||
|
|
||||||
introduction = first_or_empty(response.xpath, '//div[@class="job-left-content-des"]/text()')
|
introduction = first_or_empty(response, '//div[@class="job-left-content-des"]/text()')
|
||||||
address = first_or_empty(response.xpath, '//div[@class="job-left-content-address"]/text()')
|
address = first_or_empty(response, '//div[@class="job-left-content-address"]/text()')
|
||||||
|
|
||||||
company_type = ""
|
company_type = ""
|
||||||
founded_date = ""
|
founded_date = ""
|
||||||
@ -162,7 +162,6 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
"benefits": benefits_str, # 员工福利(暂未提取)
|
"benefits": benefits_str, # 员工福利(暂未提取)
|
||||||
"website_id": 2, # 来源网站 ID
|
"website_id": 2, # 来源网站 ID
|
||||||
}
|
}
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"compary": company_data,
|
"compary": company_data,
|
||||||
"position": response.meta,
|
"position": response.meta,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user