更新YutianTopSpider爬虫以提取职位信息,增加公司ID的获取和公司信息解析逻辑
This commit is contained in:
parent
b13afeb51c
commit
15be08c866
@ -2,8 +2,7 @@ import scrapy
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lxml.extensions import xpath_code
|
from sympy.benchmarks.bench_meijerint import bench
|
||||||
from openpyxl.styles.builtins import title
|
|
||||||
|
|
||||||
|
|
||||||
def first_or_empty(xpobj, path):
|
def first_or_empty(xpobj, path):
|
||||||
@ -36,7 +35,7 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
for i in range(1, 39):
|
for i in range(1, 39):
|
||||||
yield scrapy.Request(
|
yield scrapy.Request(
|
||||||
url=self.start_urls[0],
|
url=f'https://zp.yutian.top/search?keywords=&page={i}',
|
||||||
method='GET',
|
method='GET',
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
dont_filter=True,
|
dont_filter=True,
|
||||||
@ -88,9 +87,17 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
experience = infovalue[index].strip()
|
experience = infovalue[index].strip()
|
||||||
if "学历要求" in key:
|
if "学历要求" in key:
|
||||||
education = infovalue[index].strip()
|
education = infovalue[index].strip()
|
||||||
# TODO: 未完继续
|
salary = first_or_empty(response.xpath, "//div[@class='salary']/text()")
|
||||||
|
description = first_or_empty(response.xpath, "//div[@class='job-describe']/text()")
|
||||||
yield {
|
contact_name = first_or_empty(response.xpath,
|
||||||
|
"//div[@class='bg-mask']/div[@class='connect-info-item']/span[@class='value']/text()")
|
||||||
|
contact_info = first_or_empty(response.xpath,
|
||||||
|
"(//div[@class='bg-mask']//div[@class='connect-info-item'])[2]/span[@class='value']/text()")
|
||||||
|
benefits = ""
|
||||||
|
if openings == "若干":
|
||||||
|
openings = 1
|
||||||
|
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
|
||||||
|
meta = {
|
||||||
"title": title, # 职位名称
|
"title": title, # 职位名称
|
||||||
"nature": nature, # 职位性质
|
"nature": nature, # 职位性质
|
||||||
"category": category, # 职位类别
|
"category": category, # 职位类别
|
||||||
@ -98,7 +105,7 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
"experience": experience, # 工作经历要求
|
"experience": experience, # 工作经历要求
|
||||||
"education": education, # 学历要求
|
"education": education, # 学历要求
|
||||||
"salary": salary, # 职位薪资
|
"salary": salary, # 职位薪资
|
||||||
"position_status": position_status, # 职位状态
|
"position_status": 1, # 职位状态
|
||||||
"description": description, # 职位描述(详情)
|
"description": description, # 职位描述(详情)
|
||||||
"contact_name": contact_name, # 联系人姓名
|
"contact_name": contact_name, # 联系人姓名
|
||||||
"contact_info": contact_info, # 联系方式
|
"contact_info": contact_info, # 联系方式
|
||||||
@ -107,3 +114,15 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
"website_id": 2, # 网站ID
|
"website_id": 2, # 网站ID
|
||||||
"company_name": company_name, # 所属企业名称
|
"company_name": company_name, # 所属企业名称
|
||||||
}
|
}
|
||||||
|
company_id = first_or_empty(response.xpath, "//div[@class='job-detail']/@data-io-company-id")
|
||||||
|
yield scrapy.Request(
|
||||||
|
url=f"https://zp.yutian.top/company/{company_id}.html",
|
||||||
|
headers=self.headers,
|
||||||
|
callback=self.parse_company,
|
||||||
|
meta=meta,
|
||||||
|
dont_filter=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
def parse_company(self, response):
|
||||||
|
pass
|
||||||
|
# TODO: 解析公司信息
|
||||||
|
Loading…
x
Reference in New Issue
Block a user