更新YutianTopSpider爬虫以解析公司信息,增加公司类别、规模及简介的提取逻辑
This commit is contained in:
parent
15be08c866
commit
5e4ff46c80
@ -2,8 +2,6 @@ import scrapy
|
||||
import json
|
||||
import re
|
||||
|
||||
from sympy.benchmarks.bench_meijerint import bench
|
||||
|
||||
|
||||
def first_or_empty(xpobj, path):
|
||||
lst = xpobj.xpath(path)
|
||||
@ -33,7 +31,7 @@ class YutianTopSpider(scrapy.Spider):
|
||||
}
|
||||
|
||||
def start_requests(self):
|
||||
for i in range(1, 39):
|
||||
for i in range(1, 40):
|
||||
yield scrapy.Request(
|
||||
url=f'https://zp.yutian.top/search?keywords=&page={i}',
|
||||
method='GET',
|
||||
@ -96,6 +94,8 @@ class YutianTopSpider(scrapy.Spider):
|
||||
benefits = ""
|
||||
if openings == "若干":
|
||||
openings = 1
|
||||
else:
|
||||
openings = 1
|
||||
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
|
||||
meta = {
|
||||
"title": title, # 职位名称
|
||||
@ -124,5 +124,48 @@ class YutianTopSpider(scrapy.Spider):
|
||||
)
|
||||
|
||||
def parse_company(self, response):
    """Parse a company detail page and yield one company/position record.

    The company name, the header label/value items (category and size),
    the introduction and the address are read from the page via XPath.
    ``response.meta`` is emitted unchanged under the ``"position"`` key;
    presumably it carries the position fields collected by the callback
    that scheduled this request -- confirm against that callback.

    Args:
        response: The Scrapy response for the company page.

    Yields:
        dict: ``{"compary": <company fields>, "position": response.meta}``.
    """
    name = first_or_empty(
        response.xpath,
        '//h1[@class="company-header-top-detail-name hide-txt"]/text()',
    )

    # The page header is rendered as parallel lists of label nodes and
    # value nodes; they are paired up by position below.
    header_labels = response.xpath(
        '//div[@class="company-header-bottom-item-label"]/text()').getall()
    header_values = response.xpath(
        '//div[@class="company-header-bottom-item-text hide-txt"]/text()').getall()

    # Look the fields up by label instead of by fixed position, so a page
    # that lists the items in another order, or shows additional items,
    # still yields category and size. Pairing is only trusted when both
    # lists have the same length; otherwise labels and values cannot be
    # matched reliably and both fields stay empty.
    header_fields = {}
    if len(header_labels) == len(header_values):
        for label, value in zip(header_labels, header_values):
            # First occurrence wins if a label is repeated.
            header_fields.setdefault(label.strip(), value.strip())

    category = header_fields.get("公司类别 :", "")
    size = header_fields.get("公司规模 :", "")

    introduction = first_or_empty(
        response.xpath, '//div[@class="job-left-content-des"]/text()')
    address = first_or_empty(
        response.xpath, '//div[@class="job-left-content-address"]/text()')

    company_data = {
        "name": name,  # company name
        "category": category,  # company industry category
        "size": size,  # company size
        "company_type": "",  # company type (not extracted yet)
        "founded_date": "",  # founding date (not extracted yet)
        "introduction": introduction,  # company introduction
        "address": address,  # company address
        "benefits": "",  # employee benefits (not extracted yet)
        "website_id": 2,  # source website ID
    }

    # NOTE(review): the key is spelled "compary" (sic). It is kept as-is
    # because the consumers of this item are not visible from here --
    # confirm before renaming it to "company".
    yield {
        "compary": company_data,
        "position": response.meta,
    }
|
||||
|
Loading…
x
Reference in New Issue
Block a user