更新YutianTopSpider爬虫以解析公司信息,增加公司类别、规模及简介的提取逻辑
This commit is contained in:
parent
15be08c866
commit
5e4ff46c80
@ -2,8 +2,6 @@ import scrapy
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from sympy.benchmarks.bench_meijerint import bench
|
|
||||||
|
|
||||||
|
|
||||||
def first_or_empty(xpobj, path):
|
def first_or_empty(xpobj, path):
|
||||||
lst = xpobj.xpath(path)
|
lst = xpobj.xpath(path)
|
||||||
@ -33,7 +31,7 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
for i in range(1, 39):
|
for i in range(1, 40):
|
||||||
yield scrapy.Request(
|
yield scrapy.Request(
|
||||||
url=f'https://zp.yutian.top/search?keywords=&page={i}',
|
url=f'https://zp.yutian.top/search?keywords=&page={i}',
|
||||||
method='GET',
|
method='GET',
|
||||||
@ -96,6 +94,8 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
benefits = ""
|
benefits = ""
|
||||||
if openings == "若干":
|
if openings == "若干":
|
||||||
openings = 1
|
openings = 1
|
||||||
|
else:
|
||||||
|
openings = 1
|
||||||
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
|
company_name = first_or_empty(response.xpath, "//div[@class='company-name']/a/text()")
|
||||||
meta = {
|
meta = {
|
||||||
"title": title, # 职位名称
|
"title": title, # 职位名称
|
||||||
@ -124,5 +124,48 @@ class YutianTopSpider(scrapy.Spider):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def parse_company(self, response):
    """Parse a company detail page and yield it together with the job data.

    Extracts the company name, category, size, introduction and address
    from ``response`` and yields one dict with two entries:

    * ``"compary"``  -- the company fields collected here (the key spelling
      is kept as-is; NOTE(review): it looks like a typo for "company", but
      the consumers of this item are not visible from here -- confirm
      before renaming).
    * ``"position"`` -- ``response.meta`` passed through unchanged.
      NOTE(review): presumably the job dict attached by the requesting
      callback; verify that no unrelated meta keys leak into the item.

    Fields that the page does not provide are emitted as empty strings.
    """
    # NOTE(review): first_or_empty is handed the bound ``response.xpath``
    # here, the same way the job-parsing callback calls it -- confirm that
    # this matches what the helper expects.
    name = first_or_empty(
        response.xpath,
        '//h1[@class="company-header-top-detail-name hide-txt"]/text()',
    )

    # The header shows "label / text" pairs (category, size). Either one
    # may be missing, so match them by label text instead of by position.
    labels = response.xpath(
        '//div[@class="company-header-bottom-item-label"]/text()'
    ).getall()
    values = response.xpath(
        '//div[@class="company-header-bottom-item-text hide-txt"]/text()'
    ).getall()

    header = {}
    # Only pair labels with values when the counts agree; otherwise the
    # pairing would be a guess and could attach a value to the wrong label.
    if len(labels) == len(values):
        for label, value in zip(labels, values):
            # setdefault keeps the first occurrence if a label repeats.
            header.setdefault(label.strip(), value.strip())

    category = header.get("公司类别 :", "")
    size = header.get("公司规模 :", "")

    introduction = first_or_empty(
        response.xpath, '//div[@class="job-left-content-des"]/text()'
    )
    address = first_or_empty(
        response.xpath, '//div[@class="job-left-content-address"]/text()'
    )

    company_data = {
        "name": name,  # company name
        "category": category,  # company industry category
        "size": size,  # company size
        "company_type": "",  # company ownership type (not extracted yet)
        "founded_date": "",  # founding date (not extracted yet)
        "introduction": introduction,  # company introduction
        "address": address,  # company address
        "benefits": "",  # employee benefits (not extracted yet)
        "website_id": 2,  # source website ID
    }

    yield {
        "compary": company_data,
        "position": response.meta,
    }
|
||||||
|
Loading…
x
Reference in New Issue
Block a user