Add ZunHuaComSpider to extract job posting information and implement the data parsing logic
parent 542f2ce0bd
commit 03b56ae1b2
TS_resume_spider/spiders/zhrczp_com_position.py (new file, +119)
@@ -0,0 +1,119 @@
import re

import scrapy
from lxml import etree


def first_or_empty(xpobj, path):
    """Return the first XPath match, stripped of whitespace, or "" if there is none."""
    lst = xpobj.xpath(path)
    return lst[0].strip() if lst else ""


def extract_position_data(xpathobj):
    """Build a position item dict from a parsed job-detail page, or return None."""
    title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
    if not title:
        return None

    nature = "全职"
    category = first_or_empty(xpathobj, '//div[@class="job_details_category"]/text()')

    # The info block normally carries region / experience / education; a fourth
    # entry containing "应届" (fresh graduate) is appended to the education field.
    region_info = [t.strip() for t in xpathobj.xpath('//div[@class="job_details_info"]/text()') if t.strip()]
    region = ""
    experience = ""
    education = ""
    if len(region_info) == 3:
        region, experience, education = region_info
    elif len(region_info) == 4 and "应届" in region_info[3]:
        region = region_info[0]
        experience = region_info[1]
        education = region_info[2] + " " + region_info[3]

    salary = first_or_empty(xpathobj, '//span[@class="job_details_salary_n"]/text()')
    position_status = 1

    description_list = xpathobj.xpath('//div[@class="job_details_describe"]/text()')
    description = "\r\n".join([d.strip() for d in description_list if d.strip()])

    contact_name = first_or_empty(xpathobj, '//span[@class="job_details_touch_username"]/text()')
    contact_info = first_or_empty(xpathobj, '//span[@class="job_details_touch_tel_n"]/text()')

    # The class attribute is matched verbatim, including the trailing space.
    benefits_list = xpathobj.xpath('//div[@class="job_details_welfare "]/span/text()')
    benefits = " | ".join([b.strip() for b in benefits_list if b.strip()])

    # Number of openings defaults to 1 unless a figure can be parsed from the page.
    openings = 1
    openings_str = first_or_empty(xpathobj, '//span[@class="job_details_describe_yq"]/text()')
    if openings_str:
        nums = re.findall(r"(\d+)", openings_str)
        if nums:
            openings = int(nums[0])

    company_name = first_or_empty(xpathobj, '//div[@class="Compply_right_name"]/a/text()')
    if not company_name:
        return None

    return {
        "title": title,
        "nature": nature,
        "category": category,
        "region": region,
        "experience": experience,
        "education": education,
        "salary": salary,
        "position_status": position_status,
        "description": description,
        "contact_name": contact_name,
        "contact_info": contact_info,
        "benefits": benefits,
        "openings": openings,
        "website_id": 1,
        "company_name": company_name,
    }


def get_position_href(xpathobj):
    """Collect the job-detail links from a listing page."""
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    return [href.strip() for href in hrefs if href.strip()]


class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com_position'
    allowed_domains = ['zhrczp.com', 'www.zhrczp.com']
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self):
        # range(1, 2) crawls only the first listing page; widen the range to paginate.
        for page in range(1, 2):
            yield scrapy.Request(
                url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
                headers=self.headers,
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        self.logger.info(f"Parsing page: {response.url}")
        # The listing page is parsed with lxml rather than response.xpath().
        xpathobj = etree.HTML(response.text)
        position_hrefs = get_position_href(xpathobj)
        for href in position_hrefs:
            yield scrapy.Request(
                url=href,
                headers=self.headers,
                callback=self.parse_position,
                dont_filter=True,
            )

    def parse_position(self, response):
        self.logger.info(f"Parsing position: {response.url}")
        xpath_object = etree.HTML(response.text)
        position_data = extract_position_data(xpath_object)
        if position_data:
            self.logger.info(position_data)
            yield position_data
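
When adjusting the XPath expressions, extract_position_data can also be exercised offline against a saved copy of a detail page instead of crawling the site. A minimal sketch, assuming a hypothetical local fixture file job_detail_sample.html:

    from lxml import etree
    from TS_resume_spider.spiders.zhrczp_com_position import extract_position_data

    # job_detail_sample.html is a hypothetical local copy of a job-detail page.
    with open("job_detail_sample.html", encoding="utf-8") as f:
        xpathobj = etree.HTML(f.read())

    item = extract_position_data(xpathobj)
    print(item)  # None when the page lacks a title or company name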