添加YutianTopSpider爬虫以提取职位信息并解析公司详情;更新.gitignore以排除IDE配置文件
This commit is contained in:
parent
dc80fb6c72
commit
b13afeb51c
2
.gitignore
vendored
2
.gitignore
vendored
@ -45,7 +45,7 @@ static/
|
|||||||
staticfiles/
|
staticfiles/
|
||||||
|
|
||||||
# === IDE 配置 ===
|
# === IDE 配置 ===
|
||||||
.idea/ # PyCharm
|
# PyCharm
.idea/*
|
||||||
*.iml
|
*.iml
|
||||||
*.ipr
|
*.ipr
|
||||||
*.iws
|
*.iws
|
||||||
|
109
TS_resume_spider/spiders/yutian_top_compary.py
Normal file
109
TS_resume_spider/spiders/yutian_top_compary.py
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
import scrapy
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
from lxml.extensions import xpath_code
|
||||||
|
from openpyxl.styles.builtins import title
|
||||||
|
|
||||||
|
|
||||||
|
def first_or_empty(xpobj, path):
    """Return the first result of an XPath query as stripped text, or "".

    Args:
        xpobj: Either an object exposing an ``xpath(path)`` method, or the
            bound ``xpath`` callable itself (the spider in this file passes
            ``response.xpath``).
        path: XPath expression to evaluate.

    Returns:
        The first match with surrounding whitespace removed, or an empty
        string when the query matches nothing.
    """
    # Accept both the queryable object and its bound ``xpath`` method.
    query = getattr(xpobj, "xpath", xpobj)
    matches = query(path)
    if not matches:
        return ""

    first = matches[0]
    if not isinstance(first, str):
        # Selector-style results carry their text behind ``get()``;
        # plain string results are used as they are.
        extract = getattr(first, "get", None)
        if callable(extract):
            first = extract()
    return first.strip() if first else ""
|
||||||
|
|
||||||
|
|
||||||
|
class YutianTopSpider(scrapy.Spider):
    """Spider that collects job postings from zp.yutian.top.

    The search page is fetched, every ``onclick`` handler that references a
    ``/position/<id>.html`` URL is followed, and one item per position page
    is yielded by :meth:`parse_compary`.
    """

    name = 'yutian_top_compary'
    allowed_domains = ['yutian.top']
    base_url = "https://zp.yutian.top"
    start_urls = ['https://zp.yutian.top/search']
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.yutian.top',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
    }

    # Matches the position detail path embedded in an ``onclick`` handler.
    _POSITION_PATH_RE = re.compile(r"/position/\d+\.html")

    # Label fragment shown on the position page -> key of the yielded item.
    _INFO_LABELS = {
        "工作性质": "nature",
        "职位类别": "category",
        "工作区域": "region",
        "招聘人数": "openings",
        "工作年限": "experience",
        "学历要求": "education",
    }

    def start_requests(self):
        """Yield the initial requests for the search page."""
        # NOTE(review): all 38 requests target the same URL without any page
        # parameter, so each one presumably returns the same listing. The
        # pagination parameter is not visible here - TODO confirm and add it.
        for _ in range(1, 39):
            yield scrapy.Request(
                url=self.start_urls[0],
                method='GET',
                headers=self.headers,
                dont_filter=True,
                callback=self.parse,
            )

    def parse(self, response):
        """Follow every position link found in ``onclick`` attributes.

        Args:
            response: Response for the search page.

        Yields:
            One request per matching ``onclick`` handler, handled by
            :meth:`parse_compary`.
        """
        status_code = response.status
        if status_code != 200:
            self.logger.error(f"Request failed with status code: {status_code}")
            return

        for node in response.xpath('//*[@onclick]'):
            onclick = node.attrib.get("onclick", "")
            match = self._POSITION_PATH_RE.search(onclick)
            if not match:
                continue
            yield scrapy.Request(
                url=self.base_url + match.group(0),
                headers=self.headers,
                callback=self.parse_compary,
                dont_filter=True,
            )

    def parse_compary(self, response):
        """Extract one job posting item from a position detail page.

        Args:
            response: Response for a ``/position/<id>.html`` page.

        Yields:
            A dict describing the position. Fields whose extraction is not
            implemented yet are emitted as empty strings.
        """
        self.logger.info(f"Parsing compary: {response.url}")
        if response.status != 200:
            self.logger.error(f"Not 200 page: {response.url}")
            return

        title = response.xpath(
            "//div[@class='top-title']/span/text()"
        ).get(default="").strip()

        # Every labelled field starts empty so the item always has all keys.
        info = dict.fromkeys(self._INFO_LABELS.values(), "")

        # Read label and value from the same item node so the two can never
        # get out of step when an item lacks one of them.
        for item in response.xpath("//span[@class='job-info-item']"):
            label = item.xpath("./span[@class='label']/text()").get(default="")
            value = item.xpath(
                "./span[@class='value hide-text']/text()"
            ).get(default="").strip()
            for fragment, field in self._INFO_LABELS.items():
                if fragment in label:
                    info[field] = value

        # TODO: extraction of the remaining fields is not implemented yet;
        # they are emitted as empty strings until it is.
        salary = ""
        position_status = ""
        description = ""
        contact_name = ""
        contact_info = ""
        benefits = ""
        company_name = ""

        yield {
            "title": title,  # position title
            "nature": info["nature"],  # employment type
            "category": info["category"],  # position category
            "region": info["region"],  # work region
            "experience": info["experience"],  # required work experience
            "education": info["education"],  # required education
            "salary": salary,  # salary
            "position_status": position_status,  # position status
            "description": description,  # position description (details)
            "contact_name": contact_name,  # contact person
            "contact_info": contact_info,  # contact details
            "benefits": benefits,  # benefits
            "openings": info["openings"],  # number of openings
            "website_id": 2,  # website ID
            "company_name": company_name,  # owning company name
        }
|
@ -3,7 +3,7 @@ import json
|
|||||||
|
|
||||||
|
|
||||||
class YutianTopSpider(scrapy.Spider):
|
class YutianTopSpider(scrapy.Spider):
|
||||||
name = 'yutian_top'
|
name = 'yutian_top_resume'
|
||||||
allowed_domains = ['yutian.top']
|
allowed_domains = ['yutian.top']
|
||||||
start_urls = ['https://www.yutian.top/job/company/v1/resume/page']
|
start_urls = ['https://www.yutian.top/job/company/v1/resume/page']
|
||||||
|
|
BIN
job_info/ts_resume_spider/requests.queue/1
Normal file
BIN
job_info/ts_resume_spider/requests.queue/1
Normal file
Binary file not shown.
1
job_info/ts_resume_spider/requests.queue/active.json
Normal file
1
job_info/ts_resume_spider/requests.queue/active.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
116025
job_info/ts_resume_spider/requests.seen
Normal file
116025
job_info/ts_resume_spider/requests.seen
Normal file
File diff suppressed because it is too large
Load Diff
1
job_info/ts_resume_spider/spider.state
Normal file
1
job_info/ts_resume_spider/spider.state
Normal file
@ -0,0 +1 @@
|
|||||||
|
<EFBFBD>}<7D>.
|
Loading…
x
Reference in New Issue
Block a user