添加YutianTopSpider爬虫以提取职位信息并解析公司详情;更新.gitignore以排除IDE配置文件

This commit is contained in:
晓丰 2025-05-26 23:42:53 +08:00
parent dc80fb6c72
commit b13afeb51c
7 changed files with 116138 additions and 2 deletions

2
.gitignore vendored
View File

@ -45,7 +45,7 @@ static/
staticfiles/ staticfiles/
# === IDE 配置 === # === IDE 配置 ===
.idea/ # PyCharm .idea/* # PyCharm
*.iml *.iml
*.ipr *.ipr
*.iws *.iws

View File

@ -0,0 +1,109 @@
import scrapy
import json
import re
from lxml.extensions import xpath_code
from openpyxl.styles.builtins import title
def first_or_empty(xpobj, path):
    """Return the stripped first result of an XPath query, or "" if none.

    Args:
        xpobj: Either an object exposing an ``xpath(path)`` method, or the
            ``xpath`` callable itself (e.g. a bound ``response.xpath``).
        path: XPath expression to evaluate.

    Returns:
        The first match with surrounding whitespace removed, or an empty
        string when the query yields nothing (or an empty/None value).
    """
    # Accept both "thing with .xpath" and a bare xpath callable, because the
    # caller in this file hands over the bound method rather than the object.
    query = getattr(xpobj, "xpath", None)
    if query is None:
        query = xpobj

    matches = query(path)
    if not matches:
        return ""

    first = matches[0]
    # Plain-string results can be stripped directly; selector-style results
    # wrap the text and must be unwrapped through .get() first.
    if not isinstance(first, str):
        first = first.get()
    return first.strip() if first else ""
class YutianTopSpider(scrapy.Spider):
    """Spider that collects job postings from zp.yutian.top.

    It fetches the search page, follows every ``/position/<id>.html`` link
    found in ``onclick`` attributes, and yields one dict per position page.
    Fields that are not extracted yet are emitted as empty strings.
    """

    name = 'yutian_top_compary'
    allowed_domains = ['yutian.top']
    base_url = "https://zp.yutian.top"
    start_urls = ['https://zp.yutian.top/search']
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.yutian.top',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
    }

    # Relative URL of a position detail page as embedded in onclick handlers.
    _POSITION_PATH_RE = re.compile(r"/position/\d+\.html")

    # Label text on the detail page -> key of the yielded item.
    _INFO_FIELDS = {
        "工作性质": "nature",
        "职位类别": "category",
        "工作区域": "region",
        "招聘人数": "openings",
        "工作年限": "experience",
        "学历要求": "education",
    }

    def start_requests(self):
        """Yield the initial requests for the search page."""
        # NOTE(review): the loop counter is never used, so the same URL is
        # requested 38 times. This looks like unfinished pagination; the
        # paging parameter is not visible here, so behavior is kept as-is.
        for _ in range(1, 39):
            yield scrapy.Request(
                url=self.start_urls[0],
                method='GET',
                headers=self.headers,
                dont_filter=True,
                callback=self.parse,
            )

    def parse(self, response):
        """Follow every position link found in ``onclick`` attributes."""
        status_code = response.status
        if status_code != 200:
            self.logger.error("Request failed with status code: %s", status_code)
            return

        for node in response.xpath('//*[@onclick]'):
            onclick = node.attrib.get("onclick", "")
            match = self._POSITION_PATH_RE.search(onclick)
            if not match:
                continue
            yield scrapy.Request(
                url=self.base_url + match.group(0),
                headers=self.headers,
                callback=self.parse_compary,
                dont_filter=True,
            )

    def parse_compary(self, response):
        """Parse one position detail page and yield a single item dict."""
        self.logger.info("Parsing compary: %s", response.url)
        if response.status != 200:
            self.logger.error("Not 200 page: %s", response.url)
            return

        title = response.xpath(
            "//div[@class='top-title']/span/text()"
        ).get(default="").strip()

        # Every key gets a default so the item can always be built, even when
        # a label is absent or the field is not extracted yet.
        item = {
            "title": title,  # job title
            "nature": "",  # employment type
            "category": "",  # job category
            "region": "",  # work location
            "experience": "",  # required years of experience
            "education": "",  # required education
            "salary": "",  # salary
            "position_status": "",  # posting status
            "description": "",  # job description (details)
            "contact_name": "",  # contact person
            "contact_info": "",  # contact details
            "benefits": "",  # benefits
            "openings": "",  # number of openings
            "website_id": 2,  # site ID
            "company_name": "",  # employer name
        }

        # Read label and value from the same node so a missing value cannot
        # shift every following field or run past the end of a list.
        for info in response.xpath("//span[@class='job-info-item']"):
            label = info.xpath("./span[@class='label']/text()").get(default="")
            value = info.xpath(
                "./span[@class='value hide-text']/text()"
            ).get(default="").strip()
            for marker, field in self._INFO_FIELDS.items():
                if marker in label:
                    item[field] = value

        # TODO: unfinished - salary, position_status, description,
        # contact_name, contact_info, benefits and company_name are not
        # extracted yet and are emitted as empty strings.
        yield item

View File

@ -3,7 +3,7 @@ import json
class YutianTopSpider(scrapy.Spider): class YutianTopSpider(scrapy.Spider):
name = 'yutian_top' name = 'yutian_top_resume'
allowed_domains = ['yutian.top'] allowed_domains = ['yutian.top']
start_urls = ['https://www.yutian.top/job/company/v1/resume/page'] start_urls = ['https://www.yutian.top/job/company/v1/resume/page']

Binary file not shown.

View File

@ -0,0 +1 @@
[]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
<EFBFBD>}<7D>.