添加YutianTopSpider爬虫以提取职位信息并解析公司详情；更新.gitignore以排除IDE配置文件

2025-05-26 23:42:53 +08:00 · 2025-05-26 23:42:53 +08:00 · b13afeb51c
commit b13afeb51c
parent dc80fb6c72
7 changed files with 116138 additions and 2 deletions
--- a/.gitignore
+++ b/.gitignore
@ -45,7 +45,7 @@ static/
 staticfiles/
 # === IDE 配置 ===
-.idea/          # PyCharm
+.idea/*          # PyCharm
 *.iml
 *.ipr
 *.iws
--- a/TS_resume_spider/spiders/yutian_top_compary.py
+++ b/TS_resume_spider/spiders/yutian_top_compary.py
@ -0,0 +1,109 @@
 import scrapy
 import json
 import re
 from lxml.extensions import xpath_code
 from openpyxl.styles.builtins import title
 def first_or_empty(xpobj, path):
     """Return the first XPath match as a stripped string, or "" if none.

     Args:
         xpobj: Either an object exposing an ``xpath(path)`` method (e.g. a
             Scrapy response/selector or an lxml element) or the bound
             ``xpath`` callable itself (e.g. ``response.xpath``).
         path: XPath expression to evaluate.

     Returns:
         The first match with surrounding whitespace removed, or an empty
         string when nothing matched or the match carries no text.
     """
     # A bound ``xpath`` method has no ``.xpath`` attribute of its own, so
     # fall back to calling the object directly in that case.
     query = getattr(xpobj, "xpath", None)
     if query is None:
         query = xpobj

     matches = query(path)
     if not matches:
         return ""

     first = matches[0]
     # lxml yields plain strings for text() queries; Scrapy/parsel yields
     # selector nodes whose text is retrieved with a zero-argument ``get()``.
     if not isinstance(first, str):
         first = first.get()
     return first.strip() if first else ""
 class YutianTopSpider(scrapy.Spider):
     """Crawl the zp.yutian.top search page and yield one item per position.

     Flow: ``start_requests`` fetches the search page, ``parse`` pulls the
     ``/position/<id>.html`` links out of ``onclick`` attributes, and
     ``parse_compary`` turns each position page into an item dict.
     """

     name = 'yutian_top_compary'
     allowed_domains = ['yutian.top']
     base_url = "https://zp.yutian.top"
     start_urls = ['https://zp.yutian.top/search']
     # Browser-like headers sent with every request.
     headers = {
         'accept': 'application/json, text/plain, */*',
         'accept-language': 'zh-CN,zh;q=0.9',
         'cache-control': 'no-cache',
         'content-type': 'application/json;charset=UTF-8',
         'origin': 'https://www.yutian.top',
         'pragma': 'no-cache',
         'priority': 'u=1, i',
         'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
         'sec-ch-ua-mobile': '?0',
         'sec-ch-ua-platform': '"Windows"',
         'sec-fetch-dest': 'empty',
         'sec-fetch-mode': 'cors',
         'sec-fetch-site': 'same-origin',
         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
     }

     # Matches the relative position-detail URL embedded in onclick handlers.
     _POSITION_PATH_RE = re.compile(r"/position/\d+\.html")

     # (label substring on the page, item field it fills).
     _INFO_LABELS = (
         ('工作性质', 'nature'),
         ('职位类别', 'category'),
         ('工作区域', 'region'),
         ('招聘人数', 'openings'),
         ('工作年限', 'experience'),
         ('学历要求', 'education'),
     )

     def start_requests(self):
         """Yield the initial search-page requests."""
         # NOTE(review): all 38 requests target the same URL; the loop was
         # presumably meant to walk result pages, but the pagination scheme
         # is not visible here -- confirm and add the page parameter.
         for _ in range(1, 39):
             yield scrapy.Request(
                 url=self.start_urls[0],
                 method='GET',
                 headers=self.headers,
                 dont_filter=True,
                 callback=self.parse,
             )

     def parse(self, response):
         """Extract position-detail links from the search page.

         Every element with an ``onclick`` attribute is scanned for a
         ``/position/<digits>.html`` path; each hit becomes a request handled
         by ``parse_compary``.
         """
         status_code = response.status
         if status_code != 200:
             self.logger.error(f"Request failed with status code: {status_code}")
             return
         for node in response.xpath('//*[@onclick]'):
             onclick = node.attrib.get("onclick", "")
             match = self._POSITION_PATH_RE.search(onclick)
             if match:
                 yield scrapy.Request(
                     url=self.base_url + match.group(0),
                     headers=self.headers,
                     callback=self.parse_compary,
                     dont_filter=True,
                 )

     def parse_compary(self, response):
         """Parse one position page and yield its item dict.

         Fields whose extraction is not implemented yet are emitted as empty
         strings so the item always has the full key set.
         """
         self.logger.info(f"Parsing compary: {response.url}")
         if response.status != 200:
             self.logger.error(f"Not 200 page: {response.url}")
             return

         title = response.xpath(
             "//div[@class='top-title']/span/text()"
         ).get(default="").strip()

         # Every labelled field starts empty so a missing label on the page
         # can never leave a name unbound when the item is built.
         info = {field: "" for _, field in self._INFO_LABELS}
         # Read label and value from the same container so a missing value
         # cannot shift later values onto the wrong label.
         for item in response.xpath("//span[@class='job-info-item']"):
             label = item.xpath("./span[@class='label']/text()").get(default="")
             value = item.xpath(
                 "./span[@class='value hide-text']/text()"
             ).get(default="").strip()
             for marker, field in self._INFO_LABELS:
                 if marker in label:
                     info[field] = value

         # TODO: extraction for the fields below is not implemented yet;
         # they are emitted empty until their selectors are added.
         salary = ""
         position_status = ""
         description = ""
         contact_name = ""
         contact_info = ""
         benefits = ""
         company_name = ""

         yield {
             "title": title,  # position title
             "nature": info["nature"],  # employment nature
             "category": info["category"],  # position category
             "region": info["region"],  # work region
             "experience": info["experience"],  # required work experience
             "education": info["education"],  # required education
             "salary": salary,  # salary
             "position_status": position_status,  # position status
             "description": description,  # position description (details)
             "contact_name": contact_name,  # contact person name
             "contact_info": contact_info,  # contact details
             "benefits": benefits,  # benefits
             "openings": info["openings"],  # number of openings
             "website_id": 2,  # website ID
             "company_name": company_name,  # owning company name
         }
--- a/TS_resume_spider/spiders/yutian_top_resume.py
+++ b/TS_resume_spider/spiders/yutian_top_resume.py
@ -3,7 +3,7 @@ import json
 class YutianTopSpider(scrapy.Spider):
-    name = 'yutian_top'
+    name = 'yutian_top_resume'
    allowed_domains = ['yutian.top']
    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']
--- a/job_info/ts_resume_spider/requests.queue/1
+++ b/job_info/ts_resume_spider/requests.queue/1
--- a/job_info/ts_resume_spider/requests.queue/active.json
+++ b/job_info/ts_resume_spider/requests.queue/active.json
@ -0,0 +1 @@
 []
--- a/job_info/ts_resume_spider/requests.seen
+++ b/job_info/ts_resume_spider/requests.seen
--- a/job_info/ts_resume_spider/spider.state
+++ b/job_info/ts_resume_spider/spider.state
@ -0,0 +1 @@
 <EFBFBD>}<7D>.