Add ZunHuaComSpider to extract job posting information and implement the data parsing logic
parent 542f2ce0bd
commit 03b56ae1b2
TS_resume_spider/spiders/zhrczp_com_position.py (new file, +119)
@@ -0,0 +1,119 @@
import re

import scrapy
from lxml import etree


def first_or_empty(xpobj, path):
    """Return the first XPath match, stripped of whitespace, or "" if there is none."""
    lst = xpobj.xpath(path)
    return lst[0].strip() if lst else ""


def extract_position_data(xpathobj):
    """Build a position item dict from a parsed job-detail page, or return None."""
    title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
    if not title:
        return None

    nature = "全职"
    category = first_or_empty(xpathobj, '//div[@class="job_details_category"]/text()')

    # The info block normally carries region / experience / education; a fourth
    # entry containing "应届" (fresh graduate) is appended to the education field.
    region_info = [t.strip() for t in xpathobj.xpath('//div[@class="job_details_info"]/text()') if t.strip()]
    region = ""
    experience = ""
    education = ""
    if len(region_info) == 3:
        region, experience, education = region_info
    elif len(region_info) == 4 and "应届" in region_info[3]:
        region = region_info[0]
        experience = region_info[1]
        education = region_info[2] + " " + region_info[3]

    salary = first_or_empty(xpathobj, '//span[@class="job_details_salary_n"]/text()')
    position_status = 1

    description_list = xpathobj.xpath('//div[@class="job_details_describe"]/text()')
    description = "\r\n".join([d.strip() for d in description_list if d.strip()])

    contact_name = first_or_empty(xpathobj, '//span[@class="job_details_touch_username"]/text()')
    contact_info = first_or_empty(xpathobj, '//span[@class="job_details_touch_tel_n"]/text()')

    # The class attribute is matched verbatim, including the trailing space.
    benefits_list = xpathobj.xpath('//div[@class="job_details_welfare "]/span/text()')
    benefits = " | ".join([b.strip() for b in benefits_list if b.strip()])

    # Number of openings defaults to 1 unless a figure can be parsed from the page.
    openings = 1
    openings_str = first_or_empty(xpathobj, '//span[@class="job_details_describe_yq"]/text()')
    if openings_str:
        nums = re.findall(r"(\d+)", openings_str)
        if nums:
            openings = int(nums[0])

    company_name = first_or_empty(xpathobj, '//div[@class="Compply_right_name"]/a/text()')
    if not company_name:
        return None

    return {
        "title": title,
        "nature": nature,
        "category": category,
        "region": region,
        "experience": experience,
        "education": education,
        "salary": salary,
        "position_status": position_status,
        "description": description,
        "contact_name": contact_name,
        "contact_info": contact_info,
        "benefits": benefits,
        "openings": openings,
        "website_id": 1,
        "company_name": company_name,
    }


def get_position_href(xpathobj):
    """Collect the job-detail links from a listing page."""
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    return [href.strip() for href in hrefs if href.strip()]


class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com_position'
    allowed_domains = ['zhrczp.com', 'www.zhrczp.com']
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self):
        # range(1, 2) crawls only the first listing page; widen the range to paginate.
        for page in range(1, 2):
            yield scrapy.Request(
                url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
                headers=self.headers,
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        self.logger.info(f"Parsing page: {response.url}")
        # The listing page is parsed with lxml rather than response.xpath().
        xpathobj = etree.HTML(response.text)
        position_hrefs = get_position_href(xpathobj)
        for href in position_hrefs:
            yield scrapy.Request(
                url=href,
                headers=self.headers,
                callback=self.parse_position,
                dont_filter=True,
            )

    def parse_position(self, response):
        self.logger.info(f"Parsing position: {response.url}")
        xpath_object = etree.HTML(response.text)
        position_data = extract_position_data(xpath_object)
        if position_data:
            self.logger.info(position_data)
            yield position_data
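
When adjusting the XPath expressions, extract_position_data can also be exercised offline against a saved copy of a detail page instead of crawling the site. A minimal sketch, assuming a hypothetical local fixture file job_detail_sample.html:

    from lxml import etree
    from TS_resume_spider.spiders.zhrczp_com_position import extract_position_data

    # job_detail_sample.html is a hypothetical local copy of a job-detail page.
    with open("job_detail_sample.html", encoding="utf-8") as f:
        xpathobj = etree.HTML(f.read())

    item = extract_position_data(xpathobj)
    print(item)  # None when the page lacks a title or company name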