import re

import scrapy
from lxml import etree


def first_or_empty(xpobj, path):
    """Return the first stripped result of *path* on *xpobj*, or "" when empty."""
    values = xpobj.xpath(path)
    return values[0].strip() if values else ""


def extract_position_data(xpathobj):
    """Parse a job-detail page into a flat item dict.

    Returns None when either the job title or the company name is missing;
    every other field falls back to an empty string or a default value.
    """
    title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
    if not title:
        return None

    nature = "全职"  # the crawled listing only carries full-time positions
    category = first_or_empty(xpathobj, '//div[@class="job_details_category"]/text()')

    # Info block is "region / experience / education"; a fourth "应届"
    # (fresh-graduate) qualifier, when present, is folded into education.
    region_info = [t.strip() for t in xpathobj.xpath('//div[@class="job_details_info"]/text()') if t.strip()]
    region = experience = education = ""
    if len(region_info) == 3:
        region, experience, education = region_info
    elif len(region_info) == 4 and "应届" in region_info[3]:
        region, experience = region_info[0], region_info[1]
        education = region_info[2] + " " + region_info[3]

    salary = first_or_empty(xpathobj, '//span[@class="job_details_salary_n"]/text()')
    position_status = 1  # default: position is open

    description_list = xpathobj.xpath('//div[@class="job_details_describe"]/text()')
    description = "\r\n".join(d.strip() for d in description_list if d.strip())

    contact_name = first_or_empty(xpathobj, '//span[@class="job_details_touch_username"]/text()')
    contact_info = first_or_empty(xpathobj, '//span[@class="job_details_touch_tel_n"]/text()')

    # NOTE: the trailing space inside the class attribute matches the site's markup.
    benefits_list = xpathobj.xpath('//div[@class="job_details_welfare "]/span/text()')
    benefits = " | ".join(b.strip() for b in benefits_list if b.strip())

    # Number of openings: first integer found in the requirement line, else 1.
    openings = 1
    openings_str = first_or_empty(xpathobj, '//span[@class="job_details_describe_yq"]/text()')
    if openings_str:
        nums = re.findall(r"(\d+)", openings_str)
        if nums:
            openings = int(nums[0])

    company_name = first_or_empty(xpathobj, '//div[@class="Compply_right_name"]/a/text()')
    if not company_name:
        return None

    return {
        "title": title,
        "nature": nature,
        "category": category,
        "region": region,
        "experience": experience,
        "education": education,
        "salary": salary,
        "position_status": position_status,
        "description": description,
        "contact_name": contact_name,
        "contact_info": contact_info,
        "benefits": benefits,
        "openings": openings,
        "website_id": 1,
        "company_name": company_name,
    }


def get_position_href(xpathobj):
    """Return the stripped detail-page hrefs found on a listing page."""
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    return [href.strip() for href in hrefs if href.strip()]


class ZunHuaComSpider(scrapy.Spider):
    """Crawl job listings from zhrczp.com and yield one item per position."""

    name = 'zhrczp_com_position'
    allowed_domains = ['zhrczp.com', 'www.zhrczp.com']
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self):
        # Only the first listing page is crawled for now; widen the range
        # to paginate over more result pages.
        for page in range(1, 2):
            yield scrapy.Request(
                url=f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html",
                headers=self.headers,
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Extract detail-page links from a listing page and follow each."""
        self.logger.info(f"Parsing page: {response.url}")
        xpathobj = etree.HTML(response.text)
        for href in get_position_href(xpathobj):
            # urljoin resolves relative hrefs against the listing page URL;
            # absolute URLs pass through unchanged.
            yield scrapy.Request(
                url=response.urljoin(href),
                headers=self.headers,
                callback=self.parse_position,
                dont_filter=True,
            )

    def parse_position(self, response):
        """Parse one job-detail page and yield the extracted item, if valid."""
        self.logger.info(f"Parsing position: {response.url}")
        xpath_object = etree.HTML(response.text)
        position_data = extract_position_data(xpath_object)
        if position_data:
            self.logger.info(position_data)
            yield position_data