import os
import re
import sys

# Make the project root importable so `web.Requests_Except` resolves.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from web.Requests_Except import MR

base_url = 'www.zhrczp.com'
protocol = 'https'

default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)


def get_position_page(page: int):
    """Fetch one page of the job list and return its parsed xpath object."""
    url = f"https://www.zhrczp.com/job/list/0-0-0-0_0_0_0_0_0_0_0-0-0-0-{page}.html"
    res = Requests.get(url, timeout=10)
    return res.xpath()


def get_position(xpathobj) -> list:
    """Follow every job link on a list page and return a list of position dicts."""
    positions = []
    href_list = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    for href in href_list:
        href = href.strip()
        if not href:
            continue

        position_res = Requests.get(href, timeout=10)
        detail = position_res.xpath()

        title_xlist = detail.xpath('//h1[@class="job_details_name"]/text()')
        title = title_xlist[0].strip() if title_xlist else ""

        nature = "全职"  # the listings scraped here are all full-time
        category = ""

        # The info block holds region / experience / education, plus an extra
        # item when the posting is marked for fresh graduates ("应届").
        region = ""
        experience = ""
        education = ""
        region_xlist = detail.xpath('//div[@class="job_details_info"]/text()')
        if region_xlist:
            region_info = [item.strip() for item in region_xlist]
            if len(region_info) == 3:
                region, experience, education = region_info
            elif len(region_info) == 4 and "应届" in region_info[3]:
                region = region_info[0]
                experience = region_info[1]
                education = region_info[2] + " " + region_info[3]
            else:
                # Unexpected layout; print it so the selector can be adjusted.
                print(region_info)

        salary_xlist = detail.xpath('//span[@class="job_details_salary_n"]/text()')
        salary = salary_xlist[0].strip() if salary_xlist else ""

        position_status = 1

        description_xlist = detail.xpath('//div[@class="job_details_describe"]/text()')
        description = "\r\n".join(desc.strip() for desc in description_xlist if desc.strip())

        contact_name_xlist = detail.xpath('//span[@class="job_details_touch_username"]/text()')
        contact_name = contact_name_xlist[0].strip() if contact_name_xlist else ""

        contact_info_xlist = detail.xpath('//span[@class="job_details_touch_tel_n"]/text()')
        contact_info = contact_info_xlist[0].strip() if contact_info_xlist else ""

        benefits_xlist = detail.xpath('//div[@class="job_details_welfare "]/span/text()')
        benefits = " | ".join(benefit.strip() for benefit in benefits_xlist if benefit.strip())

        company_name_xlist = detail.xpath('//div[@class="Compply_right_name"]/a/text()')
        company_name = company_name_xlist[0].strip() if company_name_xlist else ""

        openings_xlist = detail.xpath('//span[@class="job_details_describe_yq"]/text()')
        openings = 1  # default to a single opening
        if openings_xlist:
            try:
                openings_str = openings_xlist[0].strip()
                openings = int(re.findall(r"(\d+)", openings_str)[0])
            except (IndexError, ValueError):
                openings = 1

        data = {
            "title": title,
            "nature": nature,
            "category": category,
            "region": region,
            "experience": experience,
            "education": education,
            "salary": salary,
            "position_status": position_status,
            "description": description,
            "contact_name": contact_name,
            "contact_info": contact_info,
            "benefits": benefits,
            "openings": openings,
            "website_id": 1,
            "company_name": company_name,
        }
        positions.append(data)

    return positions


if __name__ == '__main__':
    get_position(get_position_page(1))