import sys
import os

# Make the project root importable so the shared request wrapper resolves.
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from web.Requests_Except import MR

base_url = 'zp.yutian.top'
protocol = 'https'

default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)


def _first(xlist) -> str:
    """Return the first stripped entry of an xpath result list, or ''."""
    return xlist[0].strip() if xlist else ""


def data_cleaning(xpathobj) -> dict | None:
    """Extract company fields from a parsed detail page; None when the page lacks data."""
    name = _first(xpathobj.xpath('//h1[@class="company-header-top-detail-name hide-txt"]/text()'))

    # The labels ("公司类别 :" = category, "公司规模 :" = size) and their values
    # come back as parallel lists; either entry may be missing, so pair them
    # up by label text instead of by position.
    keys = [k.strip() for k in xpathobj.xpath('//div[@class="company-header-bottom-item-label"]/text()')]
    values = [v.strip() for v in xpathobj.xpath('//div[@class="company-header-bottom-item-text hide-txt"]/text()')]

    # Default to empty strings when the labels cannot be parsed.
    category = ""
    size = ""
    for key, value in zip(keys, values):
        if key == "公司类别 :":
            category = value
        elif key == "公司规模 :":
            size = value

    introduction = _first(xpathobj.xpath('//div[@class="job-left-content-des"]/text()'))
    address = _first(xpathobj.xpath('//div[@class="job-left-content-address"]/text()'))

    if not name or not introduction:
        return None
    return {
        "name": name,
        "category": category,
        "size": size,
        "introduction": introduction,
        "address": address,
    }
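# web.Requests_Except.MR is a project-internal wrapper that is not part of
# this file. For readers without that module, the stand-in below is a minimal
# sketch of the interface the script relies on: MR(base_url, protocol),
# set_default_headers(...), and get(path, timeout=...) returning an object
# with .text and an .xpath() parser. The name MRStandIn and everything inside
# it are assumptions about that interface, not the real implementation.

import requests  # sketch-only dependency
from lxml import etree  # sketch-only dependency


class _Response:
    """Wraps an HTTP response so callers can use res.text and res.xpath()."""

    def __init__(self, text: str):
        self.text = text

    def xpath(self):
        # Parse with lxml's forgiving HTML parser; the returned element
        # supports the .xpath(...) queries used in data_cleaning above.
        return etree.HTML(self.text)


class MRStandIn:
    def __init__(self, base_url: str, protocol: str = 'https'):
        self.base = f"{protocol}://{base_url}"
        self.session = requests.Session()

    def set_default_headers(self, headers: dict):
        self.session.headers.update(headers)

    def get(self, path: str, timeout: int = 10) -> _Response:
        resp = self.session.get(self.base + path, timeout=timeout)
        resp.raise_for_status()
        return _Response(resp.text)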
def get_position_page(page: int):
    url = f"/position/{page}.html"
    res = Requests.get(url, timeout=10)
    return data_cleaning(res.xpath())


def get_company_page(page: int):
    url = f"/company/{page}.html"
    res = Requests.get(url, timeout=10)
    return data_cleaning(res.xpath())


if __name__ == '__main__':
    # for page in range(1, 1000):
    print(get_company_page(3273))
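# The commented-out loop in __main__ hints at a full crawl over the company
# pages. The helper below is one sketch of how that loop might be fleshed
# out; the page range, the one-second delay, and the companies.json output
# path are assumptions, not values taken from the project.

import json
import time


def crawl_companies(start: int = 1, end: int = 1000, delay: float = 1.0,
                    out_path: str = "companies.json"):
    """Fetch company pages [start, end) and save the non-empty records."""
    records = []
    for page in range(start, end):
        try:
            data = get_company_page(page)
        except Exception as exc:  # network errors, parse failures, 404s, ...
            print(f"page {page} failed: {exc}")
            continue
        if data is not None:
            records.append(data)
        time.sleep(delay)  # stay polite to the server between requests
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)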