125 lines
5.0 KiB
Python
125 lines
5.0 KiB
Python
import sys, os
|
||
# Resolve the directory two levels above this file and put it at the front of
# sys.path so that the project-local ``web`` package imported below resolves
# even when this script is launched directly.
project_root = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
if project_root not in sys.path:
    sys.path.insert(0, project_root)
|
||
|
||
from web.Requests_Except import MR
|
||
# Target site and scheme handed to the project's MR request wrapper.
base_url = 'zp.yutian.top'
protocol = 'https'

# Browser-like headers (Chrome 134 on Windows) installed as defaults on the
# shared client below. Long values are split across lines with implicit string
# concatenation; the resulting header values are unchanged.
default_headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/134.0.0.0 Safari/537.36'
    ),
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Module-wide client shared by the page fetchers defined further down.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)
|
||
|
||
|
||
def _first_text(xpathobj, expression: str) -> str:
    """Return the stripped first match of *expression*, or "" if nothing matches."""
    matches = xpathobj.xpath(expression)
    return matches[0].strip() if matches else ""


def _category_and_size(labels, values) -> tuple:
    """Pick the company category and size out of parallel label/value lists.

    Both fields default to "" so that an unrecognised label can never leave
    them undefined.
    """
    category = ""
    size = ""
    # Labels and values come from two independent XPath queries; pairing them
    # by position is only meaningful when both queries matched the same
    # number of nodes. Otherwise the fields cannot be parsed and stay empty.
    if len(labels) == len(values):
        for label, value in zip(labels, values):
            label = label.strip()
            if label == "公司类别 :":      # page label for "company category"
                category = value.strip()
            elif label == "公司规模 :":    # page label for "company size"
                size = value.strip()
    return category, size


def data_cleaning(xpathobj) -> dict:
    """Extract the company fields from a parsed company page.

    Args:
        xpathobj: Any object exposing ``xpath(expression)`` that returns a
            list of text nodes (strings) for the given expression.

    Returns:
        A dict with the keys ``name``, ``category``, ``size``,
        ``introduction`` and ``address``. A field whose node is missing from
        the page is returned as an empty string instead of raising.
    """
    name = _first_text(
        xpathobj,
        '//h1[@class="company-header-top-detail-name hide-txt"]/text()',
    )
    category, size = _category_and_size(
        xpathobj.xpath('//div[@class="company-header-bottom-item-label"]/text()'),
        xpathobj.xpath('//div[@class="company-header-bottom-item-text hide-txt"]/text()'),
    )
    introduction = _first_text(
        xpathobj, '//div[@class="job-left-content-des"]/text()'
    )
    address = _first_text(
        xpathobj, '//div[@class="job-left-content-address"]/text()'
    )
    return {
        "name": name,
        "category": category,
        "size": size,
        "introduction": introduction,
        "address": address,
    }
|
||
|
||
def get_position_page(page:int):
    """Fetch ``/position/<page>.html`` and run it through ``data_cleaning``.

    The raw response text is printed before parsing. The request goes through
    the module-level ``Requests`` client with ``timeout=10``.

    Args:
        page: Numeric id used to build the position page path.

    Returns:
        Whatever ``data_cleaning`` returns for the parsed page.
    """
    response = Requests.get(f"/position/{page}.html", timeout=10)
    print(response.text)
    parsed_page = response.xpath()
    return data_cleaning(parsed_page)
|
||
|
||
def get_company_page(page:int):
    """Fetch ``/company/<page>.html`` and run it through ``data_cleaning``.

    The request goes through the module-level ``Requests`` client with
    ``timeout=10``.

    Args:
        page: Numeric id used to build the company page path.

    Returns:
        Whatever ``data_cleaning`` returns for the parsed page.
    """
    response = Requests.get(f"/company/{page}.html", timeout=10)
    parsed_page = response.xpath()
    return data_cleaning(parsed_page)
|
||
|
||
if __name__ == '__main__':
    # Manual run against a single company page. A sweep over
    # ``range(1, 1000)`` was sketched here previously and is left disabled.
    company = get_company_page(3273)
    print(company)