Crawler/web/zhrczp_com/get_company.py

import sys, os
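# Add the project root to sys.path so `from web.Requests_Except import MR`
# resolves when this script is run directly.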
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from web.Requests_Except import MR
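# MR is the project's shared request wrapper (web/Requests_Except); the code
# below uses its set_default_headers(), get(), and the response's xpath() helper.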
base_url = 'www.zhrczp.com'
protocol = 'https'
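# Browser-like headers captured from a desktop Chrome session so requests look
# like normal traffic; the Referer points at the site's resume search page.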
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)


def data_cleaning(xpathobj) -> dict:
    # name_xlist = xpathobj.xpath('//h1/a/text()')
    # name = name_xlist[0] if name_xlist else ""
    # compary_info_xlist = xpathobj.xpath('//div[@class="com_details_info"]/text()')
    # if compary_info_xlist:
    #     compary_info_list = [info.strip() for info in compary_info_xlist if info.strip()]
    #     category = compary_info_list[1]
    #     company_type = compary_info_list[2]
    #     size = compary_info_list[3]
    #     if len(compary_info_list) > 4:
    #         founded_date = compary_info_list[4]
    #     else:
    #         founded_date = ""
    # else:
    #     category = ""
    #     company_type = ""
    #     size = ""
    #     founded_date = ""
    # benefits_xlist = xpathobj.xpath('//div[@class="com_welfare "]/span/text()')
    # if benefits_xlist:
    #     benefits_str = " | ".join(benefits_xlist)
    # else:
    #     benefits_str = ""
    # introduction_xlist = xpathobj.xpath('//div[@class="company_img_auto"]/p/text()')
    # if not introduction_xlist:
    #     introduction_xlist = xpathobj.xpath('//div[@class="company_img_auto"]/text()')
    # if not introduction_xlist:
    #     introduction_xlist = xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()')
    # if introduction_xlist:
    #     introduction = "\r\n".join([info.strip() for info in introduction_xlist if info.strip()])
    # else:
    #     introduction = ""
    # address_xlist = xpathobj.xpath('//div[@class="com_details_tel_me"]/div/text()')
    # if address_xlist:
    #     address = address_xlist[0].strip()
    # else:
    #     address = ""
    # if name != "" and introduction != "":
    #     company_data = {
    #         "name": name,
    #         "category": category,
    #         "size": size,
    #         "company_type": company_type,
    #         "founded_date": founded_date,
    #         "introduction": introduction,
    #         "address": address,
    #         "benefits": benefits_str,
    #         "website": 1,
    #     }
    # else:
    #     company_data = None
    # return company_data
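    # The commented-out block above appears to parse an older page layout
    # (com_details_* / com_welfare classes); the code below targets the current
    # company-header-* layout.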
    # Current layout: company name from the page header.
    name_xlist = xpathobj.xpath('//h1[@class="company-header-top-detail-name hide-txt"]/text()')
    name = name_xlist[0].strip() if name_xlist else ""
    # Category and size share the same element class; indexes 0 and 1 are an
    # assumption (the original read index 0 for both, which duplicated the value).
    item_xlist = xpathobj.xpath('//div[@class="company-header-bottom-item-text hide-txt"]/text()')
    category = item_xlist[0].strip() if item_xlist else ""
    size = item_xlist[1].strip() if len(item_xlist) > 1 else ""
    # data = {
    #     "name": name,                  # company name, e.g. "字节跳动"
    #     "category": category,          # industry category, e.g. "互联网/软件/信息技术"
    #     "size": size,                  # company size, e.g. "1000人以上"
    #     "company_type": company_type,  # ownership type, e.g. "民营", "外企", "国企"
    #     "founded_date": founded_date,  # founding date, preferably "YYYY-MM-DD" or "2010年"
    #     "introduction": introduction,  # company introduction, string
    #     "address": address,            # full office address
    #     "benefits": benefits_str,      # employee benefits, e.g. "五险一金, 带薪年假, 免费下午茶"
    #     "website": 1,                  # website flag: 1 = has one, 0 = none (a flag, not the URL)
    # }
    print(name)
    print(category)
    print(size)
    # The signature promises a dict; return the fields extracted so far instead of None.
    return {"name": name, "category": category, "size": size}


def get_company_page(page: int):
    url = f"/company/{page}.html"
    res = Requests.get(url, timeout=10)
    # Dump the raw HTML locally to make debugging the XPath selectors easier.
    with open("zhrczp_com_company.html", "w", encoding="utf-8") as f:
        f.write(res.text)
    return data_cleaning(res.xpath())
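

# A minimal sketch (not part of the original flow), assuming one wants to crawl
# a range of company pages: the page range and the one-second delay are
# assumptions, not values confirmed by the site.
def crawl_company_pages(start: int, end: int, delay: float = 1.0) -> list:
    import time

    results = []
    for page in range(start, end):
        try:
            results.append(get_company_page(page))
        except Exception as exc:  # a single failing page should not stop the crawl
            print(f"page {page} failed: {exc}")
        time.sleep(delay)  # be polite to the server between requests
    return results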


if __name__ == '__main__':
    # for page in range(1, 1000):
    print(get_company_page(3273))