Crawler/web/zhrczp_com/get_company.py

import sys, os
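# Add the project root to sys.path so `from web.Requests_Except import MR`
# resolves when this script is run directly.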
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)
from web.Requests_Except import MR
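# MR is the project's shared request wrapper (web/Requests_Except); the code
# below uses its set_default_headers(), get(), and the response's xpath() helper.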
base_url = 'www.zhrczp.com'
protocol = 'https'
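# Browser-like headers captured from a desktop Chrome session so requests look
# like normal traffic; the Referer points at the site's resume search page.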
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)


def data_cleaning(xpathobj) -> dict:
    # name_xlist = xpathobj.xpath('//h1/a/text()')
    # name = name_xlist[0] if name_xlist else ""
    # compary_info_xlist = xpathobj.xpath('//div[@class="com_details_info"]/text()')
    # if compary_info_xlist:
    #     compary_info_list = [info.strip() for info in compary_info_xlist if info.strip()]
    #     category = compary_info_list[1]
    #     company_type = compary_info_list[2]
    #     size = compary_info_list[3]
    #     if len(compary_info_list) > 4:
    #         founded_date = compary_info_list[4]
    #     else:
    #         founded_date = ""
    # else:
    #     category = ""
    #     company_type = ""
    #     size = ""
    #     founded_date = ""
    # benefits_xlist = xpathobj.xpath('//div[@class="com_welfare "]/span/text()')
    # if benefits_xlist:
    #     benefits_str = " | ".join(benefits_xlist)
    # else:
    #     benefits_str = ""
    # introduction_xlist = xpathobj.xpath('//div[@class="company_img_auto"]/p/text()')
    # if not introduction_xlist:
    #     introduction_xlist = xpathobj.xpath('//div[@class="company_img_auto"]/text()')
    # if not introduction_xlist:
    #     introduction_xlist = xpathobj.xpath('//div[@class="company_img_auto"]/p/span/text()')
    # if introduction_xlist:
    #     introduction = "\r\n".join([info.strip() for info in introduction_xlist if info.strip()])
    # else:
    #     introduction = ""
    # address_xlist = xpathobj.xpath('//div[@class="com_details_tel_me"]/div/text()')
    # if address_xlist:
    #     address = address_xlist[0].strip()
    # else:
    #     address = ""
    # if name != "" and introduction != "":
    #     company_data = {
    #         "name": name,
    #         "category": category,
    #         "size": size,
    #         "company_type": company_type,
    #         "founded_date": founded_date,
    #         "introduction": introduction,
    #         "address": address,
    #         "benefits": benefits_str,
    #         "website": 1,
    #     }
    # else:
    #     company_data = None
    # return company_data
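    # The commented-out block above appears to parse an older page layout
    # (com_details_* / com_welfare classes); the code below targets the current
    # company-header-* layout.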
    # Current layout: company name from the page header.
    name_xlist = xpathobj.xpath('//h1[@class="company-header-top-detail-name hide-txt"]/text()')
    name = name_xlist[0].strip() if name_xlist else ""
    # Category and size share the same element class; indexes 0 and 1 are an
    # assumption (the original read index 0 for both, which duplicated the value).
    item_xlist = xpathobj.xpath('//div[@class="company-header-bottom-item-text hide-txt"]/text()')
    category = item_xlist[0].strip() if item_xlist else ""
    size = item_xlist[1].strip() if len(item_xlist) > 1 else ""
    # data = {
    #     "name": name,                  # company name, e.g. "字节跳动"
    #     "category": category,          # industry category, e.g. "互联网/软件/信息技术"
    #     "size": size,                  # company size, e.g. "1000人以上"
    #     "company_type": company_type,  # ownership type, e.g. "民营", "外企", "国企"
    #     "founded_date": founded_date,  # founding date, preferably "YYYY-MM-DD" or "2010年"
    #     "introduction": introduction,  # company introduction, string
    #     "address": address,            # full office address
    #     "benefits": benefits_str,      # employee benefits, e.g. "五险一金, 带薪年假, 免费下午茶"
    #     "website": 1,                  # website flag: 1 = has one, 0 = none (a flag, not the URL)
    # }
    print(name)
    print(category)
    print(size)
    # The signature promises a dict; return the fields extracted so far instead of None.
    return {"name": name, "category": category, "size": size}


def get_company_page(page: int):
    url = f"/company/{page}.html"
    res = Requests.get(url, timeout=10)
    # Dump the raw HTML locally to make debugging the XPath selectors easier.
    with open("zhrczp_com_company.html", "w", encoding="utf-8") as f:
        f.write(res.text)
    return data_cleaning(res.xpath())
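

# A minimal sketch (not part of the original flow), assuming one wants to crawl
# a range of company pages: the page range and the one-second delay are
# assumptions, not values confirmed by the site.
def crawl_company_pages(start: int, end: int, delay: float = 1.0) -> list:
    import time

    results = []
    for page in range(start, end):
        try:
            results.append(get_company_page(page))
        except Exception as exc:  # a single failing page should not stop the crawl
            print(f"page {page} failed: {exc}")
        time.sleep(delay)  # be polite to the server between requests
    return results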


if __name__ == '__main__':
    # for page in range(1, 1000):
    print(get_company_page(3273))