"""Add phone and e-mail columns to an Excel sheet of zhrczp.com resume ids.

The script reads ``excelfilepath``, looks every ``resume_id`` up on the
site through the project-local ``MR`` HTTP wrapper, and writes a new,
timestamped workbook with ``phone`` and ``email`` columns appended.
"""
import datetime
import re  # used below; previously only reachable via the star import
from pathlib import Path

import pandas as pd
from lxml import etree

from Requests_Except import *

base_url = 'www.zhrczp.com'
protocol = 'https'
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Shared HTTP client; MR comes from the project-local Requests_Except module.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)

excelfilepath = r"C:\Users\Franklin_Kali\Documents\WeChat Files\w19960619\FileStorage\File\2025-06\遵化电话.xlsx"
xlsx_name = str(Path(excelfilepath).stem)


def get_search_page(keyword_id, page):
    """Fetch one resume-search result page and return the ids found on it.

    Args:
        keyword_id: Job-class id; sent as both ``jobin`` and
            ``jobclass_search``.
        page: Result page number.

    Returns:
        The list of first arguments of every
        ``com_lookresume_check('<id>','1')`` call found in the response
        text (presumably resume ids; confirm against the site markup).
    """
    params = {
        'c': 'resume',
        'jobin': str(keyword_id),
        'jobclass_search': str(keyword_id),
        'cityin': '',
        'cityclass_search': '',
        'keyword': '',
        'minsalary': '',
        'maxsalary': '',
        'minage': '',
        'maxage': '',
        'exp': '',
        'edu': '',
        'uptime': '',
        'sex': '',
        'type': '',
        'page': str(page),
    }
    url = '/member/index.php'
    resp = Requests.get(url, params=params)
    return re.findall(r"com_lookresume_check\('(.+?)','1'\)", resp.text)


def read_excel2df(file_path) -> pd.DataFrame:
    """Load the workbook at *file_path* into a DataFrame."""
    return pd.read_excel(file_path)


def get_resume_list(df) -> list:
    """Return the values of the ``resume_id`` column, in row order.

    The column is read directly instead of via ``iterrows()``:
    ``iterrows()`` builds one Series per row and may upcast an integer id
    to float when other columns are floats, which would later be
    stringified as e.g. ``'123.0'``.

    Raises:
        KeyError: If the sheet has rows but no ``resume_id`` column.
    """
    if len(df) == 0:
        # A sheet without rows yields no ids, even if the column is absent.
        return []
    return df['resume_id'].tolist()


def get_cookies():
    """Log in through ``/login/c_loginsave.html`` and return the cookies.

    Returns:
        Whatever ``resp.cookies_dict()`` yields for the login response
        (presumably a name -> value mapping; confirm against ``MR``).
    """
    url = '/login/c_loginsave.html'
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file kept out of version control.
    data = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': '18713831026',
        'password': '18713831026',
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post(url, data=data)
    return resp.cookies_dict()


def get_resumeInfo(resume_id):
    """Fetch the resume-info page for *resume_id*.

    Returns:
        A ``(tree, text)`` tuple: ``resp.xpath()`` (presumably a parsed
        document exposing ``.xpath()``; confirm against ``MR``) and the raw
        response text.
    """
    url = '/member/index.php'
    params = {
        'c': 'hr',
        'act': 'resumeInfo',
        'eid': str(resume_id),
        'state': '',
        'from': 'rck',
    }
    resp = Requests.get(url, params=params)
    return resp.xpath(), resp.text


def get_phone(xpath) -> str:
    """Return the stripped phone number from a parsed page, or ``''``.

    Args:
        xpath: A parsed document object exposing ``.xpath()``.
    """
    phone = xpath.xpath('//div[contains(text(), "手机")]/span/text()')
    return phone[0].strip() if phone else ''


def get_email(xpath) -> str:
    """Return the first e-mail address from a parsed page, or ``''``.

    Only the first matching text node is inspected. If that node holds no
    address the function returns ``''``; it used to raise
    ``AttributeError`` by calling ``.group()`` on ``None``.

    Args:
        xpath: A parsed document object exposing ``.xpath()``.
    """
    texts = xpath.xpath('//div[contains(text(), "邮箱")]/text()')
    if not texts:
        return ''
    match = re.search(r'[\w\.-]+@[\w\.-]+', texts[0])
    return match.group() if match else ''


def post_phone(resume_id):
    """POST the ``for_link`` ajax request for *resume_id*.

    Returns:
        The decoded JSON body of the response.
    """
    url = '/index.php'
    params = {
        'm': 'ajax',
        'c': 'for_link',
    }
    data = {
        'eid': str(resume_id),
    }
    resp = Requests.post(url, params=params, data=data)
    return resp.json()


def _fetch_contact(resume_id):
    """Return ``(phone, email)`` for one resume; either may be ``''``.

    The resume-info page is tried first. If it exposes neither field, the
    ``for_link`` ajax endpoint is queried, logging in and retrying once
    when the site answers that a login is required.
    """
    xobj, _ = get_resumeInfo(resume_id)
    phone, email = get_phone(xobj), get_email(xobj)
    if phone or email:
        return phone, email

    data = post_phone(resume_id)
    if data.get('msg') == '请先登录!':
        # Session missing or expired: log in, then repeat the request so
        # this resume is not silently recorded with empty contact fields.
        Requests.set_default_cookies(get_cookies())
        data = post_phone(resume_id)

    html = data.get('html')
    if html:
        xobj = etree.HTML(html)
        phone, email = get_phone(xobj), get_email(xobj)
    return phone, email


def data_integration() -> None:
    """Write a copy of the input sheet with ``phone``/``email`` columns.

    Reads ``excelfilepath``, resolves contact details for every
    ``resume_id`` and saves the result to a timestamped ``.xlsx`` file in
    the current working directory.
    """
    df = read_excel2df(excelfilepath)
    phones = []
    emails = []
    for resume_id in get_resume_list(df):
        phone, email = _fetch_contact(resume_id)
        phones.append(phone)
        emails.append(email)
    df['phone'] = phones
    df['email'] = emails
    df.to_excel(f'遵化_{datetime.datetime.now():%Y%m%d%H%M%S}_{xlsx_name}_p.xlsx', index=False)


if __name__ == '__main__':
    data_integration()