# Crawler/web/zhrczp_com/Download.py
#
# 157 lines
# 4.4 KiB
# Python

import datetime
import pandas as pd
from lxml import etree
from pathlib import Path
from Requests_Except import *
# --- Input workbook -------------------------------------------------------
# Workbook that is read for resume ids; its stem is reused in the output name.
excelfilepath = r"C:\Users\Franklin_Kali\Documents\WeChat Files\w19960619\FileStorage\File\2025-06\遵化电话.xlsx"
xlsx_name = Path(excelfilepath).stem  # Path.stem is already a str

# --- Target site ----------------------------------------------------------
base_url = 'www.zhrczp.com'
protocol = 'https'

# Browser-like headers installed as the defaults of the shared client below.
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Shared request client used by every function in this module.
# NOTE(review): MR is not imported by name; presumably it comes from the
# `from Requests_Except import *` star import -- confirm.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)
def get_search_page(keyword_id, page):
    """Fetch one page of the resume search and return the ids found in it.

    ``keyword_id`` is sent as both ``jobin`` and ``jobclass_search``; all other
    search filters are sent empty. The returned list holds the first argument
    of every ``com_lookresume_check('<id>','1')`` occurrence in the response
    text, in document order.
    """
    category = str(keyword_id)
    blank_filters = (
        'cityin', 'cityclass_search', 'keyword', 'minsalary', 'maxsalary',
        'minage', 'maxage', 'exp', 'edu', 'uptime', 'sex', 'type',
    )

    # Built step by step so the key order stays: c, jobin, jobclass_search,
    # the empty filters, then page.
    query = {'c': 'resume', 'jobin': category, 'jobclass_search': category}
    query.update(dict.fromkeys(blank_filters, ''))
    query['page'] = str(page)

    response = Requests.get('/member/index.php', params=query)
    return re.findall(r"com_lookresume_check\('(.+?)','1'\)", response.text)
def read_excel2df(file_path):
    """Load the workbook at ``file_path`` with ``pd.read_excel`` and return the DataFrame."""
    return pd.read_excel(file_path)
def get_resume_list(df):
    """Return the values of the ``resume_id`` column as a plain list, in row order.

    The column is read directly instead of row by row: ``DataFrame.iterrows``
    builds one Series per row and therefore upcasts mixed numeric rows, which
    turns an integer id such as ``101`` into ``101.0`` (and later into the
    request parameter ``'101.0'``) whenever the frame also has float columns.

    Args:
        df: DataFrame holding a ``resume_id`` column.

    Returns:
        list: one entry per row; ``[]`` for a frame without rows.

    Raises:
        KeyError: if the frame has rows but no ``resume_id`` column.
    """
    if len(df.index) == 0:
        # A frame without rows yields an empty list even when it has no
        # ``resume_id`` column at all.
        return []
    return df['resume_id'].tolist()
def get_cookies():
    """Submit the login form and return the cookies of the login response.

    Posts to ``/login/c_loginsave.html`` through the shared ``Requests`` client
    and returns the result of ``cookies_dict()`` on that response.
    """
    # NOTE(review): the account name and password are hard-coded below;
    # consider loading them from configuration or the environment.
    login_form = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': '18713831026',
        'password': '18713831026',
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    login_response = Requests.post('/login/c_loginsave.html', data=login_form)
    return login_response.cookies_dict()
def get_resumeInfo(resume_id):
    """Request the ``hr`` / ``resumeInfo`` page for ``resume_id``.

    Returns:
        tuple: ``(resp.xpath(), resp.text)`` -- the response's xpath object
        (as produced by the client's ``xpath()`` method) and its raw text.
    """
    query = {
        'c': 'hr',
        'act': 'resumeInfo',
        'eid': str(resume_id),
        'state': '',
        'from': 'rck',
    }
    page = Requests.get('/member/index.php', params=query)
    parsed = page.xpath()
    return parsed, page.text
def get_phone(xpath):
    """Return the stripped phone text found by the "手机" XPath query, or ''.

    ``xpath`` is any object with an ``xpath(query)`` method returning a list of
    strings; only the first hit is used.
    """
    matches = xpath.xpath('//div[contains(text(), "手机")]/span/text()')
    if not matches:
        return ''
    return matches[0].strip()
def get_email(xpath):
    """Return the first e-mail address found in the "邮箱" div text, or ''.

    ``xpath`` is any object with an ``xpath(query)`` method returning a list of
    strings. The text nodes are scanned in order and the first one containing
    an address-like token wins.

    A label without an address (for example a bare "邮箱：" when the contact
    details are hidden) yields '' instead of raising ``AttributeError`` from
    calling ``.group()`` on a failed ``re.search``; this lets the caller fall
    through to its secondary lookup.
    """
    text_nodes = xpath.xpath('//div[contains(text(), "邮箱")]/text()')
    for node in text_nodes:
        match = re.search(r'[\w\.-]+@[\w\.-]+', node)
        if match:
            return match.group()
    return ''
def post_phone(resume_id):
    """POST ``eid`` to the ``ajax`` / ``for_link`` endpoint and return the decoded JSON."""
    response = Requests.post(
        '/index.php',
        params={'m': 'ajax', 'c': 'for_link'},
        data={'eid': str(resume_id)},
    )
    return response.json()
def _fetch_contact(resume_id):
    """Return ``(phone, email)`` for one resume; each is '' when not found."""
    xobj, _html = get_resumeInfo(resume_id)
    phone = get_phone(xobj)
    email = get_email(xobj)
    if phone or email:
        return phone, email

    # Nothing on the resume page: ask the ajax endpoint for the contact block.
    data = post_phone(resume_id)
    if data.get('msg') == '请先登录!':  # "Please log in first!"
        # The session is not authenticated. Log in again and repeat the
        # request, otherwise this resume would be written out with blank
        # contact columns even though the login succeeded.
        Requests.set_default_cookies(get_cookies())
        data = post_phone(resume_id)

    fragment = data.get('html')
    if fragment:
        xobj = etree.HTML(fragment)
        phone = get_phone(xobj)
        email = get_email(xobj)
    return phone, email


def data_integration():
    """Look up phone and e-mail for every resume id in the workbook and save the result.

    Reads ``excelfilepath``, resolves the contact details of each ``resume_id``
    (resume page first, ajax endpoint as fallback, with one re-login and retry
    when the site answers "请先登录!"), adds them as ``phone`` and ``email``
    columns, and writes the frame to a new timestamped ``.xlsx`` file given as a
    relative path.
    """
    df = read_excel2df(excelfilepath)

    phones = []
    emails = []
    for resume_id in get_resume_list(df):
        phone, email = _fetch_contact(resume_id)
        phones.append(phone)
        emails.append(email)

    df['phone'] = phones
    df['email'] = emails
    df.to_excel(f'遵化_{datetime.datetime.now():%Y%m%d%H%M%S}_{xlsx_name}_p.xlsx', index=False)
# Run the full read -> look up -> write pipeline only when executed as a script.
if __name__ == '__main__':
    data_integration()