157 lines
4.4 KiB
Python
157 lines
4.4 KiB
Python
import datetime
|
|
|
|
import pandas as pd
|
|
from lxml import etree
|
|
from pathlib import Path
|
|
from Requests_Except import *
|
|
|
|
# Host and scheme handed to the MR client below; request paths used in this
# file are written relative to this host.
base_url = 'www.zhrczp.com'
protocol = 'https'

# Headers installed as defaults on the shared client. They imitate a desktop
# Chrome 134 navigation request (see the User-Agent and sec-ch-ua values).
default_headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Shared HTTP client used by every function in this file.
# NOTE(review): MR is not defined here; presumably it comes from the
# `Requests_Except` star import -- confirm.
Requests = MR(base_url, protocol)
Requests.set_default_headers(default_headers)
|
|
|
|
# Input workbook holding the resume ids to look up.
# NOTE(review): absolute path on one specific Windows machine -- adjust
# before running elsewhere.
excelfilepath = r"C:\Users\Franklin_Kali\Documents\WeChat Files\w19960619\FileStorage\File\2025-06\遵化电话.xlsx"

# File name without its extension; reused when naming the output workbook.
# (`Path.stem` is already a str, so no extra conversion is needed.)
xlsx_name = Path(excelfilepath).stem
|
|
|
|
def get_search_page(keyword_id, page):
    """Fetch one resume-search result page and pull out the matched ids.

    Issues a GET to ``/member/index.php`` with ``c=resume``, both job-class
    parameters set to ``keyword_id``, every other filter sent as an empty
    string, and ``page`` as the page number.

    Returns:
        A list with the first argument of every
        ``com_lookresume_check('<value>','1')`` occurrence in the response
        text (empty when nothing matches).
    """
    job_class = str(keyword_id)
    query = {'c': 'resume', 'jobin': job_class, 'jobclass_search': job_class}
    # The remaining search filters are submitted explicitly but left blank.
    query.update(dict.fromkeys(
        (
            'cityin', 'cityclass_search', 'keyword',
            'minsalary', 'maxsalary', 'minage', 'maxage',
            'exp', 'edu', 'uptime', 'sex', 'type',
        ),
        '',
    ))
    query['page'] = str(page)

    response = Requests.get('/member/index.php', params=query)
    pattern = r"com_lookresume_check\('(.+?)','1'\)"
    return re.findall(pattern, response.text)
|
|
|
|
|
|
def read_excel2df(file_path):
    """Load the workbook at ``file_path`` into a DataFrame.

    Uses ``pandas.read_excel`` with its default options and returns the
    resulting DataFrame unchanged.
    """
    return pd.read_excel(file_path)
|
|
|
|
|
|
def get_resume_list(df):
    """Return the values of the ``resume_id`` column as a plain list.

    The column is read directly instead of row by row: ``DataFrame.iterrows``
    builds one Series per row and therefore upcasts mixed numeric columns to a
    common dtype, which would turn an integer id such as ``101`` into
    ``101.0`` (and later into the string ``'101.0'``).

    Args:
        df: DataFrame containing a ``resume_id`` column.

    Returns:
        A list of the ids in row order, as native Python values.

    Raises:
        KeyError: If ``df`` has no ``resume_id`` column.
    """
    return df['resume_id'].tolist()
|
|
|
|
|
|
def get_cookies():
    """Log in to the site and return the cookies of the login response.

    Posts the login form to ``/login/c_loginsave.html`` through the shared
    ``Requests`` client and returns ``resp.cookies_dict()``.

    The account can be supplied through the ``ZHRCZP_USERNAME`` and
    ``ZHRCZP_PASSWORD`` environment variables. When they are unset the
    built-in account is used, so existing callers behave as before.
    """
    import os

    # NOTE(review): credentials are still embedded as fallbacks; move them
    # out of source control once the environment variables are set everywhere.
    username = os.environ.get('ZHRCZP_USERNAME', '18713831026')
    password = os.environ.get('ZHRCZP_PASSWORD', '18713831026')

    url = '/login/c_loginsave.html'
    data = {
        'act_login': '0',
        'num': '2',
        'referurl': 'https://www.zhrczp.com/',
        'username': username,
        'password': password,
        'loginname': '0',
        'authcode': '',
        'verify_token': '',
        'verify_str': '',
    }
    resp = Requests.post(url, data=data)
    return resp.cookies_dict()
|
|
|
|
|
|
def get_resumeInfo(resume_id):
    """Request the resume detail page for ``resume_id``.

    Sends a GET to ``/member/index.php`` with ``c=hr``, ``act=resumeInfo``
    and ``eid`` set to the id.

    Returns:
        A ``(tree, text)`` tuple: the result of ``resp.xpath()`` followed by
        the raw response text.
    """
    query = {
        'c': 'hr',
        'act': 'resumeInfo',
        'eid': str(resume_id),
        'state': '',
        'from': 'rck',
    }
    response = Requests.get('/member/index.php', params=query)
    tree = response.xpath()
    return tree, response.text
|
|
|
|
|
|
def get_phone(xpath):
    """Return the phone number found in a parsed resume page.

    Looks up the text of ``<span>`` children of any ``<div>`` whose text
    contains "手机" and returns the first hit with surrounding whitespace
    removed, or ``''`` when there is no hit.

    Args:
        xpath: Parsed document exposing an ``.xpath(query)`` method.
    """
    matches = xpath.xpath('//div[contains(text(), "手机")]/span/text()')
    if not matches:
        return ''
    return matches[0].strip()
|
|
|
|
|
|
def get_email(xpath):
    """Return the e-mail address found in a parsed resume page.

    Collects the text nodes of every ``<div>`` whose text contains "邮箱" and
    returns the first e-mail-like substring among them.

    Args:
        xpath: Parsed document exposing an ``.xpath(query)`` method.

    Returns:
        The matched address, or ``''`` when no node exists or none of the
        nodes contains an address.
    """
    text_nodes = xpath.xpath('//div[contains(text(), "邮箱")]/text()')
    for node in text_nodes:
        # re.search yields None when a node holds no address (e.g. a bare
        # label), so the match must be checked before calling .group().
        match = re.search(r'[\w\.-]+@[\w\.-]+', node)
        if match:
            return match.group()
    return ''
|
|
|
|
|
|
def post_phone(resume_id):
    """Call the ``for_link`` AJAX endpoint for ``resume_id``.

    Posts ``eid=<resume_id>`` to ``/index.php?m=ajax&c=for_link`` through the
    shared ``Requests`` client and returns the decoded JSON body.
    """
    response = Requests.post(
        '/index.php',
        params={'m': 'ajax', 'c': 'for_link'},
        data={'eid': str(resume_id)},
    )
    return response.json()
|
|
|
|
def data_integration():
    """Enrich the input workbook with phone and e-mail columns and save it.

    For every id in the ``resume_id`` column of ``excelfilepath``:

    1. Fetch the resume detail page and try to read phone and e-mail from it.
    2. If both are empty, call the ``for_link`` endpoint and parse the HTML
       fragment it returns.
    3. If that endpoint answers "请先登录!" (not logged in), log in, install
       the fresh cookies on the shared client and repeat the call once, so
       the current resume is not recorded with blank contact details.

    The result is written to
    ``遵化_<timestamp>_<input stem>_p.xlsx`` in the current directory.
    """
    df = read_excel2df(excelfilepath)
    phones = []
    emails = []

    for resume_id in get_resume_list(df):
        xobj, _ = get_resumeInfo(resume_id)
        phone = get_phone(xobj)
        email = get_email(xobj)

        if not phone and not email:
            data = post_phone(resume_id)
            if data.get('msg') == '请先登录!':
                Requests.set_default_cookies(get_cookies())
                # Retry with the new session; without this the resume that
                # triggered the re-login would be skipped.
                data = post_phone(resume_id)

            fragment = data.get('html')
            if fragment:
                xobj = etree.HTML(fragment)
                phone = get_phone(xobj)
                email = get_email(xobj)

        phones.append(phone)
        emails.append(email)

    df['phone'] = phones
    df['email'] = emails
    df.to_excel(f'遵化_{datetime.datetime.now():%Y%m%d%H%M%S}_{xlsx_name}_p.xlsx', index=False)
|
|
|
|
# Run the full extraction only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    data_integration()
|