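# Scrape resume listings from www.yutian.top (company-side resume search),
# flatten the fields of interest into column lists, and export the result to
# an Excel file named "<keyword>_<timestamp>.xlsx".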

import time
import pandas as pd
from Requests_Except import MR
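# NOTE: Requests_Except.MR is a project-local helper (not a published package),
# presumably a thin wrapper around requests.Session. The calls below assume it
# provides roughly this interface:
#   MR(base_url, protocol, proxy_options=...)        construct the client
#   .set_default_headers(dict) / .set_default_cookies(dict)
#   .post(path, json=...)                            POST to protocol://base_url + path
#   <response>.to_Dict()                             attribute-style access to the JSON body
# Adjust the calls if your version of the wrapper differs.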
base_url = 'www.yutian.top'
protocol = 'https'
default_headers = {
"accept": "application/json, text/plain, */*",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"cache-control": "no-cache",
"content-type": "application/json;charset=UTF-8",
"origin": "https://www.yutian.top",
"pragma": "no-cache",
"priority": "u=1, i",
"referer": "https://www.yutian.top/enterprise/resume_store/list",
"sec-ch-ua": "\"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"138\", \"Google Chrome\";v=\"138\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
}
default_cookies = {
"PHPSESSID": "8622ac2f6caf545585d9b3c4537bc036",
"auth-token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NTQ4NzUzOTksImp0aSI6IjMxMzY5YmQ3LTIwOTMtNGI4Ni04ZGY3LWUzZTY1NDhjOTg0OCIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.TYpA94cCO7-HCeeksicrtpBDJB2AsbBvsuGBrJiFVWU",
"company_sign": "",
"company_nonce": "",
"cuid": ""
}
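# PHPSESSID and auth-token (a JWT with an expiry claim) are copied from a
# logged-in browser session; refresh them with current values before running.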
Requests = MR(base_url, protocol, proxy_options=False)
Requests.set_default_headers(default_headers)
Requests.set_default_cookies(default_cookies)
# print(Requests.session.proxies)
pd_data = {
    'resume_id': [],
    '姓名': [], # user_name
    '求职区域': [], # area_show
    '生日': [], # birthday
    '学历': [], # education_level_msg
    '学校': [], # education.school
    '期望职务': [], # expect_job
    '最后活跃时间': [], # last_edit_time
    '婚姻': [], # marry_status_show
    '现居地': [], # residence
    '年龄': [], # user_age
    '电话': [], # phone_encrypt
    '性别': [], # sex_show
    '求职类型': [], # work_type_show
    '求职状态': [], # work_status_show
    '工作1经历': [], # experience[0].company
    '工作1时间': [], # experience[0].time_line
    '工作1内容': [], # experience[0].content
    '工作2经历': [],
    '工作2时间': [],
    '工作2内容': [],
    '工作3经历': [],
    '工作3时间': [],
    '工作3内容': [],
    '工作4经历': [],
    '工作4时间': [],
    '工作4内容': [],
}
resume_list = []
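# Fetch one page of resume search results for `key_word`. Only the first page
# is requested; `step` controls how many records come back per page.
# (`resume_list` above is declared but not used in this script.)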
def get_page(key_word, step=100):
    # Filter fields are left blank/default; the *_show entries appear to mirror
    # the site's filter placeholder labels (e.g. '求职状态' = job-seeking status)
    # and are sent unchanged when the corresponding filter is unset.
    json_data = {
        'step': step,
        'page': 1,
        'education_level': [],
        'arrival_time': [],
        'work_time': [],
        'area_id': [],
        'keywords': key_word,
        'work_status': '',
        'work_status_show': '求职状态',
        'category_id': '',
        'work_type': '',
        'work_type_show': '是否兼职',
        'sex': '',
        'sex_show': '性别',
        'is_head': '',
        'is_head_show': '有无照片',
        'job_id': '',
        'age': [],
        'age_show': '年龄',
        'refresh_time': 0,
        'site_id': '',
        'site_id2': '',
        'province': '',
        'city': '',
        'county': '',
        'provinceArr': [],
        'cityArr': [],
        'countyArr': [],
        'only_job_category': 0,
    }
    url = '/job/company/v1/resume/page'
    resp = Requests.post(url, json=json_data)
    return resp.to_Dict()
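# Flatten one page of results into the pd_data columns. Work experience is
# padded (or truncated) to exactly four slots so every column ends up the
# same length before DataFrame construction.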
def organize_information_into_to_pandas(keyword):
    resp_obj = get_page(keyword, 100)
    for i in resp_obj.data:
        # resume_info = get_resume_info(i.resume_id)
        pd_data['resume_id'].append(i.resume_id)
        pd_data['姓名'].append(i.user_name)
        pd_data['求职区域'].append(i.area_show)
        pd_data['生日'].append(i.birthday)
        pd_data['学历'].append(i.education_level_msg)
        pd_data['学校'].append(';'.join([edu.school for edu in i.education]))
        pd_data['期望职务'].append(i.expect_job)
        pd_data['最后活跃时间'].append(i.last_edit_time)
        pd_data['婚姻'].append(i.marry_status_show)
        pd_data['现居地'].append(i.residence)
        pd_data['年龄'].append(i.user_age)
        pd_data['电话'].append(i.phone_encrypt)
        pd_data['性别'].append(i.sex_show)
        pd_data['求职类型'].append(i.work_type_show)
        pd_data['求职状态'].append(i.work_status_show)
        experience = i.experience
        for j in range(4):
            if j < len(experience) and experience[j].company:
                company = experience[j].company
                time_line = experience[j].time_line
                content = experience[j].content
            else:
                company = ''
                time_line = ''
                content = ''
            pd_data[f'工作{j + 1}经历'].append(company)
            pd_data[f'工作{j + 1}时间'].append(time_line)
            pd_data[f'工作{j + 1}内容'].append(content)
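# Run one keyword search and dump the collected columns to an Excel file.
# Writing .xlsx with DataFrame.to_excel requires an engine such as openpyxl.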
def main(keyword):
    organize_information_into_to_pandas(keyword)
    df = pd.DataFrame(pd_data)
    df.to_excel(keyword + "_" + str(int(time.time())) + '.xlsx', index=False)
if __name__ == '__main__':
main("看护")