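"""Scraper for job postings on www.tsrcw.com.

Pipeline: get_index2html() discovers the site's globla.ashx AJAX actions;
get_idlist() loads the cached job-category tree (idlist.json);
get_detaillist() collects the listing rows for every category into
plist.json; get_position_info() fetches each position's detail page and
writes the parsed records to job_info.json.
"""
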
import json
import re
import time

import requests
from lxml import etree


class Tsrcw:
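    """Holds the shared session state (headers and cookies) and the
    scraping steps for tsrcw.com."""
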
    def __init__(self):
        # Browser-like headers captured from a real Chrome 134 session.
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=0, i",
            "referer": "https://www.tsrcw.com/persondh/latest.aspx",
            "sec-ch-ua": "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }
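        # NOTE: these cookie values (session id, captcha cookie, login key)
        # were captured from one logged-in browser session; the yzmCookiestr
        # even embeds an ExpireDate, so they likely need to be refreshed
        # before the scraper can run again.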
        self.cookies = {
            "ASP.NET_SessionId": "1hroesd0og5cqszyv42jkf30",
            "yzmCookiestr": "ImgCode=1132&ExpireDate=2025/3/19 13:22:40&HaveUsed=1",
            "PersonUser": "name=wxfkali222&key=0A1AD61BFD75D12B25A946E01AA2E894"
        }

    def get_index2html(self):
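        """Download the homepage, save it to index.html, and return the
        globla.ashx action names referenced by the page's inline AJAX calls."""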
        url = "https://www.tsrcw.com/default.aspx"
        response = requests.get(url, headers=self.headers, cookies=self.cookies)
        html = response.text
        with open("index.html", "w", encoding="utf-8") as f:
            f.write(html)
        url_list = re.findall(r'url: \'/html/ashx/globla\.ashx\?action=(.*?)\'', html)
        return url_list

    def get_idlist(self):
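        """Return the job-category tree cached in idlist.json.

        The commented-out block below shows how the cache was originally
        built from the globla.ashx endpoint (action=zwlistEight)."""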
        # url = "https://www.tsrcw.com/html/ashx/globla.ashx"
        # params = {
        #     "action": "zwlistEight"
        # }
        # response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params)
        # jsonf = json.dumps(response.json().get('msg'), ensure_ascii=False)
        # with open("idlist.json", "w", encoding="utf-8") as f:
        #     f.write(jsonf)
        with open("idlist.json", "r", encoding="utf-8") as f:
            jsonf = f.read()
        return jsonf

    def get_detaillist(self, jsonf):
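        """Attach up to two pages of listing rows to every child category
        in the tree, then dump the enriched tree to plist.json."""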
        idlist = json.loads(jsonf)
        for item in idlist:
            for c in item.get('child'):
                cid = c.get('cid')
                # Skip child entries that just repeat the parent category's name.
                if c.get("cname") == item.get("name"):
                    continue
                url = "https://www.tsrcw.com/persondh/latest.aspx"
                params = {
                    "job": "{}".format(cid)
                }
                response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params)
                xpathobj = etree.HTML(response.text)
                position_name = xpathobj.xpath("//td[@class='text-left']/p/a/text()")
                position_url = xpathobj.xpath("//td[@class='text-left']/p/a/@href")
                company_name = xpathobj.xpath("//td[@class='w400']/div/span/a/text()")
                company_url = xpathobj.xpath("//td[@class='w400']/div/span/a/@href")

                if len(position_url) > 0:
                    position_list = [{
                        "position_name": name,
                        "position_url": p_url,
                        "company_name": c_name,
                        "company_url": c_url
                    } for name, p_url, c_name, c_url in zip(position_name, position_url, company_name, company_url)]
                    if len(position_list) >= 20:
                        # A first page with 20+ rows may have a second page;
                        # per the original note, the site never has a third page.
                        params2 = params.copy()
                        params2["page"] = "2"
                        response2 = requests.get(url, headers=self.headers, cookies=self.cookies, params=params2)
                        xpathobj2 = etree.HTML(response2.text)
                        position_name2 = xpathobj2.xpath("//td[@class='text-left']/p/a/text()")
                        position_url2 = xpathobj2.xpath("//td[@class='text-left']/p/a/@href")
                        company_name2 = xpathobj2.xpath("//td[@class='w400']/div/span/a/text()")
                        company_url2 = xpathobj2.xpath("//td[@class='w400']/div/span/a/@href")
                        for name, p_url, c_name, c_url in zip(position_name2, position_url2, company_name2, company_url2):
                            position_list.append({
                                "position_name": name,
                                "position_url": p_url,
                                "company_name": c_name,
                                "company_url": c_url
                            })
                    c["position_list"] = position_list
                else:
                    c["position_list"] = []

        p_list = json.dumps(idlist, ensure_ascii=False)
        with open("plist.json", "w", encoding="utf-8") as f:
            f.write(p_list)

    def get_position_info(self):
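        """Visit each collected position's detail page, parse the base-info
        table, benefits, requirements, and contact block into job_info, and
        write the enriched tree to job_info.json.

        Detail URLs that yield neither requirements nor contacts are
        collected in q (currently only gathered, never persisted)."""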
        q = []  # detail URLs that yielded no parseable requirements/contacts
        y = 0   # running count of positions processed
        with open("plist.json", "r", encoding="utf-8") as f:
            jsonf = f.read()
        plist = json.loads(jsonf)
        for item in plist:
            for c in item.get('child'):
                if c.get("cname") == item.get("name"):
                    continue
                if len(c.get("position_list")) == 0:
                    continue
                position_list = c.get("position_list")
                for position in position_list:
                    href = position.get("position_url")
                    url = "https://www.tsrcw.com" + href
                    print(url)
                    response = requests.get(url, headers=self.headers, cookies=self.cookies)
                    xpathobj = etree.HTML(response.text)
                    job_info = {}
                    # The base-info table holds th/td pairs; some rows have
                    # fewer td cells than th cells, so pad the values.
                    position_table = xpathobj.xpath("//div[@class='baseinfo']/table/tr")
                    for row in position_table:
                        position_key_list = [key.strip() for key in row.xpath("th/text()") if key.strip()]
                        position_value_list = [''.join(value.xpath(".//text()")).strip() for value in row.xpath("td")]
                        while len(position_value_list) < len(position_key_list):
                            position_value_list.append('')  # pad with empty strings at the end

                        for key, value in zip(position_key_list, position_value_list):
                            # Strip leftover separators, full-width padding
                            # spaces, and line breaks from the scraped cells.
                            if ":" in value:
                                value = value.replace(":", "")
                            if "\u3000\u3000" in key:
                                key = key.replace("\u3000\u3000", "")
                            if "\r\n " in value:
                                value = value.replace("\r\n ", "")
                            job_info[key] = value
                    fl = xpathobj.xpath("//div[@class='s12_div']/text()")
                    job_info["福利"] = fl
                    yq = xpathobj.xpath("//div[@class='requirement']/div[@class='content']/text()")
                    yq = [i.replace('\r\n ', '').replace('\r', '').strip() for i in yq if i.strip()]
                    job_info["要求"] = yq
                    lxk = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/span/text()")
                    lxk = [i.replace(' ', '').strip() for i in lxk if i.strip()]
                    lxv = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/text()")
                    lxv = [i.replace(':', '').strip() for i in lxv if i.strip()]
                    lximg = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/img/@src")
                    if len(yq) == 0 and len(lxk) == 0:
                        # Nothing parseable on this page; remember the URL and move on.
                        q.append(url)
                        continue

                    # Contact values rendered as images carry the real value
                    # in the image URL's value= query parameter.
                    if len(lxv) >= 3 and lxv[1] == '' and lxv[2] == '' and len(lximg) >= 2:
                        lxv[1] = lximg[0].split('value=')[1]
                        lxv[2] = lximg[1].split('value=')[1]
                    lx = dict(zip(lxk, lxv))
                    job_info["联系"] = lx
                    # time.sleep(11)
                    position["job_info"] = job_info
                    print("=====", y, "=====")
                    y += 1

        with open("job_info.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(plist, ensure_ascii=False))
        # Backup copy of the last category processed.
        with open("position_info_back.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(c, ensure_ascii=False))


if __name__ == '__main__':
    tsrcw = Tsrcw()
    tsrcw.get_position_info()
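
# A minimal end-to-end run, assuming idlist.json has already been cached
# (uncomment the fetch block in get_idlist() to rebuild it):
#
#   tsrcw = Tsrcw()
#   tsrcw.get_detaillist(tsrcw.get_idlist())  # builds plist.json
#   tsrcw.get_position_info()                 # builds job_info.json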