Crawler/web/tsrcw/main.py

import json
import re
import time
import requests
from lxml import etree
class Tsrcw:
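    """Scraper for job listings on www.tsrcw.com: category tree, per-category
    position lists, and per-position detail pages."""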
    def __init__(self):
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=0, i",
            "referer": "https://www.tsrcw.com/persondh/latest.aspx",
            "sec-ch-ua": "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }
        self.cookies = {
            "ASP.NET_SessionId": "1hroesd0og5cqszyv42jkf30",
            "yzmCookiestr": "ImgCode=1132&ExpireDate=2025/3/19 13:22:40&HaveUsed=1",
            "PersonUser": "name=wxfkali222&key=0A1AD61BFD75D12B25A946E01AA2E894"
        }
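    # Fetch the portal homepage, cache it to index.html, and collect the AJAX
    # action names the page requests from /html/ashx/globla.ashx.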
    def get_index2html(self):
        url = "https://www.tsrcw.com/default.aspx"
        response = requests.get(url, headers=self.headers, cookies=self.cookies)
        with open("index.html", "w", encoding="utf-8") as f:
            f.write(response.text)
        html = response.text
        url_list = re.findall(r'url: \'/html/ashx/globla\.ashx\?action=(.*?)\'', html)
        return url_list
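    # Return the cached category/ID tree from idlist.json; the commented-out
    # request shows how that file was originally fetched from globla.ashx.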
    def get_idlist(self):
        # url = "https://www.tsrcw.com/html/ashx/globla.ashx"
        # params = {
        #     "action": "zwlistEight"
        # }
        # response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params)
        # jsonf = json.dumps(response.json().get('msg'), ensure_ascii=False)
        # with open("idlist.json", "w", encoding="utf-8") as f:
        #     f.write(jsonf)
        with open("idlist.json", "r", encoding="utf-8") as f:
            jsonf = f.read()
        return jsonf
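    # For each child category, scrape the first page of listings from
    # /persondh/latest.aspx, fetch page 2 when the first page is full
    # (20 rows), and write the enriched tree to plist.json.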
    def get_detaillist(self, jsonf):
        idlist = json.loads(jsonf)
        for item in idlist:
            for c in item.get('child'):
                cid = c.get('cid')
                if c.get("cname") == item.get("name"):
                    continue
                url = "https://www.tsrcw.com/persondh/latest.aspx"
                params = {
                    "job": "{}".format(cid)
                }
                response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params)
                html = response.text
                xpathobj = etree.HTML(html)
                position_name = xpathobj.xpath("//td[@class='text-left']/p/a/text()")
                position_url = xpathobj.xpath("//td[@class='text-left']/p/a/@href")
                company_name = xpathobj.xpath("//td[@class='w400']/div/span/a/text()")
                company_url = xpathobj.xpath("//td[@class='w400']/div/span/a/@href")
                if len(position_url) > 0:
                    position_list = [{
                        "position_name": position_name[index],
                        "position_url": position_url[index],
                        "company_name": company_name[index],
                        "company_url": company_url[index]
                    } for index, i in enumerate(position_name)]
                    if len(position_list) >= 20:
                        params2 = params.copy()
params2["page"] = "2" # 整个网站没有第三页的数据
                        response2 = requests.get(url, headers=self.headers, cookies=self.cookies, params=params2)
                        html2 = response2.text
                        xpathobj2 = etree.HTML(html2)
                        position_name2 = xpathobj2.xpath("//td[@class='text-left']/p/a/text()")
                        position_url2 = xpathobj2.xpath("//td[@class='text-left']/p/a/@href")
                        company_name2 = xpathobj2.xpath("//td[@class='w400']/div/span/a/text()")
                        company_url2 = xpathobj2.xpath("//td[@class='w400']/div/span/a/@href")
                        for index, i in enumerate(position_name2):
                            position_list.append({
                                "position_name": position_name2[index],
                                "position_url": position_url2[index],
                                "company_name": company_name2[index],
                                "company_url": company_url2[index]
                            })
                    c["position_list"] = position_list
                else:
                    c["position_list"] = []
        p_list = json.dumps(idlist, ensure_ascii=False)
        with open("plist.json", "w", encoding="utf-8") as f:
            f.write(p_list)
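    # Visit each position's detail page and build a job_info dict from the
    # base-info table plus the benefits (福利), requirements (要求) and
    # contact (联系) sections, then dump everything to job_info.json.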
    def get_position_info(self):
        q = []  # detail URLs whose pages could not be parsed
        y = 0   # counter of processed detail pages
        with open("plist.json", "r", encoding="utf-8") as f:
            jsonf = f.read()
        plist = json.loads(jsonf)
        for item in plist:
            for c in item.get('child'):
                if c.get("cname") == item.get("name"):
                    continue
                if len(c.get("position_list")) == 0:
                    continue
                position_list = c.get("position_list")
                for position in position_list:
                    href = position.get("position_url")
                    url = "https://www.tsrcw.com" + href
                    print(url)
                    response = requests.get(url, headers=self.headers, cookies=self.cookies)
                    html = response.text
                    xpathobj = etree.HTML(html)
                    job_info = {}
                    position_table = xpathobj.xpath("//div[@class='baseinfo']/table/tr")
                    for row in position_table:
                        position_key_list = [key.strip() for key in row.xpath("th/text()") if key.strip()]
                        position_value_list = [''.join(value.xpath(".//text()")).strip() for value in row.xpath("td")]
                        while len(position_value_list) < len(position_key_list):
                            position_value_list.append('')  # pad with empty strings so keys and values line up
                        # clean the cell text: the raw values carry non-printing characters,
                        # full-width spaces in the keys, and stray line breaks
                        for key, value in zip(position_key_list, position_value_list):
                            if "" in value:
                                value = value.replace("", "")
                            if "\u3000\u3000" in key:
                                key = key.replace("\u3000\u3000", "")
                            if "\r\n " in value:
                                value = value.replace("\r\n ", "")
                            job_info[key] = value
                    fl = xpathobj.xpath("//div[@class='s12_div']/text()")
                    job_info["福利"] = fl  # benefits
                    yq = xpathobj.xpath("//div[@class='requirement']/div[@class='content']/text()")
                    yq = [i.replace('\r\n ', '').replace('\r', '').strip() for i in yq if i.strip()]
                    job_info["要求"] = yq  # requirements
                    lxk = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/span/text()")
                    lxk = [i.replace(' ', '').strip() for i in lxk if i.strip()]
                    lxv = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/text()")
                    lxv = [i.replace('', '').strip() for i in lxv if i.strip()]
                    lximg = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/img/@src")
                    if len(yq) == 0 and len(lxk) == 0:
                        # nothing usable on the page; remember the URL and move on
                        q.append(url)
                        continue
                    if lxv[1] == '' and lxv[2] == '':
                        # contact values rendered as images: recover them from the value= query parameter
                        lxv[1] = lximg[0].split('value=')[1]
                        lxv[2] = lximg[1].split('value=')[1]
                    lx = dict(zip(lxk, lxv))
                    job_info["联系"] = lx  # contact info
                    # time.sleep(1)
position["job_info"] = job_info
print("=====",y,"=====")
y += 1
with open("job_info.json", "w", encoding="utf-8") as f:
f.write(json.dumps(plist, ensure_ascii=False))
with open("position_info_back.json", "w", encoding="utf-8") as f:
f.write(json.dumps(c, ensure_ascii=False))
if __name__ == '__main__':
    tsrcw = Tsrcw()
    tsrcw.get_position_info()