import json
import re
import time

import requests
from lxml import etree


class Tsrcw:
    def __init__(self):
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "priority": "u=0, i",
            "referer": "https://www.tsrcw.com/persondh/latest.aspx",
            "sec-ch-ua": "\"Chromium\";v=\"134\", \"Not:A-Brand\";v=\"24\", \"Google Chrome\";v=\"134\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }
        self.cookies = {
            "ASP.NET_SessionId": "1hroesd0og5cqszyv42jkf30",
            "yzmCookiestr": "ImgCode=1132&ExpireDate=2025/3/19 13:22:40&HaveUsed=1",
            "PersonUser": "name=wxfkali222&key=0A1AD61BFD75D12B25A946E01AA2E894"
        }

    def get_index2html(self):
        """Fetch the home page, cache it as index.html and return the ashx action names found in it."""
        url = "https://www.tsrcw.com/default.aspx"
        response = requests.get(url, headers=self.headers, cookies=self.cookies)
        with open("index.html", "w", encoding="utf-8") as f:
            f.write(response.text)
        html = response.text
        url_list = re.findall(r'url: \'/html/ashx/globla\.ashx\?action=(.*?)\'', html)
        return url_list

    def get_idlist(self):
        """Return the cached category/id list; the request that originally built idlist.json is kept below for reference."""
        # url = "https://www.tsrcw.com/html/ashx/globla.ashx"
        # params = {
        #     "action": "zwlistEight"
        # }
        # response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params)
        # jsonf = json.dumps(response.json().get('msg'), ensure_ascii=False)
        # with open("idlist.json", "w", encoding="utf-8") as f:
        #     f.write(jsonf)
        with open("idlist.json", "r", encoding="utf-8") as f:
            jsonf = f.read()
        return jsonf

    def get_detaillist(self, jsonf):
        """Scrape the position listings for every category id and save the result to plist.json."""
        idlist = json.loads(jsonf)
        for item in idlist:
            for c in item.get('child'):
                cid = c.get('cid')
                if c.get("cname") == item.get("name"):
                    continue
                url = "https://www.tsrcw.com/persondh/latest.aspx"
                params = {
                    "job": "{}".format(cid)
                }
                response = requests.get(url, headers=self.headers, cookies=self.cookies, params=params)
                html = response.text
                xpathobj = etree.HTML(html)
                position_name = xpathobj.xpath("//td[@class='text-left']/p/a/text()")
                position_url = xpathobj.xpath("//td[@class='text-left']/p/a/@href")
                company_name = xpathobj.xpath("//td[@class='w400']/div/span/a/text()")
                company_url = xpathobj.xpath("//td[@class='w400']/div/span/a/@href")
                if len(position_url) > 0:
                    position_list = [{
                        "position_name": position_name[index],
                        "position_url": position_url[index],
                        "company_name": company_name[index],
                        "company_url": company_url[index]
                    } for index, i in enumerate(position_name)]
                    # A full first page (20 rows) means there may be a second page.
                    if len(position_list) >= 20:
                        params2 = params.copy()
                        params2["page"] = "2"  # the site never has a third page of data
                        response2 = requests.get(url, headers=self.headers, cookies=self.cookies, params=params2)
                        html2 = response2.text
                        xpathobj2 = etree.HTML(html2)
                        position_name2 = xpathobj2.xpath("//td[@class='text-left']/p/a/text()")
                        position_url2 = xpathobj2.xpath("//td[@class='text-left']/p/a/@href")
                        company_name2 = xpathobj2.xpath("//td[@class='w400']/div/span/a/text()")
                        company_url2 = xpathobj2.xpath("//td[@class='w400']/div/span/a/@href")
                        for index, i in enumerate(position_name2):
                            position_list.append({
                                "position_name": position_name2[index],
                                "position_url": position_url2[index],
                                "company_name": company_name2[index],
                                "company_url": company_url2[index]
                            })
                    c["position_list"] = position_list
                else:
                    c["position_list"] = []
        p_list = json.dumps(idlist, ensure_ascii=False)
        with open("plist.json", "w", encoding="utf-8") as f:
            f.write(p_list)

    def get_poition_info(self):
        """Visit every position page listed in plist.json and save the enriched data to job_info.json."""
        q = []  # urls whose detail page had no requirements and no contact block
        y = 0
        with open("plist.json", "r", encoding="utf-8") as f:
            jsonf = f.read()
        plist = json.loads(jsonf)
        for item in plist:
            for c in item.get('child'):
                if c.get("cname") == item.get("name"):
                    continue
                if len(c.get("position_list")) == 0:
                    continue
                position_list = c.get("position_list")
                for position in position_list:
                    href = position.get("position_url")
                    url = "https://www.tsrcw.com" + href
                    print(url)
                    response = requests.get(url, headers=self.headers, cookies=self.cookies)
                    html = response.text
                    xpathobj = etree.HTML(html)
                    job_info = {}
                    # Basic info table: th cells hold the keys, td cells the values.
                    position_table = xpathobj.xpath("//div[@class='baseinfo']/table/tr")
                    for row in position_table:
                        position_key_list = [key.strip() for key in row.xpath("th/text()") if key.strip()]
                        position_value_list = [''.join(value.xpath(".//text()")).strip() for value in row.xpath("td")]
                        while len(position_value_list) < len(position_key_list):
                            position_value_list.append('')  # pad with empty strings at the end
                        for key, value in zip(position_key_list, position_value_list):
                            if ":" in value:
                                value = value.replace(":", "")
                            if "\u3000\u3000" in key:
                                key = key.replace("\u3000\u3000", "")
                            if "\r\n " in value:
                                value = value.replace("\r\n ", "")
                            job_info[key] = value
                    fl = xpathobj.xpath("//div[@class='s12_div']/text()")
                    job_info["福利"] = fl
                    yq = xpathobj.xpath("//div[@class='requirement']/div[@class='content']/text()")
                    yq = [i.replace('\r\n ', '').replace('\r', '').strip() for i in yq if i.strip()]
                    job_info["要求"] = yq
                    lxk = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/span/text()")
                    lxk = [i.replace(' ', '').strip() for i in lxk if i.strip()]
                    lxv = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/text()")
                    lxv = [i.replace(':', '').strip() for i in lxv if i.strip()]
                    lximg = xpathobj.xpath("//div[@class='contactus']/div[@class='content']/ul/li/img/@src")
                    if len(yq) == 0 and len(lxk) == 0:
                        q.append(url)
                        continue
                    # Contact details rendered as images carry the real value in the src query string.
                    if lxv[1] == '' and lxv[2] == '':
                        lxv[1] = lximg[0].split('value=')[1]
                        lxv[2] = lximg[1].split('value=')[1]
                    lx = dict(zip(lxk, lxv))
                    job_info["联系"] = lx
                    # time.sleep(1)
                    position["job_info"] = job_info
                    print("=====", y, "=====")
                    y += 1
        with open("job_info.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(plist, ensure_ascii=False))
        with open("position_info_back.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(c, ensure_ascii=False))


if __name__ == '__main__':
    tsrcw = Tsrcw()
    tsrcw.get_poition_info()