Crawler/web/qj050_com/Requests_Except.py

209 lines
8.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
import requests
import logging
import time
from lxml import etree
from types import SimpleNamespace
from http.cookies import SimpleCookie
# Configure the root logger at import time: DEBUG and above, with a
# "timestamp - level - message" record layout.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
class ExtendedResponse(requests.Response):
    """``requests.Response`` extended with parsing and cookie helpers.

    Every helper wraps any failure in a ``ValueError`` whose message starts
    with a fixed prefix; the original exception is attached as ``__cause__``.
    """

    def xpath(self):
        """Parse the response text as HTML and return the lxml tree.

        No XPath expression is evaluated here; the caller queries the
        returned tree.

        Raises:
            ValueError: if parsing the body fails.
        """
        try:
            return etree.HTML(self.text)
        except Exception as e:
            raise ValueError("XPath解析错误: " + str(e)) from e

    def to_Dict(self):
        """Decode the JSON body and convert it with :meth:`dict_to_obj`.

        Returns:
            The decoded payload, with dicts turned into ``SimpleNamespace``
            objects (recursively, including inside lists).

        Raises:
            ValueError: if the body cannot be decoded or converted.
        """
        try:
            return self.dict_to_obj(self.json())
        except Exception as e:
            raise ValueError("JSON转换错误: " + str(e)) from e

    def to_Re_findall(self, regex):
        """Return ``re.findall(regex, self.text)``.

        Args:
            regex: pattern passed to ``re.findall``.

        Raises:
            ValueError: if the search fails (for example an invalid pattern).
        """
        try:
            return re.findall(regex, self.text)
        except Exception as e:
            raise ValueError("Re搜索错误: " + str(e)) from e

    def cookies_dict(self):
        """Return the response cookies as a plain ``dict``.

        Starts from ``self.cookies.get_dict()`` and, when a ``Set-Cookie``
        header is present, overlays every name/value pair parsed from it.

        Raises:
            ValueError: if collecting or parsing the cookies fails.
        """
        try:
            # Start from the cookies already held in the response's jar.
            cookie_dict = self.cookies.get_dict()
            # Supplement them with whatever the Set-Cookie header carries.
            # NOTE(review): repeated Set-Cookie headers are presumably exposed
            # here as one comma-joined string -- confirm SimpleCookie parses
            # that form correctly for the sites being crawled.
            if 'Set-Cookie' in self.headers:
                sc = SimpleCookie()
                sc.load(self.headers['Set-Cookie'])
                for key, morsel in sc.items():
                    cookie_dict[key] = morsel.value
            return cookie_dict
        except Exception as e:
            raise ValueError("Cookies转换错误: " + str(e)) from e

    def save_cookies(self, filepath, format='json'):
        """Write the cookies of this response to a file.

        Args:
            filepath (str): destination path.
            format (str): one of 'json', 'pickle' or 'txt' (case-insensitive);
                defaults to 'json'. 'txt' writes one ``key: value`` line per
                cookie.

        Raises:
            ValueError: on an unsupported format or any I/O / serialisation
                error.
        """
        try:
            cookie_dict = self.cookies_dict()
            fmt = format.lower()
            if fmt == 'json':
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(cookie_dict, f, ensure_ascii=False, indent=4)
            elif fmt == 'pickle':
                import pickle
                with open(filepath, 'wb') as f:
                    pickle.dump(cookie_dict, f)
            elif fmt == 'txt':
                with open(filepath, 'w', encoding='utf-8') as f:
                    for key, value in cookie_dict.items():
                        f.write(f"{key}: {value}\n")
            else:
                raise ValueError("不支持的格式,请选择 'json''pickle''txt'")
        except Exception as e:
            raise ValueError("保存cookies出错: " + str(e)) from e

    @staticmethod
    def dict_to_obj(d):
        """Recursively convert dicts to ``SimpleNamespace`` objects.

        Lists are converted element by element; any other value is returned
        unchanged.
        """
        if isinstance(d, dict):
            return SimpleNamespace(**{k: ExtendedResponse.dict_to_obj(v) for k, v in d.items()})
        elif isinstance(d, list):
            return [ExtendedResponse.dict_to_obj(item) for item in d]
        else:
            return d
class MyRequests:
    """Wrapper around ``requests.Session`` with retries and cookie persistence."""

    def __init__(self, base_url, protocol='http', retries=3, proxy_options=True, default_timeout=10,
                 default_cookies=None):
        """Create the session and load previously saved cookies, if any.

        The cookie file name is derived from ``base_url`` (for example
        "www_zhrczp_com_cookies.json"). ``default_cookies`` are applied to the
        session first; cookies loaded from that file are applied afterwards.

        Args:
            base_url (str): host (optionally with a path) prepended to
                relative request URLs; trailing '/' is stripped.
            protocol (str): scheme used when building URLs (default 'http').
            retries (int): default number of retries per request.
            proxy_options (bool): when truthy, route http and https traffic
                through the proxy at http://127.0.0.1:7890.
            default_timeout (int): timeout used when a call passes none.
            default_cookies (dict): cookies to seed the session with.
        """
        self.base_url = base_url.rstrip('/')
        self.protocol = protocol
        self.retries = retries
        self.default_timeout = default_timeout
        self.session = requests.Session()
        if proxy_options:
            self.session.proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
        # Seed the session with default_cookies first; cookies loaded from the
        # local file below are applied afterwards, so they win on a name clash.
        if default_cookies:
            self.session.cookies.update(default_cookies)
        self.cookie_file = self._cookie_filename(self.base_url)
        # Try to load cookies saved by an earlier run.
        try:
            with open(self.cookie_file, 'r', encoding='utf-8') as f:
                loaded_cookies = json.load(f)
            self.session.cookies.update(loaded_cookies)
            logging.info("成功加载本地 cookies")
        except FileNotFoundError:
            logging.info("本地 cookies 文件不存在,将在请求后自动保存")
        except Exception as e:
            logging.error("加载本地 cookies 失败:" + str(e))

    @staticmethod
    def _cookie_filename(base_url):
        """Return the cookie file name for ``base_url``.

        '.' is replaced by '_' as before; '/' is replaced as well so that a
        base URL with a path component does not yield a name pointing into a
        non-existent directory.
        """
        safe_name = base_url.replace('.', '_').replace('/', '_')
        return f"{safe_name}_cookies.json"

    def _save_cookies(self):
        """Write the session cookies to ``self.cookie_file`` as JSON.

        Failures are logged and not raised.
        """
        try:
            with open(self.cookie_file, 'w', encoding='utf-8') as f:
                json.dump(self.session.cookies.get_dict(), f, ensure_ascii=False, indent=4)
            logging.info("cookies 已保存到本地文件:" + self.cookie_file)
        except Exception as e:
            logging.error("保存 cookies 文件失败:" + str(e))

    def _build_url(self, url):
        """Return ``url`` unchanged if absolute, else join it onto the base URL."""
        if url.startswith(("http://", "https://")):
            return url
        return f"{self.protocol}://{self.base_url}/{url.lstrip('/')}"

    def set_default_headers(self, headers):
        """Merge ``headers`` into the session's default headers."""
        self.session.headers.update(headers)

    def set_default_cookies(self, cookies):
        """Merge ``cookies`` into the session and save the cookie file."""
        self.session.cookies.update(cookies)
        self._save_cookies()

    def get(self, url, params=None, headers=None, cookies=None, **kwargs):
        """Send a GET request; extra keyword arguments go to :meth:`_request`."""
        full_url = self._build_url(url)
        return self._request("GET", full_url, params=params, headers=headers, cookies=cookies, **kwargs)

    def post(self, url, data=None, json=None, headers=None, cookies=None, **kwargs):
        """Send a POST request; extra keyword arguments go to :meth:`_request`."""
        full_url = self._build_url(url)
        return self._request("POST", full_url, data=data, json=json, headers=headers, cookies=cookies, **kwargs)

    def update(self, url, data=None, json=None, headers=None, cookies=None, **kwargs):
        """Send a PUT request; extra keyword arguments go to :meth:`_request`."""
        full_url = self._build_url(url)
        return self._request("PUT", full_url, data=data, json=json, headers=headers, cookies=cookies, **kwargs)

    def delete(self, url, params=None, headers=None, cookies=None, **kwargs):
        """Send a DELETE request; extra keyword arguments go to :meth:`_request`."""
        full_url = self._build_url(url)
        return self._request("DELETE", full_url, params=params, headers=headers, cookies=cookies, **kwargs)

    def _request(self, method, url, retries=None, autosave=False, **kwargs):
        """Send a request through the session, retrying on any exception.

        Args:
            method (str): HTTP method.
            url (str): absolute URL.
            retries (int | None): retries allowed for this call; ``None``
                means ``self.retries``.
            autosave (bool): save the session cookies to disk after a
                successful response.
            **kwargs: passed to ``Session.request``; ``timeout`` defaults to
                ``self.default_timeout``.

        Returns:
            The response, with its class rebound to ``ExtendedResponse``.

        Raises:
            Exception: the last error once the retry budget is used up. An
                error status raised by ``raise_for_status`` is retried like
                any other failure.
        """
        if retries is None:
            retries = self.retries
        kwargs.setdefault('timeout', self.default_timeout)
        attempt = 0  # number of failed attempts so far for THIS call
        while True:
            try:
                response = self.session.request(method, url, **kwargs)
                response.raise_for_status()
                self.session.cookies.update(response.cookies)
                if 'Set-Cookie' in response.headers:
                    sc = SimpleCookie()
                    sc.load(response.headers['Set-Cookie'])
                    for key, morsel in sc.items():
                        # Skip cookies whose value is the literal "deleted".
                        if morsel.value.lower() != 'deleted':
                            self.session.cookies.set(key, morsel.value)
                if autosave:
                    self._save_cookies()
                response.__class__ = ExtendedResponse
                return response
            except Exception as e:
                remaining = retries - attempt
                if remaining <= 0:
                    logging.error(f"请求 {method} {url} 重试次数用尽")
                    raise
                logging.warning(f"请求 {method} {url} 失败,剩余重试次数 {remaining},错误: {e}")
                # Exponential backoff (1s, 2s, 4s, ...) counted from this
                # call's first failure, so a per-call ``retries`` override no
                # longer skews the delay.
                time.sleep(2 ** attempt)
                attempt += 1

    def get_cookies(self):
        """Return the session cookies as a plain ``dict``.

        Raises:
            ValueError: if the cookies cannot be read.
        """
        try:
            return self.session.cookies.get_dict()
        except Exception as e:
            raise ValueError("获取 cookies 失败:" + str(e)) from e
class MR(MyRequests):
    """Short alias for ``MyRequests``; adds no behaviour of its own."""