Crawler/web/yutian_top/MRequest.py
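
"""MRequest: a thin wrapper around requests.Session.

Provides a session bound to one host with an optional local proxy, retries with
exponential backoff, and an ExtendedResponse that adds xpath() and to_dict().
"""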

import logging
import time
from types import SimpleNamespace

import requests
from lxml import etree

# DEBUG level makes retry warnings and urllib3's per-request logging visible.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


class ExtendedResponse(requests.Response):
    """requests.Response with XPath and attribute-style JSON access helpers."""

    def xpath(self, xpath_expr):
        """Parse the body as HTML and evaluate an XPath expression against it."""
        try:
            tree = etree.HTML(self.text)
            return tree.xpath(xpath_expr)
        except Exception as e:
            raise ValueError("XPath parsing error: " + str(e))

    def to_dict(self):
        """Decode the JSON body into nested SimpleNamespace objects."""
        try:
            data = self.json()
            return self.dict_to_obj(data)
        except Exception as e:
            raise ValueError("JSON conversion error: " + str(e))

    @staticmethod
    def dict_to_obj(d):
        # Recursively convert dicts to SimpleNamespace (and lists element-wise)
        # so JSON keys can be read as attributes.
        if isinstance(d, dict):
            return SimpleNamespace(**{k: ExtendedResponse.dict_to_obj(v) for k, v in d.items()})
        elif isinstance(d, list):
            return [ExtendedResponse.dict_to_obj(item) for item in d]
        else:
            return d
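
# Illustrative behaviour of dict_to_obj (the values here are made up):
#   obj = ExtendedResponse.dict_to_obj({"user": {"name": "a", "tags": [1, 2]}})
#   obj.user.name -> "a"
#   obj.user.tags -> [1, 2]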


class MyRequests:
    """A requests.Session bound to one host, with retries and optional proxying."""

    def __init__(self, base_url, protocol='http', retries=3, proxy_options=True,
                 default_timeout=10, default_cookies=None):
        self.base_url = base_url.rstrip('/')
        self.protocol = protocol
        self.retries = retries
        self.default_timeout = default_timeout
        self.session = requests.Session()
        if proxy_options:
            # Route all traffic through a local proxy listening on port 7890.
            self.session.proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
        if default_cookies:
            self.session.cookies.update(default_cookies)

    def _build_url(self, url):
        # Absolute URLs pass through; relative paths are joined onto the base host.
        if url.startswith(("http://", "https://")):
            return url
        return f"{self.protocol}://{self.base_url}/{url.lstrip('/')}"
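
    # e.g. with base_url="httpbin.org" and protocol="https":
    #   _build_url("/get")          -> "https://httpbin.org/get"
    #   _build_url("https://x.y/z") -> "https://x.y/z"  (hypothetical absolute URL, unchanged)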

    def set_default_headers(self, headers):
        self.session.headers.update(headers)

    def set_default_cookies(self, cookies):
        self.session.cookies.update(cookies)

    def get(self, url, params=None, headers=None, cookies=None, **kwargs):
        full_url = self._build_url(url)
        return self._request("GET", full_url, params=params, headers=headers, cookies=cookies, **kwargs)

    def post(self, url, data=None, json=None, headers=None, cookies=None, **kwargs):
        full_url = self._build_url(url)
        return self._request("POST", full_url, data=data, json=json, headers=headers, cookies=cookies, **kwargs)

    def put(self, url, data=None, json=None, headers=None, cookies=None, **kwargs):
        full_url = self._build_url(url)
        return self._request("PUT", full_url, data=data, json=json, headers=headers, cookies=cookies, **kwargs)

    def delete(self, url, params=None, headers=None, cookies=None, **kwargs):
        full_url = self._build_url(url)
        return self._request("DELETE", full_url, params=params, headers=headers, cookies=cookies, **kwargs)

    def _request(self, method, url, retries=None, **kwargs):
        if retries is None:
            retries = self.retries
        if 'timeout' not in kwargs:
            kwargs['timeout'] = self.default_timeout
        # Relative referer headers are expanded against the base URL as well.
        if 'headers' in kwargs and kwargs['headers']:
            headers = kwargs['headers']
            if 'referer' in headers:
                headers['referer'] = self._build_url(headers['referer'])
        try:
            response = self.session.request(method, url, **kwargs)
            response.raise_for_status()
            # Re-class the instance so the xpath()/to_dict() helpers are available.
            response.__class__ = ExtendedResponse
            return response
        except Exception as e:
            if retries > 0:
                logging.warning(f"Request {method} {url} failed, {retries} retries left, error: {e}")
                time.sleep(2 ** (self.retries - retries))  # exponential backoff: 1s, 2s, 4s, ...
                return self._request(method, url, retries=retries - 1, **kwargs)
            else:
                logging.error(f"Request {method} {url} exhausted all retries")
                raise
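
    # With the default retries=3 a persistently failing request is attempted
    # four times in total, sleeping 1s, 2s, and 4s between attempts.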

    def close(self):
        self.session.close()


if __name__ == '__main__':
    req = MyRequests("httpbin.org", protocol="https", retries=3, proxy_options=True,
                     default_timeout=5, default_cookies={"session": "abc"})
    req.set_default_headers({"User-Agent": "MyRequests/1.0"})
    try:
        resp = req.get("/get", headers={"referer": "/page"})
        logging.info("Status code: %s", resp.status_code)
        logging.info("JSON: %s", resp.json())
        logging.info("XPath: %s", resp.xpath('//title/text()'))
        obj = resp.to_dict()
        logging.info("Converted object: %s", obj)
    except Exception as ex:
        logging.error("Request failed: %s", ex)
    finally:
        req.close()
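
# Design note: the retry logic above is hand-rolled and recursive. A common
# alternative (a sketch, not used by this module) is transport-level retries
# via urllib3's Retry mounted on an HTTPAdapter:
#
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#
#   retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
#   adapter = HTTPAdapter(max_retries=retry)
#   session.mount("https://", adapter)
#   session.mount("http://", adapter)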