76 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Scrapy settings for the TS_resume_spider project.
# Project name; used in the default User-Agent and for internal identification.
BOT_NAME = "TS_resume_spider"
# Module path(s) where Scrapy looks for spider classes.
SPIDER_MODULES = ["TS_resume_spider.spiders"]
# Default module where `scrapy genspider` creates new spiders.
NEWSPIDER_MODULE = "TS_resume_spider.spiders"
# Custom User-Agent (Scrapy's default is used when unset; can mimic a browser).
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"
# Whether to obey robots.txt rules (disabled for this project).
ROBOTSTXT_OBEY = False
# Maximum concurrent requests performed by Scrapy (Scrapy default: 16).
CONCURRENT_REQUESTS = 64 # NOTE(review): original comment said "set to 8 to reduce server pressure" but the value is 64 — confirm the intended load.
# Delay (seconds) between requests to the same website; helps avoid blocking.
DOWNLOAD_DELAY = 0.1
# Upper limit on concurrent requests per single domain.
CONCURRENT_REQUESTS_PER_DOMAIN = 64
# Per-IP concurrency limit (disabled; Scrapy defaults to per-domain limiting).
# CONCURRENT_REQUESTS_PER_IP = 8
# Disable cookies (enable if session state needs to be tracked).
COOKIES_ENABLED = False
# Default request headers; set per-request instead when special headers are needed.
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "zh-CN,zh;q=0.9",
# }
# AutoThrottle: adjusts download delay dynamically based on load (disabled here).
AUTOTHROTTLE_ENABLED = False
# Initial download delay in seconds.
AUTOTHROTTLE_START_DELAY = 0
# Maximum download delay (seconds), applied when the server responds slowly.
AUTOTHROTTLE_MAX_DELAY = 60
# Target average number of parallel requests (1.0 means one at a time).
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
# Log every throttling adjustment when True.
AUTOTHROTTLE_DEBUG = False
# Enable request retries to cope with connection failures and network errors.
RETRY_ENABLED = True
# Retry count (Scrapy default is 2); raised to tolerate an unstable network.
RETRY_TIMES = 5
# HTTP status codes that trigger a retry (server errors, timeouts, rate limits).
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
# Item pipelines to run; lower numbers execute earlier.
ITEM_PIPELINES = {
'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
'TS_resume_spider.pipelines.YTSavePipeline': 500,
}
# Output file encoding; prevents garbled non-ASCII (e.g. Chinese) text.
FEED_EXPORT_ENCODING = "utf-8"
# Use the 2.7 request-fingerprinter implementation for forward compatibility.
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
# Run Twisted on the asyncio-based reactor.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# HTTP cache settings (disabled; can be enabled later if needed).
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Persist crawl state here so an interrupted run can be resumed.
JOBDIR = 'job_info/ts_resume_spider'