# Scrapy settings for the TS_resume_spider project

# Project name; used in the default User-Agent and for internal references
BOT_NAME = "TS_resume_spider"

# Module(s) where the spider classes live
SPIDER_MODULES = ["TS_resume_spider.spiders"]
# Where newly generated spiders are placed by default
NEWSPIDER_MODULE = "TS_resume_spider.spiders"

# Custom User-Agent; Scrapy's default is used otherwise, and it can be changed to mimic a browser
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"
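# A browser-style value could look like the following (an illustrative
# string, not taken from the original project):
# USER_AGENT = (
#     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
#     "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# )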

# Whether to obey robots.txt rules (False recommended here)
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 64  # raised from the default; lower it to reduce server load and avoid dropped connections

# Delay between requests to the same website (seconds); helps avoid getting blocked
DOWNLOAD_DELAY = 0.1
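# Note that by default Scrapy randomizes the actual wait to between 0.5x and
# 1.5x of DOWNLOAD_DELAY, which makes the request pattern look less mechanical:
# RANDOMIZE_DOWNLOAD_DELAY = True  # Scrapy's default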

# Maximum concurrent requests per domain
CONCURRENT_REQUESTS_PER_DOMAIN = 64

# Maximum concurrent requests per IP (0 = disabled, the default);
# when non-zero it takes precedence over the per-domain limit
# CONCURRENT_REQUESTS_PER_IP = 8

# Disable cookies; enable this if the crawl needs to keep session state
COOKIES_ENABLED = False
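# With the cookies middleware disabled, a session cookie can still be sent by
# setting the Cookie header on individual requests in the spider
# (illustrative snippet, not from this project):
# yield scrapy.Request(url, headers={"Cookie": "session=..."})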

# Default request headers; override per request when a spider needs something special
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "zh-CN,zh;q=0.9",
# }

# AutoThrottle adjusts the download delay dynamically based on load,
# which helps avoid getting blocked (disabled here)
AUTOTHROTTLE_ENABLED = False
# Initial download delay (seconds)
AUTOTHROTTLE_START_DELAY = 0
# Maximum download delay (seconds), useful against slow or overloaded servers
AUTOTHROTTLE_MAX_DELAY = 60
# Average number of requests to send in parallel to each remote server;
# 1.0 means one at a time
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
# Whether to log every throttling adjustment
AUTOTHROTTLE_DEBUG = False

# Enable request retries to handle connection failures and network errors
RETRY_ENABLED = True
# Number of retries per request (default: 2); raise it to tolerate an unstable network
RETRY_TIMES = 5
# Retry on these network errors, server errors, and rate-limit responses
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
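# A single request can override the retry limit through its meta
# (illustrative snippet, not from this project):
# yield scrapy.Request(url, meta={"max_retry_times": 10})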

# Item pipelines in use; lower numbers run earlier
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
}
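# For reference, each class listed above is expected to live in
# TS_resume_spider/pipelines.py and implement process_item(); a minimal
# sketch (the method body is an assumption, not this project's actual code):
#
# class YTSpiderPipeline:
#     def process_item(self, item, spider):
#         # clean or normalize item fields here, then pass the item along
#         return item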

# Output file encoding, to prevent garbled Chinese characters
FEED_EXPORT_ENCODING = "utf-8"

# Use the newer request-fingerprinting implementation for forward compatibility
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
# Use the asyncio-based Twisted reactor
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# HTTP caching can be added later if needed (optional)
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Persist crawl state on disk so the job can be paused and resumed
JOBDIR = 'job_info/ts_resume_spider'
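# With JOBDIR set, the crawl can be stopped cleanly with a single Ctrl-C and
# later resumed from the saved state by rerunning the same command, e.g.:
#   scrapy crawl <spider_name>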