Add FnrcVipSpider and QJ050ComSpider; update pipelines and db logic for new sources
commit f153c6d250 (parent 90217778be)
@@ -59,7 +59,7 @@ class YTSpiderPipeline:
         return datetime(2019, 12, 12)

     def process_item(self, item, spider):
-        if spider.name != 'yutian_top':
+        if spider.name not in ['yutian_top', 'fnrc_vip']:
             return item
         experience = item.get("experience", [])
         for j in range(4):
@@ -93,15 +93,18 @@ class YTSpiderPipeline:

         if "update_time" in item:
             item["update_time"] = self.parse_datetime(item["update_time"])

-        item["source_id"] = 2
+        if spider.name == "yutian_top":
+            item["source_id"] = 2
+        elif spider.name == "fnrc_vip":
+            item["source_id"] = 3
+        else:
+            item["source_id"] = None
         return item


 class YTSavePipeline:
     def process_item(self, item, spider):
-        if spider.name not in ['yutian_top', 'zhrczp_com']:
+        if spider.name not in ['yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com']:
             return item
         resume_id = item.get("resume_id")
         if not resume_id:
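The per-spider source_id branch above grows with every new source; a dict-based lookup is one possible refactor. A minimal sketch only, assuming the mapping from the pipeline and spiders in this commit (the SOURCE_IDS and assign_source_id names are illustrative, not part of the codebase):

# Hypothetical alternative to the if/elif chain above; not part of this commit.
SOURCE_IDS = {
    'yutian_top': 2,
    'fnrc_vip': 3,
    'qj050_com': 4,   # qj050_com currently sets source_id = 4 inside its own spider
}

def assign_source_id(item, spider_name):
    # Falls back to None for unknown spiders, mirroring the else branch above.
    item["source_id"] = SOURCE_IDS.get(spider_name)
    return item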
@@ -1,93 +1,75 @@
-# Scrapy settings for TS_resume_spider project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# Configuration file for the TS_resume_spider Scrapy project

+# Project name; used in the default User-Agent and internally
 BOT_NAME = "TS_resume_spider"

+# Module path where the spider classes live
 SPIDER_MODULES = ["TS_resume_spider.spiders"]
+# Where newly generated spiders are placed by default
 NEWSPIDER_MODULE = "TS_resume_spider.spiders"

-# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# Custom User-Agent; Scrapy's default is used if unset, can be changed to mimic a browser
 # USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

-# Obey robots.txt rules
+# Whether to obey robots.txt rules (False recommended here)
 ROBOTSTXT_OBEY = False

-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
+# Maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 64  # lower this to reduce server load and avoid dropped connections

-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
+# Delay between requests to the same site (seconds); helps avoid being blocked
+DOWNLOAD_DELAY = 0.1

-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
+# Maximum concurrent requests per domain (upper limit)
+CONCURRENT_REQUESTS_PER_DOMAIN = 64

-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
+# Per-IP concurrency cap (disabled by default)
+# CONCURRENT_REQUESTS_PER_IP = 8

-# Override the default request headers:
+# Whether to disable cookies; enable them if session state must be kept
+COOKIES_ENABLED = False

+# Default request headers; set per-request headers when something special is needed
 # DEFAULT_REQUEST_HEADERS = {
 #    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
+#    "Accept-Language": "zh-CN,zh;q=0.9",
 # }

-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
-# }
+# AutoThrottle (adjusts the download delay on the fly); helps avoid being blocked
+AUTOTHROTTLE_ENABLED = False
+# Initial download delay (seconds)
+AUTOTHROTTLE_START_DELAY = 0
+# Maximum download delay (seconds), applied against high-latency servers
+AUTOTHROTTLE_MAX_DELAY = 60
+# Average concurrency per remote server; 1.0 means one request at a time
+AUTOTHROTTLE_TARGET_CONCURRENCY = 10
+# Whether to log every throttling adjustment
+AUTOTHROTTLE_DEBUG = False

-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
-# }
+# Enable request retries to handle connection failures and network errors
+RETRY_ENABLED = True
+# Number of retries (default 2); raise it to tolerate an unstable network
+RETRY_TIMES = 5
+# Retry on these network errors, server errors and rate-limit responses
+RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }

-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# Item pipelines to use; lower numbers run first
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
 }

-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
+# Feed export encoding; prevents garbled Chinese in output files
+FEED_EXPORT_ENCODING = "utf-8"

-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# Use the future-proof request fingerprinter for forward compatibility
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

+# HTTP caching could be added later if needed (not required)
 # HTTPCACHE_ENABLED = True
 # HTTPCACHE_EXPIRATION_SECS = 0
 # HTTPCACHE_DIR = "httpcache"
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
+JOBDIR = 'job_info/ts_resume_spider'
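These values are project-wide defaults. If one site needs gentler treatment than CONCURRENT_REQUESTS = 64 and DOWNLOAD_DELAY = 0.1, a spider can override them locally via Scrapy's custom_settings attribute. A minimal sketch with an illustrative spider name (not part of this commit):

import scrapy

class SlowSiteSpider(scrapy.Spider):
    # Hypothetical spider, shown only to illustrate per-spider overrides.
    name = 'slow_site'

    # custom_settings takes precedence over the project settings above.
    custom_settings = {
        'DOWNLOAD_DELAY': 1.0,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
    }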
TS_resume_spider/spiders/fnrc_vpi.py (new file, 112 lines)
@@ -0,0 +1,112 @@
import requests
import scrapy
import json


# Avoid running this spider unless you have to: it is painfully slow.


class FnrcVipSpider(scrapy.Spider):
    name = 'fnrc_vip'

    def start_requests(self):
        url = "https://www.fnrc.vip/job/company/v1/resume/page"
        cookies = {
            'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
        }

        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.fnrc.vip',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }

        # One blocking HTTP session is reused for every page request.
        session = requests.Session()
        session.headers.update(headers)
        session.cookies.update(cookies)
        for page in range(1, 6):
            payload = {
                'step': 1000,
                'page': page,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }

            response = session.post(url, json=payload)
            data = response.json()

            # Yield the raw resume dicts; the pipelines assign source_id = 3 and save them.
            for item in data.get('data', []):
                yield item
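FnrcVipSpider fetches every page with a blocking requests.Session inside start_requests and yields plain dicts instead of Request objects, so the download bypasses Scrapy's scheduler, retry and throttling machinery, and whether items yielded from start_requests are accepted at all depends on the Scrapy version. A non-blocking sketch of the same pagination using JsonRequest, assuming a recent Scrapy; the headers, cookies and full payload from above are omitted for brevity and would still be needed:

import scrapy
from scrapy.http import JsonRequest

class FnrcVipJsonSpider(scrapy.Spider):
    # Hypothetical variant of FnrcVipSpider; not part of this commit.
    name = 'fnrc_vip_json'

    def start_requests(self):
        url = "https://www.fnrc.vip/job/company/v1/resume/page"
        for page in range(1, 6):
            # JsonRequest serializes the dict body and sets the JSON content-type header.
            yield JsonRequest(url, data={'step': 1000, 'page': page}, callback=self.parse)

    def parse(self, response):
        # Same item shape as the blocking version; pipelines handle source_id and saving.
        for item in response.json().get('data', []):
            yield item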
TS_resume_spider/spiders/qj050_com.py (new file, 108 lines)
@@ -0,0 +1,108 @@
import json
import scrapy
import time
import urllib.parse
from datetime import datetime
from scrapy.http import Response


class QJ050ComSpider(scrapy.Spider):
    name = 'qj050_com'
    allowed_domains = ['qj050.com']
    start_urls = ['https://www.qj050.com/api/v1/resumes']

    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'authorization': 'Bearer <your token>',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'x-platform': '1',
        'x-site-id': 'undefined',
    }
    cookies = {
        'x-trace-id': '7d60110f6a7a4df595db14e54ee772dd',
        'has_login_log': 'yes',
        'HMACCOUNT': '52014CC932A93E9B',
        'token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQ1NzU3MjM4LCJleHAiOjE3NzcyOTMyMzh9.uU9G81yizRRUYCyJymit4n9vuysXCT-2V9PLdmdohgA',
        'token.sig': 'Zta83bKMN9mPlsm9ZVnv7PaA7MwJZrLYHYrQK4Ft1rY',
        'logged': '1',
    }

    def start_requests(self):
        for page in range(1, 5):
            params = {
                '_': str(int(time.time() * 1000)),
                'tab': 'resume',
                'pageSize': '1000',
                'pageIndex': str(page),
                'showStatus': 'true',
            }
            query_string = urllib.parse.urlencode(params)
            url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse
            )

    def parse(self, response: Response, **kwargs):
        data = json.loads(response.text)
        for item in data['data']['items']:
            resume_id = item['id']
            detail_url = f"https://www.qj050.com/api/v1/resume/{resume_id}?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
            yield scrapy.Request(
                url=detail_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_detail
            )

    def parse_detail(self, response):
        info = json.loads(response.text).get('data', {})

        data = {}

        # Plain scalar fields
        data['resume_id'] = info.get('id')
        data['name'] = info.get('name') or None
        data['age'] = int(info.get('age')) if info.get('age') else None
        data['birthday'] = info.get('birthday') or None
        data['work_years'] = info.get('work_exp_value') or None
        data['highest_education'] = info.get('edu_value') or None
        data['marital_status'] = info.get('marriage_value') or None
        data['phone'] = info.get('phone') or None
        data['intended_position'] = ','.join(
            [item.get('name') for item in info.get('infoCateforyArrObj', [])]) if info.get(
            'infoCateforyArrObj') else None
        data['expected_salary'] = info.get('salaryDesc') or None
        data['job_property'] = info.get('work_type_value') or None
        data['job_status'] = info.get('job_instant_value') or None
        data['job_location'] = info.get('job_region_value') or None

        # Update time: convert last_edit_time to datetime when present
        if 'last_edit_time' in info and info.get('last_edit_time'):
            try:
                data['update_time'] = datetime.strptime(info['last_edit_time'], "%Y-%m-%d %H:%M:%S")
            except Exception:
                data['update_time'] = None
        else:
            data['update_time'] = None

        # Work experience: keep at most four entries
        works = info.get('works', [])
        for i in range(4):
            if i < len(works):
                company = works[i].get('company', '')
                content = works[i].get('content', '')
                combined = f"{company}:{content}" if company or content else ''
                data[f'work_{i + 1}_experience'] = combined or None
            else:
                data[f'work_{i + 1}_experience'] = None

            data[f'work_{i + 1}_time'] = None
            data[f'work_{i + 1}_description'] = None

        data['source_id'] = 4
        data['crawl_keywords'] = ''
        yield data
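Because parse_detail is a pure JSON-to-dict mapping, it can be sanity-checked without a live crawl by feeding it a fabricated response. A small sketch (the payload values are illustrative, not real API output):

import json
from scrapy.http import TextResponse

def demo_parse_detail():
    # Fabricated detail payload with only a few of the fields the spider reads.
    body = json.dumps({
        'data': {
            'id': 123,
            'name': 'Test',
            'age': '30',
            'last_edit_time': '2025-01-01 10:00:00',
            'works': [{'company': 'ACME', 'content': 'sales'}],
        }
    })
    response = TextResponse(url='https://www.qj050.com/api/v1/resume/123',
                            body=body, encoding='utf-8')
    spider = QJ050ComSpider()
    # Returns the single mapped dict that would normally flow into the pipelines.
    return list(spider.parse_detail(response))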
@@ -51,16 +51,23 @@ class DB:
     def insert_resume(cls, data: dict):
         cls.init()  # make sure the connection is initialized

         # Keep only basic data types
         safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}

+        if 'resume_id' not in safe_data or 'source_id' not in safe_data:
+            # both source_id and resume_id are required
+            return
+
         table = 'resumes_resumebasic'
         keys = ', '.join(safe_data.keys())
         placeholders = ', '.join(['%s'] * len(safe_data))
-        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
+        # note: exclude both source_id and resume_id from the UPDATE clause
+        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')])

         sql = f"""
             INSERT INTO {table} ({keys}) VALUES ({placeholders})
             ON DUPLICATE KEY UPDATE {update_clause}
         """

         cls._client.execute(sql, list(safe_data.values()))
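For a record such as {'resume_id': 1, 'source_id': 3, 'name': 'Test'}, the statement insert_resume builds would look roughly as follows; a sketch only, assuming the table's unique key spans resume_id and source_id, which the code implies but does not show:

# Illustrative walk-through of the clause building in insert_resume.
safe_data = {'resume_id': 1, 'source_id': 3, 'name': 'Test'}
keys = ', '.join(safe_data)                            # "resume_id, source_id, name"
placeholders = ', '.join(['%s'] * len(safe_data))      # "%s, %s, %s"
update_clause = ', '.join(f"{k} = VALUES({k})"
                          for k in safe_data
                          if k not in ('source_id', 'resume_id'))  # "name = VALUES(name)"
sql = (f"INSERT INTO resumes_resumebasic ({keys}) VALUES ({placeholders}) "
       f"ON DUPLICATE KEY UPDATE {update_clause}")
# Executed with list(safe_data.values()) as parameters; an existing row with the
# same unique key is updated in place instead of inserted again.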
@@ -7,10 +7,11 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 from TS_resume_spider.spiders.yutian_top import YutianTopSpider
 from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
+from TS_resume_spider.spiders.fnrc_vpi import FnrcVipSpider


 def main():
     process = CrawlerProcess(get_project_settings())
     process.crawl(ZunHuaComSpider)
+    process.crawl(FnrcVipSpider)
     process.start()


 if __name__ == '__main__':
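main() registers only FnrcVipSpider from the two new sources. If qj050_com should run in the same process, the presumed wiring would look like the sketch below (not part of this commit; it reuses the CrawlerProcess imports already in the file):

from TS_resume_spider.spiders.qj050_com import QJ050ComSpider

def main_with_qj050():
    # Same CrawlerProcess pattern as main(), with the second new spider registered too.
    process = CrawlerProcess(get_project_settings())
    process.crawl(FnrcVipSpider)
    process.crawl(QJ050ComSpider)
    process.start()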