Add FnrcVipSpider and QJ050ComSpider; update pipelines and db logic for new sources

晓丰 2025-04-27 22:00:23 +08:00
parent 90217778be
commit f153c6d250
6 changed files with 283 additions and 70 deletions

View File

@@ -59,7 +59,7 @@ class YTSpiderPipeline:
return datetime(2019, 12, 12)
def process_item(self, item, spider):
if spider.name != 'yutian_top':
if spider.name not in ['yutian_top','fnrc_vip']:
return item
experience = item.get("experience", [])
for j in range(4):
@@ -93,15 +93,18 @@ class YTSpiderPipeline:
if "update_time" in item:
item["update_time"] = self.parse_datetime(item["update_time"])
if spider.name == "yutian_top":
item["source_id"] = 2
elif spider.name == "fnrc_vip":
item["source_id"] = 3
else:
item["source_id"] = None
return item
class YTSavePipeline:
def process_item(self, item, spider):
if spider.name not in ['yutian_top' ,'zhrczp_com']:
if spider.name not in ['yutian_top' ,'zhrczp_com', 'fnrc_vip', 'qj050_com']:
return item
resume_id = item.get("resume_id")
if not resume_id:
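
The hunk ends before the save step. Given the DB.insert_resume helper updated later in this commit, the rest of YTSavePipeline.process_item presumably hands the item off roughly as in this sketch (the import path and exact body are assumptions, not part of the diff):

from TS_resume_spider.utils.db import DB  # import path assumed

class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com']:
            return item
        resume_id = item.get("resume_id")
        if not resume_id:
            # skip items without a primary identifier rather than writing partial rows
            return item
        DB.insert_resume(dict(item))  # upsert into resumes_resumebasic
        return item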

View File

@@ -1,93 +1,75 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Configuration file for the TS_resume_spider Scrapy project
# Project name; used by default in the User-Agent and for internal references
BOT_NAME = "TS_resume_spider"
# Module path where the spider classes live
SPIDER_MODULES = ["TS_resume_spider.spiders"]
# Default location for newly generated spiders
NEWSPIDER_MODULE = "TS_resume_spider.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Custom User-Agent (Scrapy's default is used otherwise; can be set to mimic a browser)
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"
# Obey robots.txt rules
# Whether to obey robots.txt rules (False recommended here)
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Maximum number of concurrent requests Scrapy performs (default: 16)
CONCURRENT_REQUESTS = 64  # concurrency level; lower it to reduce server load and avoid dropped connections
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Delay between requests to the same website (seconds); helps avoid being blocked
DOWNLOAD_DELAY = 0.1
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Maximum concurrent requests per domain (upper limit)
CONCURRENT_REQUESTS_PER_DOMAIN = 64
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Per-IP concurrency limit (disabled by default)
# CONCURRENT_REQUESTS_PER_IP = 8
# Override the default request headers:
# Whether to disable cookies (enable them if session state needs to be kept)
COOKIES_ENABLED = False
# Default request headers; override per request when special headers are needed
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# "Accept-Language": "zh-CN,zh;q=0.9",
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }
# Enable AutoThrottle (adjusts the download delay on the fly); helps avoid being blocked
AUTOTHROTTLE_ENABLED = False
# Initial download delay (seconds)
AUTOTHROTTLE_START_DELAY = 0
# Maximum download delay (seconds); useful against high-latency servers
AUTOTHROTTLE_MAX_DELAY = 60
# Target average concurrency (1.0 means one request at a time)
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
# Whether to log every throttling adjustment
AUTOTHROTTLE_DEBUG = False
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }
# Enable request retries; handles connection failures and other network errors
RETRY_ENABLED = True
# Retry count (default is 2); raise it to tolerate an unstable network
RETRY_TIMES = 5
# Retry on these network/server errors and rate-limit responses
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines to use; lower numbers run first
ITEM_PIPELINES = {
'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
'TS_resume_spider.pipelines.YTSavePipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Output file encoding; prevents garbled Chinese characters
FEED_EXPORT_ENCODING = "utf-8"
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# Use the future-proof request fingerprinting implementation
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# HTTP caching can be enabled later if needed; not required
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
JOBDIR = 'job_info/ts_resume_spider'
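
These values apply project-wide; JOBDIR additionally persists the scheduler state on disk, so an interrupted crawl resumes when restarted with the same directory. A spider that needs gentler pacing than these defaults can override them locally through Scrapy's standard custom_settings hook; a minimal sketch (spider name and values are illustrative, not part of this commit):

import scrapy

class SlowSourceSpider(scrapy.Spider):
    # Illustrative only; not part of this commit.
    name = "slow_source"

    # Per-spider overrides take precedence over settings.py
    custom_settings = {
        "DOWNLOAD_DELAY": 1.0,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 8,
        "AUTOTHROTTLE_ENABLED": True,
    }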

View File

@@ -0,0 +1,112 @@
import requests
import scrapy
import json
# Try not to run this spider if you can help it: the blocking requests calls make it very slow.
class FnrcVipSpider(scrapy.Spider):
name = 'fnrc_vip'
def start_requests(self):
url = "https://www.fnrc.vip/job/company/v1/resume/page"
cookies = {
'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
'company_sign': '',
'company_nonce': '',
'cuid': '',
}
headers = {
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'cache-control': 'no-cache',
'content-type': 'application/json;charset=UTF-8',
'origin': 'https://www.fnrc.vip',
'pragma': 'no-cache',
'priority': 'u=1, i',
'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
json_data = {
'step': 1000,
'page': 1,
'education_level': [],
'arrival_time': [],
'work_time': [],
'area_id': [],
'keywords': '',
'work_status': '',
'work_status_show': '求职状态',
'category_id': '',
'work_type': '',
'work_type_show': '是否兼职',
'sex': '',
'sex_show': '性别',
'is_head': '',
'is_head_show': '有无照片',
'job_id': '',
'age': [],
'age_show': '年龄',
'refresh_time': 0,
'site_id': '',
'site_id2': '',
'province': '',
'city': '',
'county': '',
'provinceArr': [],
'cityArr': [],
'countyArr': [],
'only_job_category': 0,
}
session = requests.Session()
session.headers.update(headers)
session.cookies.update(cookies)
for page in range(1, 6):
payload = dict(json_data, page=page)  # reuse the base query above, changing only the page number
response = session.post(url, json=payload)
data = response.json()
for item in data.get('data', []):
yield item
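
The requests.Session above blocks inside start_requests and bypasses Scrapy's downloader, which is why this spider is slow and why settings such as CONCURRENT_REQUESTS and RETRY_TIMES never apply to it. A non-blocking variant could route the same POSTs through Scrapy with JsonRequest; a minimal sketch (the trimmed payload stands in for the full payload, headers and cookies shown above):

import scrapy
from scrapy.http import JsonRequest

class FnrcVipAsyncSpider(scrapy.Spider):
    # Sketch of a non-blocking variant; not part of the commit.
    name = 'fnrc_vip_async'

    def start_requests(self):
        url = "https://www.fnrc.vip/job/company/v1/resume/page"
        base_payload = {'step': 1000, 'page': 1}  # reuse the full payload/headers/cookies from above
        for page in range(1, 6):
            yield JsonRequest(
                url,
                data={**base_payload, 'page': page},  # JSON-encoded body; method defaults to POST
                callback=self.parse,
            )

    def parse(self, response):
        # response.json() parses the JSON body (Scrapy >= 2.2)
        for item in response.json().get('data', []):
            yield item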

View File

@@ -0,0 +1,108 @@
import json
import scrapy
import time
import urllib.parse
from datetime import datetime
from scrapy.http import Response
class QJ050ComSpider(scrapy.Spider):
name = 'qj050_com'
allowed_domains = ['qj050.com']
start_urls = ['https://www.qj050.com/api/v1/resumes']
headers = {
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'authorization': 'Bearer <your token>',  # placeholder: replace with a real bearer token
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'x-platform': '1',
'x-site-id': 'undefined',
}
cookies = {
'x-trace-id': '7d60110f6a7a4df595db14e54ee772dd',
'has_login_log': 'yes',
'HMACCOUNT': '52014CC932A93E9B',
'token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQ1NzU3MjM4LCJleHAiOjE3NzcyOTMyMzh9.uU9G81yizRRUYCyJymit4n9vuysXCT-2V9PLdmdohgA',
'token.sig': 'Zta83bKMN9mPlsm9ZVnv7PaA7MwJZrLYHYrQK4Ft1rY',
'logged': '1',
}
def start_requests(self):
for page in range(1, 5):
params = {
'_': str(int(time.time() * 1000)),
'tab': 'resume',
'pageSize': '1000',
'pageIndex': str(page),
'showStatus': 'true',
}
query_string = urllib.parse.urlencode(params)
url = f"{self.start_urls[0]}?{query_string}"
yield scrapy.Request(
url=url,
method='GET',
headers=self.headers,
cookies=self.cookies,
callback=self.parse
)
def parse(self, response: Response, **kwargs):
data = json.loads(response.text)
for item in data['data']['items']:
resume_id = item['id']
detail_url = f"https://www.qj050.com/api/v1/resume/{resume_id}?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
yield scrapy.Request(
url=detail_url,
method='GET',
headers=self.headers,
cookies=self.cookies,
callback=self.parse_detail
)
def parse_detail(self, response):
info = json.loads(response.text).get('data', {})
data = {}
# Basic fields
data['resume_id'] = info.get('id')
data['name'] = info.get('name') or None
data['age'] = int(info.get('age')) if info.get('age') else None
data['birthday'] = info.get('birthday') or None
data['work_years'] = info.get('work_exp_value') or None
data['highest_education'] = info.get('edu_value') or None
data['marital_status'] = info.get('marriage_value') or None
data['phone'] = info.get('phone') or None
data['intended_position'] = ','.join(
[item.get('name') for item in info.get('infoCateforyArrObj', [])]) if info.get(
'infoCateforyArrObj') else None
data['expected_salary'] = info.get('salaryDesc') or None
data['job_property'] = info.get('work_type_value') or None
data['job_status'] = info.get('job_instant_value') or None
data['job_location'] = info.get('job_region_value') or None
# Update time (convert last_edit_time to a datetime when present)
if 'last_edit_time' in info and info.get('last_edit_time'):
try:
data['update_time'] = datetime.strptime(info['last_edit_time'], "%Y-%m-%d %H:%M:%S")
except Exception:
data['update_time'] = None
else:
data['update_time'] = None
# Work experience: map up to four entries onto the work_N_* columns
works = info.get('works', [])
for i in range(4):
if i < len(works):
company = works[i].get('company', '')
content = works[i].get('content', '')
combined = f"{company}:{content}" if company or content else ''
data[f'work_{i + 1}_experience'] = combined or None
else:
data[f'work_{i + 1}_experience'] = None
data[f'work_{i + 1}_time'] = None
data[f'work_{i + 1}_description'] = None
data['source_id'] = 4
data['crawl_keywords'] = ''
yield data
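
Since parse_detail only reads the JSON body, the field mapping can be exercised offline with a fabricated response; a small sketch (import path and sample values are made up):

import json
from scrapy.http import TextResponse
from TS_resume_spider.spiders.qj050_com import QJ050ComSpider  # module path assumed

sample = {'data': {'id': 1, 'name': '张三', 'age': '30', 'edu_value': '本科',
                   'works': [{'company': 'ACME', 'content': '销售'}]}}
response = TextResponse(
    url='https://www.qj050.com/api/v1/resume/1',
    body=json.dumps(sample).encode('utf-8'),
    encoding='utf-8',
)
item = next(QJ050ComSpider().parse_detail(response))
assert item['resume_id'] == 1 and item['source_id'] == 4
assert item['work_1_experience'] is not None and item['work_2_experience'] is None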

View File

@@ -51,12 +51,19 @@ class DB:
def insert_resume(cls, data: dict):
cls.init()  # make sure the DB connection is initialized
# Keep only values with basic data types
safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
if 'resume_id' not in safe_data or 'source_id' not in safe_data:
# Both resume_id and source_id are required
return
table = 'resumes_resumebasic'
keys = ', '.join(safe_data.keys())
placeholders = ', '.join(['%s'] * len(safe_data))
update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
# Note: exclude source_id and resume_id from the update clause
update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')])
sql = f"""
INSERT INTO {table} ({keys}) VALUES ({placeholders})
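
The SQL string is cut off by the hunk, but the VALUES(col) syntax in update_clause implies a MySQL-style upsert; for a row carrying only resume_id, source_id and name, the assembled statement would presumably expand to something like the sketch below (illustrative, not the literal code):

# safe_data = {'resume_id': 1, 'source_id': 3, 'name': '张三'}
sql = """
    INSERT INTO resumes_resumebasic (resume_id, source_id, name)
    VALUES (%s, %s, %s)
    ON DUPLICATE KEY UPDATE name = VALUES(name)
"""
params = (1, 3, '张三')  # values are passed separately, never interpolated into the SQL string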

View File

@@ -7,10 +7,11 @@ from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
from TS_resume_spider.spiders.fnrc_vpi import FnrcVipSpider
def main():
process = CrawlerProcess(get_project_settings())
process.crawl(ZunHuaComSpider)
process.crawl(FnrcVipSpider)
process.start()
if __name__ == '__main__':
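
As shown, main() schedules only ZunHuaComSpider and FnrcVipSpider. If QJ050ComSpider is also meant to run from this entry point, the addition would presumably mirror the existing calls; a sketch (module path assumed from the spider name):

from TS_resume_spider.spiders.qj050_com import QJ050ComSpider  # module path assumed

def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.crawl(FnrcVipSpider)
    process.crawl(QJ050ComSpider)  # schedule the second new spider as well
    process.start()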