From f153c6d2505bd0223171b1862b2dd3c26cfc7efd Mon Sep 17 00:00:00 2001 From: Franklin-F Date: Sun, 27 Apr 2025 22:00:23 +0800 Subject: [PATCH] Add FnrcVipSpider and QJ050ComSpider; update pipelines and db logic for new sources --- TS_resume_spider/pipelines.py | 13 +-- TS_resume_spider/settings.py | 106 ++++++++++-------------- TS_resume_spider/spiders/fnrc_vpi.py | 112 ++++++++++++++++++++++++++ TS_resume_spider/spiders/qj050_com.py | 108 +++++++++++++++++++++++++ TS_resume_spider/utils/db.py | 11 ++- debug/Debug_yutian_top.py | 3 +- 6 files changed, 283 insertions(+), 70 deletions(-) create mode 100644 TS_resume_spider/spiders/fnrc_vpi.py create mode 100644 TS_resume_spider/spiders/qj050_com.py diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py index 8e08eb1..ea9b5f5 100644 --- a/TS_resume_spider/pipelines.py +++ b/TS_resume_spider/pipelines.py @@ -59,7 +59,7 @@ class YTSpiderPipeline: return datetime(2019, 12, 12) def process_item(self, item, spider): - if spider.name != 'yutian_top': + if spider.name not in ['yutian_top','fnrc_vip']: return item experience = item.get("experience", []) for j in range(4): @@ -93,15 +93,18 @@ class YTSpiderPipeline: if "update_time" in item: item["update_time"] = self.parse_datetime(item["update_time"]) - - item["source_id"] = 2 - + if spider.name == "yutian_top": + item["source_id"] = 2 + elif spider.name == "fnrc_vip": + item["source_id"] = 3 + else: + item["source_id"] = None return item class YTSavePipeline: def process_item(self, item, spider): - if spider.name not in ['yutian_top' ,'zhrczp_com']: + if spider.name not in ['yutian_top' ,'zhrczp_com', 'fnrc_vip', 'qj050_com']: return item resume_id = item.get("resume_id") if not resume_id: diff --git a/TS_resume_spider/settings.py b/TS_resume_spider/settings.py index 6bab753..0d9e979 100644 --- a/TS_resume_spider/settings.py +++ b/TS_resume_spider/settings.py @@ -1,93 +1,75 @@ -# Scrapy settings for TS_resume_spider project -# -# For simplicity, this file contains only settings considered important or -# commonly used. 
-# You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# Scrapy settings for the TS_resume_spider project

+# Project name; used in the default User-Agent and for internal references
 BOT_NAME = "TS_resume_spider"

+# Module(s) where the spider classes live
 SPIDER_MODULES = ["TS_resume_spider.spiders"]
+# Module where newly generated spiders are created
 NEWSPIDER_MODULE = "TS_resume_spider.spiders"

-# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# Custom User-Agent; Scrapy's default is used unless this is set (e.g. to a browser-like value)
 # USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

-# Obey robots.txt rules
+# Whether to obey robots.txt rules (disabled for these sources)
 ROBOTSTXT_OBEY = False

-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
+# Maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 64

-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
+# Delay (in seconds) between requests to the same website; helps avoid being blocked
+DOWNLOAD_DELAY = 0.1

-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
+# Maximum concurrent requests per domain
+CONCURRENT_REQUESTS_PER_DOMAIN = 64

-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
+# Maximum concurrent requests per IP (overrides the per-domain limit when set)
+# CONCURRENT_REQUESTS_PER_IP = 8

-# Override the default request headers:
+# Cookies are disabled; enable them if login/session state must be carried across requests
+COOKIES_ENABLED = False
+
+# Default request headers; override per request when a source needs something special
 # DEFAULT_REQUEST_HEADERS = {
 #    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
+#    "Accept-Language": "zh-CN,zh;q=0.9",
 # }

-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
-# }
+# AutoThrottle (adjusts the download delay on the fly); currently disabled
+AUTOTHROTTLE_ENABLED = False
+# Initial download delay (seconds)
+AUTOTHROTTLE_START_DELAY = 0
+# Maximum download delay (seconds) to use when latency is high
+AUTOTHROTTLE_MAX_DELAY = 60
+# Average number of requests to send in parallel to each remote server
+AUTOTHROTTLE_TARGET_CONCURRENCY = 10
+# Whether to log every throttling adjustment
+AUTOTHROTTLE_DEBUG = False

-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
-# }
+# Retry requests to handle dropped connections and other network errors
+RETRY_ENABLED = True
+# Number of retries (default: 2); raised to tolerate an unstable network
+RETRY_TIMES = 5
+# HTTP codes to retry: server errors, timeouts and rate limiting
+RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# Item pipelines in use; lower numbers run first
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
 }

-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
+# Feed/output encoding; prevents garbled Chinese characters in exports
+FEED_EXPORT_ENCODING = "utf-8"

-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# Future-proof values for settings whose defaults are deprecated
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+
+# HTTP caching can be enabled later if needed (optional)
 # HTTPCACHE_ENABLED = True
 # HTTPCACHE_EXPIRATION_SECS = 0
 # HTTPCACHE_DIR = "httpcache"
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
-
-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
+# Persist crawl state so interrupted runs can be resumed
+JOBDIR = 'job_info/ts_resume_spider'
diff --git a/TS_resume_spider/spiders/fnrc_vpi.py b/TS_resume_spider/spiders/fnrc_vpi.py
new file mode 100644
index 0000000..b69f81d
--- /dev/null
+++ b/TS_resume_spider/spiders/fnrc_vpi.py
@@ -0,0 +1,112 @@
+import scrapy
+from scrapy.http import JsonRequest
+
+# Avoid running this spider casually: each request asks the API for 1000 resumes, so a full crawl is very slow.
+
+
+class FnrcVipSpider(scrapy.Spider):
+    name = 'fnrc_vip'
+    allowed_domains = ['fnrc.vip']
+
+    def start_requests(self):
+        url = "https://www.fnrc.vip/job/company/v1/resume/page"
+        cookies = {
+            'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
+            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
+            'company_sign': '',
+            'company_nonce': '',
+            'cuid': '',
+        }
+
+        headers = {
+            'accept': 'application/json, text/plain, */*',
+            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
+            'cache-control': 'no-cache',
+            'content-type': 'application/json;charset=UTF-8',
+            'origin': 'https://www.fnrc.vip',
+            'pragma': 'no-cache',
+            'priority': 'u=1, i',
+            'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
+            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+        }
+
+        # COOKIES_ENABLED is False in settings.py, so send the session cookies as an explicit Cookie header.
+        headers['cookie'] = '; '.join(f'{k}={v}' for k, v in cookies.items())
+
+        # One POST per page; the JSON body mirrors the filters used by the site's resume search.
+        for page in range(1, 6):
+            payload = {
+                'step': 1000,
+                'page': page,
+                'education_level': [],
+                'arrival_time': [],
+                'work_time': [],
+                'area_id': [],
+                'keywords': '',
+                'work_status': '',
+                'work_status_show': '求职状态',
+                'category_id': '',
+                'work_type': '',
+                'work_type_show': '是否兼职',
+                'sex': '',
+                'sex_show': '性别',
+                'is_head': '',
+                'is_head_show': '有无照片',
+                'job_id': '',
+                'age': [],
+                'age_show': '年龄',
+                'refresh_time': 0,
+                'site_id': '',
+                'site_id2': '',
+                'province': '',
+                'city': '',
+                'county': '',
+                'provinceArr': [],
+                'cityArr': [],
+                'countyArr': [],
+                'only_job_category': 0,
+            }
+
+            yield JsonRequest(
+                url=url,
+                data=payload,
+                headers=headers,
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        # Each response is a JSON page of resumes; yield the raw records for the pipelines to normalise.
+        for item in response.json().get('data', []):
+            yield item
\ No newline at end of file
diff --git a/TS_resume_spider/spiders/qj050_com.py b/TS_resume_spider/spiders/qj050_com.py
new file mode 100644
index 0000000..0011c99
--- /dev/null
+++ b/TS_resume_spider/spiders/qj050_com.py
@@ -0,0 +1,108 @@
+import json
+import scrapy
+import time
+import urllib.parse
+from datetime import datetime
+from scrapy.http import Response
+
+
+class QJ050ComSpider(scrapy.Spider):
+    name = 'qj050_com'
+    allowed_domains = ['qj050.com']
+    start_urls = ['https://www.qj050.com/api/v1/resumes']
+
+    headers = {
+        'accept': 'application/json, text/plain, */*',
+        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        'authorization': 'Bearer <your token>',
+        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+        'x-platform': '1',
+        'x-site-id': 'undefined',
+    }
+    cookies = {
+        'x-trace-id': '7d60110f6a7a4df595db14e54ee772dd',
+        'has_login_log': 'yes',
+        'HMACCOUNT': '52014CC932A93E9B',
+        'token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQ1NzU3MjM4LCJleHAiOjE3NzcyOTMyMzh9.uU9G81yizRRUYCyJymit4n9vuysXCT-2V9PLdmdohgA',
+        'token.sig': 'Zta83bKMN9mPlsm9ZVnv7PaA7MwJZrLYHYrQK4Ft1rY',
+        'logged': '1',
+    }
+
+    def start_requests(self):
+        # The list endpoint is paginated; request the first four pages with 1000 resumes each.
+        for page in range(1, 5):
+            params = {
+                '_': str(int(time.time() * 1000)),
+                'tab': 'resume',
+                'pageSize': '1000',
+                'pageIndex': str(page),
+                'showStatus': 'true',
+            }
+            query_string = urllib.parse.urlencode(params)
+            url = f"{self.start_urls[0]}?{query_string}"
+            yield scrapy.Request(
+                url=url,
+                method='GET',
+                headers=self.headers,
+                cookies=self.cookies,
+                callback=self.parse
+            )
+
+    def parse(self, response: Response, **kwargs):
+        # The list response only carries ids; fetch the full resume detail for each entry.
+        data = json.loads(response.text)
+        for item in data['data']['items']:
+            resume_id = item['id']
+            detail_url = f"https://www.qj050.com/api/v1/resume/{resume_id}?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
+            yield scrapy.Request(
+                url=detail_url,
+                method='GET',
+                headers=self.headers,
+                cookies=self.cookies,
+                callback=self.parse_detail
+            )
+
+    def parse_detail(self, response):
+        info = json.loads(response.text).get('data', {})
+
+        data = {}
+
+        # Basic fields
+        data['resume_id'] = info.get('id')
+        data['name'] = info.get('name') or None
+        data['age'] = int(info.get('age')) if info.get('age') else None
+        data['birthday'] = info.get('birthday') or None
+        data['work_years'] = info.get('work_exp_value') or None
+        data['highest_education'] = info.get('edu_value') or None
+        data['marital_status'] = info.get('marriage_value') or None
+        data['phone'] = info.get('phone') or None
+        data['intended_position'] = ','.join(
+            [item.get('name') for item in info.get('infoCateforyArrObj', [])]
+        ) if info.get('infoCateforyArrObj') else None
+        data['expected_salary'] = info.get('salaryDesc') or None
+        data['job_property'] = info.get('work_type_value') or None
+        data['job_status'] = info.get('job_instant_value') or None
+        data['job_location'] = info.get('job_region_value') or None
+
+        # Update time: convert last_edit_time to a datetime when it is present
+        if info.get('last_edit_time'):
+            try:
+                data['update_time'] = datetime.strptime(info['last_edit_time'], "%Y-%m-%d %H:%M:%S")
+            except Exception:
+                data['update_time'] = None
+        else:
+            data['update_time'] = None
+
+        # Work experience: map up to four entries onto the flat work_N_* columns
+        works = info.get('works', [])
+        for i in range(4):
+            if i < len(works):
+                company = works[i].get('company', '')
+                content = works[i].get('content', '')
+                combined = f"{company}:{content}" if company or content else ''
+                data[f'work_{i + 1}_experience'] = combined or None
+            else:
+                data[f'work_{i + 1}_experience'] = None
+
+            data[f'work_{i + 1}_time'] = None
+            data[f'work_{i + 1}_description'] = None
+
+        data['source_id'] = 4
+        data['crawl_keywords'] = ''
+        yield data
diff --git a/TS_resume_spider/utils/db.py b/TS_resume_spider/utils/db.py
index 834c6f9..c84b7b6 100644
--- a/TS_resume_spider/utils/db.py
+++ b/TS_resume_spider/utils/db.py
@@ -51,16 +51,23 @@ class DB:
     def insert_resume(cls, data: dict):
         cls.init()  # make sure the connection has been initialised
 
+        # Keep only values with basic data types that the DB driver can handle
         safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
 
+        if 'resume_id' not in safe_data or 'source_id' not in safe_data:
+            # Both resume_id and source_id are required
+            return
+
         table = 'resumes_resumebasic'
         keys = ', '.join(safe_data.keys())
         placeholders = ', '.join(['%s'] * len(safe_data))
-        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
+
+        # Note: exclude resume_id and source_id from the ON DUPLICATE KEY UPDATE clause
+        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')])
 
         sql = f"""
             INSERT INTO {table} ({keys})
             VALUES ({placeholders})
             ON DUPLICATE KEY UPDATE {update_clause}
-        """
+            """
         cls._client.execute(sql, list(safe_data.values()))
diff --git a/debug/Debug_yutian_top.py b/debug/Debug_yutian_top.py
index a190a37..65a4fa3 100644
--- a/debug/Debug_yutian_top.py
+++ b/debug/Debug_yutian_top.py
@@ -7,10 +7,11 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 from TS_resume_spider.spiders.yutian_top import YutianTopSpider
 from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
+from TS_resume_spider.spiders.fnrc_vpi import FnrcVipSpider
 
 def main():
     process = CrawlerProcess(get_project_settings())
-    process.crawl(ZunHuaComSpider)
+    process.crawl(FnrcVipSpider)
     process.start()
 
 if __name__ == '__main__':
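
For context on the db.py change: DB.insert_resume now skips items that lack resume_id or source_id and upserts the rest, keeping those two key columns out of the update list. Below is a minimal sketch of the statement shape it produces, using a made-up three-column item (the real column set is whatever fields the spiders yield):

# Illustration only; 'name' stands in for the real resume columns.
safe_data = {'resume_id': 123, 'source_id': 4, 'name': 'example'}
keys = ', '.join(safe_data.keys())                    # resume_id, source_id, name
placeholders = ', '.join(['%s'] * len(safe_data))     # %s, %s, %s
update_clause = ', '.join(
    f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')
)                                                     # name = VALUES(name)
sql = (
    f"INSERT INTO resumes_resumebasic ({keys}) "
    f"VALUES ({placeholders}) "
    f"ON DUPLICATE KEY UPDATE {update_clause}"
)
# -> INSERT INTO resumes_resumebasic (resume_id, source_id, name) VALUES (%s, %s, %s)
#    ON DUPLICATE KEY UPDATE name = VALUES(name)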
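
The debug runner above is repointed at FnrcVipSpider. A similar throwaway runner for the new QJ050ComSpider might look like the sketch below; the debug/Debug_qj050_com.py file name is hypothetical, and it assumes the script is started from the project root so that get_project_settings() can find scrapy.cfg and the settings in this patch:

# debug/Debug_qj050_com.py (hypothetical companion to debug/Debug_yutian_top.py)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from TS_resume_spider.spiders.qj050_com import QJ050ComSpider


def main():
    # Reuse the project settings (pipelines, retry policy, JOBDIR) and crawl only qj050_com.
    process = CrawlerProcess(get_project_settings())
    process.crawl(QJ050ComSpider)
    process.start()


if __name__ == '__main__':
    main()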