Add FnrcVipSpider and QJ050ComSpider; update pipelines and db logic for new sources
parent 90217778be
commit f153c6d250
TS_resume_spider/pipelines.py
@@ -59,7 +59,7 @@ class YTSpiderPipeline:
         return datetime(2019, 12, 12)

     def process_item(self, item, spider):
-        if spider.name != 'yutian_top':
+        if spider.name not in ['yutian_top', 'fnrc_vip']:
             return item
         experience = item.get("experience", [])
         for j in range(4):
@@ -93,15 +93,18 @@ class YTSpiderPipeline:

         if "update_time" in item:
             item["update_time"] = self.parse_datetime(item["update_time"])
+        if spider.name == "yutian_top":
             item["source_id"] = 2
+        elif spider.name == "fnrc_vip":
+            item["source_id"] = 3
+        else:
+            item["source_id"] = None
         return item


 class YTSavePipeline:
     def process_item(self, item, spider):
-        if spider.name not in ['yutian_top', 'zhrczp_com']:
+        if spider.name not in ['yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com']:
             return item
         resume_id = item.get("resume_id")
         if not resume_id:
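Note on the change above: the source_id constants are now spread over an if/elif chain here (yutian_top → 2, fnrc_vip → 3), while qj050_com assigns 4 inside its own parse_detail. A table-driven lookup is one possible way to keep the mapping in a single place as more sources arrive; the snippet below is only a sketch that mirrors the values visible in this commit, and SOURCE_IDS / assign_source_id are illustrative names, not part of the code.

# Sketch only: central spider-name -> source_id mapping (names are hypothetical)
SOURCE_IDS = {
    'yutian_top': 2,
    'fnrc_vip': 3,
    'qj050_com': 4,   # currently set inside QJ050ComSpider.parse_detail instead
}

def assign_source_id(item, spider_name):
    # unknown spiders fall back to None, matching the else branch above
    item["source_id"] = SOURCE_IDS.get(spider_name)
    return item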
TS_resume_spider/settings.py
@@ -1,93 +1,75 @@
-# Scrapy settings for TS_resume_spider project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# Configuration for the TS_resume_spider Scrapy project

+# Project name; used by default in the User-Agent and internally
 BOT_NAME = "TS_resume_spider"

+# Module path where the spider classes live
 SPIDER_MODULES = ["TS_resume_spider.spiders"]
+# Where newly generated spiders are created by default
 NEWSPIDER_MODULE = "TS_resume_spider.spiders"

-# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# Custom User-Agent; Scrapy's default is used unless replaced with a browser-like one
 # USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

-# Obey robots.txt rules
+# Whether to obey robots.txt (False recommended here)
 ROBOTSTXT_OBEY = False

-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
+# Maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 64  # overall concurrency; lower it to reduce server load and avoid dropped connections

-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
+# Delay between requests to the same site (seconds); helps avoid being blocked
+DOWNLOAD_DELAY = 0.1

-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
+# Maximum concurrent requests per domain
+CONCURRENT_REQUESTS_PER_DOMAIN = 64

-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
+# Per-IP concurrency limit (left at the default, i.e. disabled)
+# CONCURRENT_REQUESTS_PER_IP = 8

-# Override the default request headers:
+# Whether cookies are enabled; turn on if session state must be tracked
+COOKIES_ENABLED = False

+# Default request headers; set them per request when something special is needed
 # DEFAULT_REQUEST_HEADERS = {
 #    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
+#    "Accept-Language": "zh-CN,zh;q=0.9",
 # }

-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
-# }
+# AutoThrottle (adjusts the download delay on the fly to help avoid blocking); currently disabled
+AUTOTHROTTLE_ENABLED = False
+# Initial download delay (seconds)
+AUTOTHROTTLE_START_DELAY = 0
+# Maximum download delay (seconds), useful for high-latency servers
+AUTOTHROTTLE_MAX_DELAY = 60
+# Target average concurrency; 1.0 means one request at a time
+AUTOTHROTTLE_TARGET_CONCURRENCY = 10
+# Whether to log every throttle adjustment
+AUTOTHROTTLE_DEBUG = False

-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
-# }
+# Enable request retries to handle connection failures and network errors
+RETRY_ENABLED = True
+# Retry count (default 2); raised to tolerate an unstable network
+RETRY_TIMES = 5
+# Retry on these network/server errors and rate-limit responses
+RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# Item pipelines in use; lower numbers run earlier
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
 }

-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
+# Feed export encoding; prevents garbled Chinese output
+FEED_EXPORT_ENCODING = "utf-8"

-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# Future-proof request fingerprinting and asyncio reactor
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

+# HTTP caching can be added later if needed (not required)
 # HTTPCACHE_ENABLED = True
 # HTTPCACHE_EXPIRATION_SECS = 0
 # HTTPCACHE_DIR = "httpcache"
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+JOBDIR = 'job_info/ts_resume_spider'

-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
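These settings apply project-wide. If one of the new sources needs gentler pacing than CONCURRENT_REQUESTS = 64 with DOWNLOAD_DELAY = 0.1, Scrapy's standard custom_settings attribute can override values per spider; the spider name and numbers below are illustrative only, not part of this commit.

import scrapy

class SlowSourceSpider(scrapy.Spider):
    name = 'slow_source'  # hypothetical spider
    # per-spider overrides merged on top of the project settings above
    custom_settings = {
        'DOWNLOAD_DELAY': 1.0,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }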
TS_resume_spider/spiders/fnrc_vpi.py — new file, 112 lines
@@ -0,0 +1,112 @@
import requests
import scrapy

# Avoid running this spider if possible; it is very slow, because every page is
# fetched serially with a blocking requests.Session before anything is yielded.


class FnrcVipSpider(scrapy.Spider):
    name = 'fnrc_vip'

    def start_requests(self):
        url = "https://www.fnrc.vip/job/company/v1/resume/page"
        cookies = {
            'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
        }

        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.fnrc.vip',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }

        session = requests.Session()
        session.headers.update(headers)
        session.cookies.update(cookies)

        for page in range(1, 6):
            # search payload; only the page number changes between requests
            payload = {
                'step': 1000,
                'page': page,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }

            response = session.post(url, json=payload)
            data = response.json()

            for item in data.get('data', []):
                yield item
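FnrcVipSpider fetches all five pages with a blocking requests.Session inside start_requests, so the concurrency, retry, and throttling settings above never apply to these requests, which is why the file warns that it is slow. A non-blocking variant could hand the POSTs to Scrapy via JsonRequest instead; the sketch below is only an illustration, assuming the headers/cookies/payload dictionaries shown above are moved to class attributes named headers, cookies, and base_payload, and parse_page is a hypothetical callback.

import scrapy
from scrapy.http import JsonRequest

class FnrcVipAsyncSpider(scrapy.Spider):
    # Hypothetical non-blocking variant; headers/cookies/base_payload are assumed
    # to hold the same dictionaries defined in fnrc_vpi.py above.
    name = 'fnrc_vip_async'
    headers: dict = {}
    cookies: dict = {}
    base_payload: dict = {}

    def start_requests(self):
        for page in range(1, 6):
            yield JsonRequest(
                url="https://www.fnrc.vip/job/company/v1/resume/page",
                data=dict(self.base_payload, page=page),  # JSON body, POST by default
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_page,
            )

    def parse_page(self, response):
        # same item shape as before: each element of data[] is yielded as-is
        for item in response.json().get('data', []):
            yield item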
TS_resume_spider/spiders/qj050_com.py — new file, 108 lines
@@ -0,0 +1,108 @@
import json
import time
import urllib.parse
from datetime import datetime

import scrapy
from scrapy.http import Response


class QJ050ComSpider(scrapy.Spider):
    name = 'qj050_com'
    allowed_domains = ['qj050.com']
    start_urls = ['https://www.qj050.com/api/v1/resumes']

    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'authorization': 'Bearer <your token>',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'x-platform': '1',
        'x-site-id': 'undefined',
    }
    cookies = {
        'x-trace-id': '7d60110f6a7a4df595db14e54ee772dd',
        'has_login_log': 'yes',
        'HMACCOUNT': '52014CC932A93E9B',
        'token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQ1NzU3MjM4LCJleHAiOjE3NzcyOTMyMzh9.uU9G81yizRRUYCyJymit4n9vuysXCT-2V9PLdmdohgA',
        'token.sig': 'Zta83bKMN9mPlsm9ZVnv7PaA7MwJZrLYHYrQK4Ft1rY',
        'logged': '1',
    }

    def start_requests(self):
        for page in range(1, 5):
            params = {
                '_': str(int(time.time() * 1000)),
                'tab': 'resume',
                'pageSize': '1000',
                'pageIndex': str(page),
                'showStatus': 'true',
            }
            query_string = urllib.parse.urlencode(params)
            url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response: Response, **kwargs):
        data = json.loads(response.text)
        for item in data['data']['items']:
            resume_id = item['id']
            detail_url = f"https://www.qj050.com/api/v1/resume/{resume_id}?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
            yield scrapy.Request(
                url=detail_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_detail,
            )

    def parse_detail(self, response):
        info = json.loads(response.text).get('data', {})

        data = {}

        # Basic fields
        data['resume_id'] = info.get('id')
        data['name'] = info.get('name') or None
        data['age'] = int(info.get('age')) if info.get('age') else None
        data['birthday'] = info.get('birthday') or None
        data['work_years'] = info.get('work_exp_value') or None
        data['highest_education'] = info.get('edu_value') or None
        data['marital_status'] = info.get('marriage_value') or None
        data['phone'] = info.get('phone') or None
        data['intended_position'] = ','.join(
            [item.get('name') for item in info.get('infoCateforyArrObj', [])]) if info.get(
            'infoCateforyArrObj') else None
        data['expected_salary'] = info.get('salaryDesc') or None
        data['job_property'] = info.get('work_type_value') or None
        data['job_status'] = info.get('job_instant_value') or None
        data['job_location'] = info.get('job_region_value') or None

        # Update time (convert last_edit_time to datetime when present)
        if 'last_edit_time' in info and info.get('last_edit_time'):
            try:
                data['update_time'] = datetime.strptime(info['last_edit_time'], "%Y-%m-%d %H:%M:%S")
            except Exception:
                data['update_time'] = None
        else:
            data['update_time'] = None

        # Work experience: keep at most four entries, padding missing ones with None
        works = info.get('works', [])
        for i in range(4):
            if i < len(works):
                company = works[i].get('company', '')
                content = works[i].get('content', '')
                combined = f"{company}:{content}" if company or content else ''
                data[f'work_{i + 1}_experience'] = combined or None
            else:
                data[f'work_{i + 1}_experience'] = None

            data[f'work_{i + 1}_time'] = None
            data[f'work_{i + 1}_description'] = None

        data['source_id'] = 4
        data['crawl_keywords'] = ''
        yield data
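QJ050ComSpider always requests four list pages of 1,000 resumes each and then one detail request per hit. If the account has fewer visible resumes, later pages simply come back empty; a small guard in parse can skip the detail fan-out for those pages. A sketch, assuming the same imports and response shape as in the file above:

    def parse(self, response, **kwargs):
        items = json.loads(response.text)['data']['items']
        if not items:
            return  # past the last page, nothing to follow
        for item in items:
            detail_url = (
                f"https://www.qj050.com/api/v1/resume/{item['id']}"
                f"?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
            )
            yield scrapy.Request(detail_url, headers=self.headers,
                                 cookies=self.cookies, callback=self.parse_detail)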
@@ -51,16 +51,23 @@ class DB:
     def insert_resume(cls, data: dict):
         cls.init()  # make sure the connection is initialized

+        # keep only basic data types
         safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}

+        if 'resume_id' not in safe_data or 'source_id' not in safe_data:
+            # both source_id and resume_id are required
+            return
+
         table = 'resumes_resumebasic'
         keys = ', '.join(safe_data.keys())
         placeholders = ', '.join(['%s'] * len(safe_data))
-        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
+        # note: exclude source_id and resume_id from the ON DUPLICATE KEY UPDATE clause
+        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')])

         sql = f"""
             INSERT INTO {table} ({keys}) VALUES ({placeholders})
             ON DUPLICATE KEY UPDATE {update_clause}
         """

         cls._client.execute(sql, list(safe_data.values()))
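For a concrete picture of what insert_resume now builds: given a filtered dict with a resume_id, a source_id, and one other column, the generated statement upserts on the duplicate key while leaving resume_id and source_id untouched. The values below are purely illustrative.

# Illustrative walk-through of insert_resume (values are made up)
data = {'resume_id': 1001, 'source_id': 3, 'name': 'Zhang San'}
# keys          -> "resume_id, source_id, name"
# placeholders  -> "%s, %s, %s"
# update_clause -> "name = VALUES(name)"   (resume_id and source_id excluded)
# resulting SQL:
#   INSERT INTO resumes_resumebasic (resume_id, source_id, name) VALUES (%s, %s, %s)
#   ON DUPLICATE KEY UPDATE name = VALUES(name)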
@@ -7,10 +7,11 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 from TS_resume_spider.spiders.yutian_top import YutianTopSpider
 from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
+from TS_resume_spider.spiders.fnrc_vpi import FnrcVipSpider


 def main():
     process = CrawlerProcess(get_project_settings())
-    process.crawl(ZunHuaComSpider)
+    process.crawl(FnrcVipSpider)
     process.start()


 if __name__ == '__main__':
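main() currently runs only FnrcVipSpider. CrawlerProcess accepts several crawl() calls before start(), so all sources could be queued in one run if desired; a sketch, not part of this commit (QJ050ComSpider would additionally need an import from TS_resume_spider.spiders.qj050_com):

def main():
    process = CrawlerProcess(get_project_settings())
    # queue every source, then start the reactor once
    process.crawl(YutianTopSpider)
    process.crawl(ZunHuaComSpider)
    process.crawl(FnrcVipSpider)
    process.start()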