Add FnrcVipSpider and QJ050ComSpider; update pipelines and db logic for new sources

晓丰 2025-04-27 22:00:23 +08:00
parent 90217778be
commit f153c6d250
6 changed files with 283 additions and 70 deletions

View File

@@ -59,7 +59,7 @@ class YTSpiderPipeline:
return datetime(2019, 12, 12)
def process_item(self, item, spider):
if spider.name != 'yutian_top':
if spider.name not in ['yutian_top','fnrc_vip']:
return item
experience = item.get("experience", [])
for j in range(4):
@@ -93,15 +93,18 @@ class YTSpiderPipeline:
if "update_time" in item:
item["update_time"] = self.parse_datetime(item["update_time"])
if spider.name == "yutian_top":
item["source_id"] = 2
elif spider.name == "fnrc_vip":
item["source_id"] = 3
else:
item["source_id"] = None
return item
class YTSavePipeline:
def process_item(self, item, spider):
if spider.name not in ['yutian_top' ,'zhrczp_com']:
if spider.name not in ['yutian_top' ,'zhrczp_com', 'fnrc_vip', 'qj050_com']:
return item
resume_id = item.get("resume_id")
if not resume_id:
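
The hunk ends before the save step. Given the DB.insert_resume helper updated later in this commit, the rest of YTSavePipeline.process_item presumably hands the item off roughly as in this sketch (the import path and exact body are assumptions, not part of the diff):

from TS_resume_spider.utils.db import DB  # import path assumed

class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com']:
            return item
        resume_id = item.get("resume_id")
        if not resume_id:
            # skip items without a primary identifier rather than writing partial rows
            return item
        DB.insert_resume(dict(item))  # upsert into resumes_resumebasic
        return item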

View File

@@ -1,93 +1,75 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Configuration file for the TS_resume_spider Scrapy project
# Project name; used by default in the User-Agent and for internal references
BOT_NAME = "TS_resume_spider"
# Module path where the spider classes live
SPIDER_MODULES = ["TS_resume_spider.spiders"]
# Default location for newly generated spiders
NEWSPIDER_MODULE = "TS_resume_spider.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Custom User-Agent (Scrapy's default is used otherwise; can be set to mimic a browser)
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"
# Obey robots.txt rules
# Whether to obey robots.txt rules (False recommended here)
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Maximum number of concurrent requests Scrapy performs (default: 16)
CONCURRENT_REQUESTS = 64  # concurrency level; lower it to reduce server load and avoid dropped connections
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Delay between requests to the same website (seconds); helps avoid being blocked
DOWNLOAD_DELAY = 0.1
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Maximum concurrent requests per domain (upper limit)
CONCURRENT_REQUESTS_PER_DOMAIN = 64
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Per-IP concurrency limit (disabled by default)
# CONCURRENT_REQUESTS_PER_IP = 8
# Override the default request headers:
# Whether to disable cookies (enable them if session state needs to be kept)
COOKIES_ENABLED = False
# Default request headers; override per request when special headers are needed
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# "Accept-Language": "zh-CN,zh;q=0.9",
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }
# Enable AutoThrottle (adjusts the download delay on the fly); helps avoid being blocked
AUTOTHROTTLE_ENABLED = False
# Initial download delay (seconds)
AUTOTHROTTLE_START_DELAY = 0
# Maximum download delay (seconds); useful against high-latency servers
AUTOTHROTTLE_MAX_DELAY = 60
# Target average concurrency (1.0 means one request at a time)
AUTOTHROTTLE_TARGET_CONCURRENCY = 10
# Whether to log every throttling adjustment
AUTOTHROTTLE_DEBUG = False
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }
# Enable request retries; handles connection failures and other network errors
RETRY_ENABLED = True
# Retry count (default is 2); raise it to tolerate an unstable network
RETRY_TIMES = 5
# Retry on these network/server errors and rate-limit responses
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines to use; lower numbers run first
ITEM_PIPELINES = {
'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
'TS_resume_spider.pipelines.YTSavePipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Output file encoding; prevents garbled Chinese characters
FEED_EXPORT_ENCODING = "utf-8"
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# Use the future-proof request fingerprinting implementation
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
# HTTP caching can be enabled later if needed; not required
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
JOBDIR = 'job_info/ts_resume_spider'
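
These values apply project-wide; JOBDIR additionally persists the scheduler state on disk, so an interrupted crawl resumes when restarted with the same directory. A spider that needs gentler pacing than these defaults can override them locally through Scrapy's standard custom_settings hook; a minimal sketch (spider name and values are illustrative, not part of this commit):

import scrapy

class SlowSourceSpider(scrapy.Spider):
    # Illustrative only; not part of this commit.
    name = "slow_source"

    # Per-spider overrides take precedence over settings.py
    custom_settings = {
        "DOWNLOAD_DELAY": 1.0,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 8,
        "AUTOTHROTTLE_ENABLED": True,
    }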

View File

@@ -0,0 +1,112 @@
import requests
import scrapy
import json
# Try not to run this spider if you can help it: the blocking requests calls make it very slow.
class FnrcVipSpider(scrapy.Spider):
name = 'fnrc_vip'
def start_requests(self):
url = "https://www.fnrc.vip/job/company/v1/resume/page"
cookies = {
'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
'company_sign': '',
'company_nonce': '',
'cuid': '',
}
headers = {
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'cache-control': 'no-cache',
'content-type': 'application/json;charset=UTF-8',
'origin': 'https://www.fnrc.vip',
'pragma': 'no-cache',
'priority': 'u=1, i',
'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
}
json_data = {
'step': 1000,
'page': 1,
'education_level': [],
'arrival_time': [],
'work_time': [],
'area_id': [],
'keywords': '',
'work_status': '',
'work_status_show': '求职状态',
'category_id': '',
'work_type': '',
'work_type_show': '是否兼职',
'sex': '',
'sex_show': '性别',
'is_head': '',
'is_head_show': '有无照片',
'job_id': '',
'age': [],
'age_show': '年龄',
'refresh_time': 0,
'site_id': '',
'site_id2': '',
'province': '',
'city': '',
'county': '',
'provinceArr': [],
'cityArr': [],
'countyArr': [],
'only_job_category': 0,
}
session = requests.Session()
session.headers.update(headers)
session.cookies.update(cookies)
for page in range(1, 6):
payload = dict(json_data, page=page)  # reuse the base query above, changing only the page number
response = session.post(url, json=payload)
data = response.json()
for item in data.get('data', []):
yield item
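
The requests.Session above blocks inside start_requests and bypasses Scrapy's downloader, which is why this spider is slow and why settings such as CONCURRENT_REQUESTS and RETRY_TIMES never apply to it. A non-blocking variant could route the same POSTs through Scrapy with JsonRequest; a minimal sketch (the trimmed payload stands in for the full payload, headers and cookies shown above):

import scrapy
from scrapy.http import JsonRequest

class FnrcVipAsyncSpider(scrapy.Spider):
    # Sketch of a non-blocking variant; not part of the commit.
    name = 'fnrc_vip_async'

    def start_requests(self):
        url = "https://www.fnrc.vip/job/company/v1/resume/page"
        base_payload = {'step': 1000, 'page': 1}  # reuse the full payload/headers/cookies from above
        for page in range(1, 6):
            yield JsonRequest(
                url,
                data={**base_payload, 'page': page},  # JSON-encoded body; method defaults to POST
                callback=self.parse,
            )

    def parse(self, response):
        # response.json() parses the JSON body (Scrapy >= 2.2)
        for item in response.json().get('data', []):
            yield item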

View File

@@ -0,0 +1,108 @@
import json
import scrapy
import time
import urllib.parse
from datetime import datetime
from scrapy.http import Response
class QJ050ComSpider(scrapy.Spider):
name = 'qj050_com'
allowed_domains = ['qj050.com']
start_urls = ['https://www.qj050.com/api/v1/resumes']
headers = {
'accept': 'application/json, text/plain, */*',
'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
'authorization': 'Bearer <your token>',  # placeholder: replace with a real bearer token
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
'x-platform': '1',
'x-site-id': 'undefined',
}
cookies = {
'x-trace-id': '7d60110f6a7a4df595db14e54ee772dd',
'has_login_log': 'yes',
'HMACCOUNT': '52014CC932A93E9B',
'token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQ1NzU3MjM4LCJleHAiOjE3NzcyOTMyMzh9.uU9G81yizRRUYCyJymit4n9vuysXCT-2V9PLdmdohgA',
'token.sig': 'Zta83bKMN9mPlsm9ZVnv7PaA7MwJZrLYHYrQK4Ft1rY',
'logged': '1',
}
def start_requests(self):
for page in range(1, 5):
params = {
'_': str(int(time.time() * 1000)),
'tab': 'resume',
'pageSize': '1000',
'pageIndex': str(page),
'showStatus': 'true',
}
query_string = urllib.parse.urlencode(params)
url = f"{self.start_urls[0]}?{query_string}"
yield scrapy.Request(
url=url,
method='GET',
headers=self.headers,
cookies=self.cookies,
callback=self.parse
)
def parse(self, response: Response, **kwargs):
data = json.loads(response.text)
for item in data['data']['items']:
resume_id = item['id']
detail_url = f"https://www.qj050.com/api/v1/resume/{resume_id}?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
yield scrapy.Request(
url=detail_url,
method='GET',
headers=self.headers,
cookies=self.cookies,
callback=self.parse_detail
)
def parse_detail(self, response):
info = json.loads(response.text).get('data', {})
data = {}
# Basic fields
data['resume_id'] = info.get('id')
data['name'] = info.get('name') or None
data['age'] = int(info.get('age')) if info.get('age') else None
data['birthday'] = info.get('birthday') or None
data['work_years'] = info.get('work_exp_value') or None
data['highest_education'] = info.get('edu_value') or None
data['marital_status'] = info.get('marriage_value') or None
data['phone'] = info.get('phone') or None
data['intended_position'] = ','.join(
[item.get('name') for item in info.get('infoCateforyArrObj', [])]) if info.get(
'infoCateforyArrObj') else None
data['expected_salary'] = info.get('salaryDesc') or None
data['job_property'] = info.get('work_type_value') or None
data['job_status'] = info.get('job_instant_value') or None
data['job_location'] = info.get('job_region_value') or None
# Update time (convert last_edit_time to a datetime when present)
if 'last_edit_time' in info and info.get('last_edit_time'):
try:
data['update_time'] = datetime.strptime(info['last_edit_time'], "%Y-%m-%d %H:%M:%S")
except Exception:
data['update_time'] = None
else:
data['update_time'] = None
# Work experience: map up to four entries onto the work_N_* columns
works = info.get('works', [])
for i in range(4):
if i < len(works):
company = works[i].get('company', '')
content = works[i].get('content', '')
combined = f"{company}:{content}" if company or content else ''
data[f'work_{i + 1}_experience'] = combined or None
else:
data[f'work_{i + 1}_experience'] = None
data[f'work_{i + 1}_time'] = None
data[f'work_{i + 1}_description'] = None
data['source_id'] = 4
data['crawl_keywords'] = ''
yield data
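
Since parse_detail only reads the JSON body, the field mapping can be exercised offline with a fabricated response; a small sketch (import path and sample values are made up):

import json
from scrapy.http import TextResponse
from TS_resume_spider.spiders.qj050_com import QJ050ComSpider  # module path assumed

sample = {'data': {'id': 1, 'name': '张三', 'age': '30', 'edu_value': '本科',
                   'works': [{'company': 'ACME', 'content': '销售'}]}}
response = TextResponse(
    url='https://www.qj050.com/api/v1/resume/1',
    body=json.dumps(sample).encode('utf-8'),
    encoding='utf-8',
)
item = next(QJ050ComSpider().parse_detail(response))
assert item['resume_id'] == 1 and item['source_id'] == 4
assert item['work_1_experience'] is not None and item['work_2_experience'] is None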

View File

@@ -51,12 +51,19 @@ class DB:
def insert_resume(cls, data: dict):
cls.init()  # make sure the DB connection is initialized
# Keep only values with basic data types
safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
if 'resume_id' not in safe_data or 'source_id' not in safe_data:
# Both resume_id and source_id are required
return
table = 'resumes_resumebasic'
keys = ', '.join(safe_data.keys())
placeholders = ', '.join(['%s'] * len(safe_data))
update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
# Note: exclude source_id and resume_id from the update clause
update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')])
sql = f"""
INSERT INTO {table} ({keys}) VALUES ({placeholders})
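
The SQL string is cut off by the hunk, but the VALUES(col) syntax in update_clause implies a MySQL-style upsert; for a row carrying only resume_id, source_id and name, the assembled statement would presumably expand to something like the sketch below (illustrative, not the literal code):

# safe_data = {'resume_id': 1, 'source_id': 3, 'name': '张三'}
sql = """
    INSERT INTO resumes_resumebasic (resume_id, source_id, name)
    VALUES (%s, %s, %s)
    ON DUPLICATE KEY UPDATE name = VALUES(name)
"""
params = (1, 3, '张三')  # values are passed separately, never interpolated into the SQL string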

View File

@@ -7,10 +7,11 @@ from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
from TS_resume_spider.spiders.fnrc_vpi import FnrcVipSpider
def main():
process = CrawlerProcess(get_project_settings())
process.crawl(ZunHuaComSpider)
process.crawl(FnrcVipSpider)
process.start()
if __name__ == '__main__':
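
As shown, main() schedules only ZunHuaComSpider and FnrcVipSpider. If QJ050ComSpider is also meant to run from this entry point, the addition would presumably mirror the existing calls; a sketch (module path assumed from the spider name):

from TS_resume_spider.spiders.qj050_com import QJ050ComSpider  # module path assumed

def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.crawl(FnrcVipSpider)
    process.crawl(QJ050ComSpider)  # schedule the second new spider as well
    process.start()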