晓丰 2025-04-20 01:49:43 +08:00
commit 90217778be
26 changed files with 655 additions and 0 deletions

6 binary files not shown.

12
TS_resume_spider/items.py Normal file
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class TsResumeSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

103
TS_resume_spider/middlewares.py Normal file
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class TsResumeSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TsResumeSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

116
TS_resume_spider/pipelines.py Normal file
@@ -0,0 +1,116 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class TsResumeSpiderPipeline:
    def process_item(self, item, spider):
        return item


class YTSpiderPipeline:
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        try:
            return int(re.search(r'\d+', str(s)).group())
        except (AttributeError, TypeError, ValueError):
            return None

    def parse_datetime(self, s):
        try:
            return datetime.fromisoformat(s)
        except (TypeError, ValueError):
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        if spider.name != 'yutian_top':
            return item

        # Flatten up to four work-experience entries into fixed columns.
        experience = item.get("experience", [])
        for j in range(4):
            if j < len(experience):
                company = experience[j].get("company", "")
                time_line = experience[j].get("time_line", "")
                content = experience[j].get("content", "")
            else:
                company = ''
                time_line = ''
                content = ''
            item[f"work_{j + 1}_experience"] = company
            item[f"work_{j + 1}_time"] = time_line
            item[f"work_{j + 1}_description"] = content

        # Keep only known fields, renamed to the database column names.
        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        if "age" in item:
            item["age"] = self.extract_int(item["age"])
        if "height" in item:
            item["height"] = self.extract_int(item["height"])
        if "weight" in item:
            item["weight"] = self.extract_int(item["weight"])
        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        item["source_id"] = 2
        return item


class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com']:
            return item

        resume_id = item.get("resume_id")
        if not resume_id:
            raise DropItem("⚠️ resume_id missing, item dropped")

        try:
            DB.insert_resume(item)
        except Exception as e:
            spider.logger.warning(f"❌ insert failed: resume_id={resume_id}, error={e}")

        return item
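For reference, a minimal sketch (not part of the commit) of how the two pipelines interact for a yutian_top item; the stand-in spider object and the sample field values are invented, only the key names come from reverse_field_map:

from types import SimpleNamespace

fake_spider = SimpleNamespace(name="yutian_top")
raw = {
    "resume_id": 123,
    "user_name": "张三",
    "user_age": "25岁",
    "last_edit_time": "2025-04-18T10:30:00",
    "experience": [{"company": "A公司", "time_line": "2020-2023", "content": "装配"}],
}
mapped = YTSpiderPipeline().process_item(raw, fake_spider)
# mapped == {"resume_id": 123, "name": "张三", "age": 25,
#            "update_time": datetime(2025, 4, 18, 10, 30),
#            "work_1_experience": "A公司", "work_1_time": "2020-2023",
#            "work_1_description": "装配", "work_2_experience": "", ...,
#            "source_id": 2}
# YTSavePipeline.process_item(mapped, fake_spider) would then upsert it via DB.insert_resume.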

93
TS_resume_spider/settings.py Normal file
@@ -0,0 +1,93 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "TS_resume_spider"
SPIDER_MODULES = ["TS_resume_spider.spiders"]
NEWSPIDER_MODULE = "TS_resume_spider.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

4
TS_resume_spider/spiders/__init__.py Normal file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

85
TS_resume_spider/spiders/yutian_top.py Normal file
@@ -0,0 +1,85 @@
import scrapy
import json
class YutianTopSpider(scrapy.Spider):
    name = 'yutian_top'
    allowed_domains = ['yutian.top']
    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']

    def start_requests(self):
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.yutian.top',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.yutian.top/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }
        cookies = {
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
            'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
        }
        for i in range(1, 6):
            payload = {
                'step': 1000,
                'page': i,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }
            yield scrapy.Request(
                url=self.start_urls[0],
                method='POST',
                headers=headers,
                cookies=cookies,
                body=json.dumps(payload),
                callback=self.parse,
            )

    def parse(self, response):
        self.logger.info("resume page status: %s", response.status)
        data = json.loads(response.text)
        for item in data.get('data', []):
            yield item
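parse() relays each entry of the JSON data list downstream without validation, so the response shape is only implied; judging from the keys YTSpiderPipeline maps, each entry presumably looks roughly like this (an assumption, not captured from the live API):

# Assumed shape of one entry in data["data"] (illustrative only):
# {
#     "resume_id": 123,
#     "user_name": "...",
#     "sex_show": "...",
#     "user_age": "25岁",
#     "area_show": "...",
#     "education_level_msg": "...",
#     "expect_job": "...",
#     "last_edit_time": "2025-04-18T10:30:00",
#     "marry_status_show": "...",
#     "residence": "...",
#     "phone_encrypt": "...",
#     "work_type_show": "...",
#     "work_status_show": "...",
#     "experience": [{"company": "...", "time_line": "...", "content": "..."}],
# }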

148
TS_resume_spider/spiders/zhrczp_com.py Normal file
@@ -0,0 +1,148 @@
import re
import urllib.parse
from typing import Iterable

import scrapy
from scrapy import Request
class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com'
    allowed_domains = ['zhrczp.com']
    start_urls = ['https://www.zhrczp.com/member/index.php']

    cookies = {
        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
        'HMACCOUNT': 'B05D7338A384928F',
        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
        'uid': '60531',
        'shell': '9246a8c91784a3981081a37dd4bdcef9',
        'usertype': '2',
        'userdid': '0',
        'amtype': '0',
        'jobrefresh': '1',
        'gzh': '1',
        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
        'pc_bannerFlag': '1',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[Request]:
        # Walk the resume list pages.
        for page in range(1, 251):
            params = {
                'c': 'resume',
                'page': str(page),
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response):
        self.logger.info("list page status: %s", response.status)
        html = response.text
        # Resume ids are embedded in onclick handlers: com_lookresume_check('<id>','1')
        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
        resume_id_list = list(set(res))
        for item in resume_id_list:
            params = {
                'c': 'hr',
                'act': 'resumeInfo',
                'eid': item,
                'state': 'undefined',
                'from': '',
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse2,
                meta={'resume_id': item},
            )

    def parse2(self, response):
        resume_id = response.meta.get('resume_id')
        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()

        parts = []
        if parts_raw:
            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
        if extra_span:
            parts.append(extra_span.strip())

        current_location = ''
        if parts and '现居' in parts[-1]:
            current_location = parts[-1]
            parts = parts[:-1]

        text = " ".join(parts)
        age = re.search(r'(\d{2})岁', text)
        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
        marital = re.search(r'(已婚|未婚)', text)
        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)

        # Extract page fields via XPath
        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
        # The "更新时间" prefix is already stripped here, so the value is used as-is below.
        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''
        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()

        yield {
            'resume_id': resume_id,
            'name': name.strip() if name else None,
            'age': age.group(1) if age else None,
            'height': height.group(1) if height else None,
            'weight': weight.group(1) if weight else None,
            'work_years': experience.group(1) if experience else None,
            'education': education.group(1) if education else None,
            'marital_status': marital.group(1) if marital else None,
            'ethnicity': ethnic.group(1) if ethnic else None,
            'current_location': current_location.replace('现居', '').strip() if current_location else None,
            'update_time': update_time if update_time else None,
            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
            'intended_position': job_titles.strip() if job_titles else None,
            'industry': industry.strip() if industry else None,
            'expected_salary': salary.strip() if salary else None,
            'available_time': report_time.strip() if report_time else None,
            'job_property': job_type.strip() if job_type else None,
            'job_status': job_status.strip() if job_status else None,
            'job_location': location.strip() if location else None,
            'source_id': 1,
        }

Binary file not shown.

66
TS_resume_spider/utils/db.py Normal file
@@ -0,0 +1,66 @@
from datetime import datetime
import pymysql
class MySQLClient:
    def __init__(self, host, user, password, db, port=3306):
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            db=db,
            port=port,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        self.cursor = self.conn.cursor()

    def execute(self, sql, values=None):
        try:
            self.cursor.execute(sql, values or [])
        except Exception as e:
            print(f"[MySQL] execute failed: {e}")
            self.conn.rollback()
            raise  # re-raise so callers (e.g. YTSavePipeline) can log the failing resume_id

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception:
            pass


class DB:
    _client: MySQLClient = None  # class attribute holding the shared connection

    @classmethod
    def init(cls):
        if cls._client is None:
            cls._client = MySQLClient(
                host='39.101.135.56',
                user='tsreshub_prod',
                password='Tr5h$Prod!92@TsRH',
                db='tsreshub_db',
                port=3306
            )

    @classmethod
    def insert_resume(cls, data: dict):
        cls.init()  # make sure the connection is initialized

        # Only plain scalar values are written; nested structures are dropped.
        safe_data = {
            k: v for k, v in data.items()
            if isinstance(v, (str, int, float, type(None), datetime))
        }

        table = 'resumes_resumebasic'
        keys = ', '.join(safe_data.keys())
        placeholders = ', '.join(['%s'] * len(safe_data))
        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])

        sql = f"""
            INSERT INTO {table} ({keys}) VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """
        cls._client.execute(sql, list(safe_data.values()))
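A minimal usage sketch for DB.insert_resume (values invented; it assumes resumes_resumebasic has a unique key on resume_id, which is what makes the ON DUPLICATE KEY UPDATE an upsert):

DB.insert_resume({
    "resume_id": 123,
    "name": "张三",
    "update_time": datetime(2025, 4, 18, 10, 30),
    "source_id": 2,
    "experience": [],   # non-scalar values are filtered out by safe_data
})
# Roughly executes:
#   INSERT INTO resumes_resumebasic (resume_id, name, update_time, source_id)
#   VALUES (%s, %s, %s, %s)
#   ON DUPLICATE KEY UPDATE name = VALUES(name),
#                           update_time = VALUES(update_time),
#                           source_id = VALUES(source_id)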

17
debug/Debug_yutian_top.py Normal file
@@ -0,0 +1,17 @@
# debug/Debug_yutian_top.py
import sys
import os
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(project_root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.start()


if __name__ == '__main__':
    main()
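The unused YutianTopSpider import suggests the runner is meant to be switched between spiders by hand; a hypothetical variant could also queue both in one process, equivalent to running scrapy crawl yutian_top and scrapy crawl zhrczp_com from the project root:

def main_all():
    # Sketch: queue both spiders and let a single CrawlerProcess run them.
    process = CrawlerProcess(get_project_settings())
    process.crawl(YutianTopSpider)
    process.crawl(ZunHuaComSpider)
    process.start()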

0
debug/__init__.py Normal file

11
scrapy.cfg Normal file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = TS_resume_spider.settings
[deploy]
#url = http://localhost:6800/
project = TS_resume_spider