commit 90217778be
0  TS_resume_spider/__init__.py  (new empty file)
BIN  TS_resume_spider/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/__init__.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/pipelines.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/pipelines.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/settings.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/settings.cpython-39.pyc  (new binary file, not shown)
12  TS_resume_spider/items.py  (new file)
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TsResumeSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

103  TS_resume_spider/middlewares.py  (new file)
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TsResumeSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TsResumeSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

116  TS_resume_spider/pipelines.py  (new file)
@@ -0,0 +1,116 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class TsResumeSpiderPipeline:
    def process_item(self, item, spider):
        return item


class YTSpiderPipeline:
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        try:
            return int(re.search(r'\d+', str(s)).group())
        except Exception:
            return None

    def parse_datetime(self, s):
        try:
            return datetime.fromisoformat(s)
        except Exception:
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        if spider.name != 'yutian_top':
            return item
        experience = item.get("experience", [])
        for j in range(4):
            if j < len(experience):
                company = experience[j].get("company", "")
                time_line = experience[j].get("time_line", "")
                content = experience[j].get("content", "")
            else:
                company = ''
                time_line = ''
                content = ''

            item[f"work_{j + 1}_experience"] = company
            item[f"work_{j + 1}_time"] = time_line
            item[f"work_{j + 1}_description"] = content

        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        if "age" in item:
            item["age"] = self.extract_int(item["age"])

        if "height" in item:
            item["height"] = self.extract_int(item["height"])

        if "weight" in item:
            item["weight"] = self.extract_int(item["weight"])

        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        item["source_id"] = 2

        return item


class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com']:
            return item
        resume_id = item.get("resume_id")
        if not resume_id:
            raise DropItem("⚠️ resume_id missing, item dropped")

        try:
            DB.insert_resume(item)
        except Exception as e:
            spider.logger.warning(f"❌ insert failed: resume_id={resume_id}, error={e}")

        return item

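Note (not part of the commit): a minimal sketch of what YTSpiderPipeline does to a raw API record. The field names come from reverse_field_map above; the fake spider and sample values are illustrative only.

    from TS_resume_spider.pipelines import YTSpiderPipeline

    class FakeSpider:          # stand-in: the pipeline only reads .name
        name = "yutian_top"

    raw = {
        "resume_id": 123,
        "user_name": "张三",
        "user_age": "28岁",
        "last_edit_time": "2024-05-01T10:00:00",
        "experience": [{"company": "ACME", "time_line": "2020-2023", "content": "测试"}],
    }

    out = YTSpiderPipeline().process_item(raw, FakeSpider())
    # out: {'resume_id': 123, 'name': '张三', 'age': 28,
    #       'update_time': datetime(2024, 5, 1, 10, 0),
    #       'work_1_experience': 'ACME', 'work_1_time': '2020-2023',
    #       'work_1_description': '测试', 'work_2_experience': '', ...,
    #       'source_id': 2}
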
93  TS_resume_spider/settings.py  (new file)
@@ -0,0 +1,93 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "TS_resume_spider"

SPIDER_MODULES = ["TS_resume_spider.spiders"]
NEWSPIDER_MODULE = "TS_resume_spider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

4  TS_resume_spider/spiders/__init__.py  (new file)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

BIN  TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/yutian_top.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/yutian_top.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/zhrczp_com.cpython-312.pyc  (new binary file, not shown)
85  TS_resume_spider/spiders/yutian_top.py  (new file)
@@ -0,0 +1,85 @@
import scrapy
import json


class YutianTopSpider(scrapy.Spider):
    name = 'yutian_top'
    allowed_domains = ['yutian.top']
    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']

    def start_requests(self):
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.yutian.top',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.yutian.top/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }

        cookies = {
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
            'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
        }

        for i in range(1, 6):

            payload = {
                'step': 1000,
                'page': i,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }

            yield scrapy.Request(
                url=self.start_urls[0],
                method='POST',
                headers=headers,
                cookies=cookies,
                body=json.dumps(payload),
                callback=self.parse,
            )

    def parse(self, response):
        status_code = response.status
        print(status_code)
        data = json.loads(response.text)
        for item in data.get('data', []):
            yield item

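Note (not part of the commit): parse() above only assumes the endpoint returns JSON with a top-level "data" list. A sketch of that shape; the record keys shown are the ones reverse_field_map in pipelines.py expects, the real payload is not confirmed here.

    import json

    sample_body = json.dumps({
        "data": [
            {"resume_id": 1, "user_name": "张三", "user_age": "28岁"},
            {"resume_id": 2, "user_name": "李四", "user_age": "31岁"},
        ]
    })
    for item in json.loads(sample_body).get("data", []):
        print(item)   # each dict is what the spider yields to the pipelines
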
148  TS_resume_spider/spiders/zhrczp_com.py  (new file)
@@ -0,0 +1,148 @@
import re
import urllib.parse
from typing import Iterable
import scrapy
from lxml import etree
from scrapy import Request


class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com'
    allowed_domains = ['zhrczp.com']
    start_urls = ['https://www.zhrczp.com/member/index.php']
    cookies = {
        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
        'HMACCOUNT': 'B05D7338A384928F',
        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
        'uid': '60531',
        'shell': '9246a8c91784a3981081a37dd4bdcef9',
        'usertype': '2',
        'userdid': '0',
        'amtype': '0',
        'jobrefresh': '1',
        'gzh': '1',
        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
        'pc_bannerFlag': '1',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[Request]:
        for page in range(1, 251):
            params = {
                'c': 'resume',
                'page': str(page),
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response):
        status_code = response.status
        print(status_code)
        html = response.text
        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
        resume_id_list = list(set(res))
        for item in resume_id_list:
            params = {
                'c': 'hr',
                'act': 'resumeInfo',
                'eid': item,
                'state': 'undefined',
                'from': '',
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse2,
                meta={'resume_id': item},
            )

    def parse2(self, response):
        resume_id = response.meta.get('resume_id')

        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()

        parts = []
        if parts_raw:
            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
        if extra_span:
            parts.append(extra_span.strip())

        current_location = ''
        if parts and '现居' in parts[-1]:
            current_location = parts[-1]
            parts = parts[:-1]

        text = " ".join(parts)
        age = re.search(r'(\d{2})岁', text)
        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
        marital = re.search(r'(已婚|未婚)', text)
        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)

        # extract page fields via XPath
        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''

        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()
        yield {
            'resume_id': resume_id,
            'name': name.strip() if name else None,
            'age': age.group(1) if age else None,
            'height': height.group(1) if height else None,
            'weight': weight.group(1) if weight else None,
            'work_years': experience.group(1) if experience else None,
            'education': education.group(1) if education else None,
            'marital_status': marital.group(1) if marital else None,
            'ethnicity': ethnic.group(1) if ethnic else None,
            'current_location': current_location.replace('现居', '').strip() if current_location else None,
            'update_time': update_time or None,  # the "更新时间" prefix is already stripped above
            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
            'intended_position': job_titles.strip() if job_titles else None,
            'industry': industry.strip() if industry else None,
            'expected_salary': salary.strip() if salary else None,
            'available_time': report_time.strip() if report_time else None,
            'job_property': job_type.strip() if job_type else None,
            'job_status': job_status.strip() if job_status else None,
            'job_location': location.strip() if location else None,
            'source_id': 1,
        }

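Note (not part of the commit): a quick check of how parse2's regexes break down a "·"-separated summary line once the parts are joined with spaces; the sample string is invented to match the site's format.

    import re

    text = "28岁 170cm 60kg 3-5年 本科 未婚 汉"
    age = re.search(r'(\d{2})岁', text)
    height = re.search(r'(\d{2,3})\s*cm', text, re.I)
    weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
    experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
    print(age.group(1), height.group(1), weight.group(1), experience.group(1))
    # -> 28 170 60 3-5年
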
0  TS_resume_spider/utils/__init__.py  (new empty file)
BIN  TS_resume_spider/utils/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/utils/__pycache__/db.cpython-312.pyc  (new binary file, not shown)
66  TS_resume_spider/utils/db.py  (new file)
@@ -0,0 +1,66 @@
from datetime import datetime

import pymysql


class MySQLClient:
    def __init__(self, host, user, password, db, port=3306):
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            db=db,
            port=port,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        self.cursor = self.conn.cursor()

    def execute(self, sql, values=None):
        try:
            self.cursor.execute(sql, values or [])

        except Exception as e:
            print(f"[MySQL] execute failed: {e}")
            self.conn.rollback()

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception:
            pass


class DB:
    _client: MySQLClient = None  # class attribute holding the shared connection

    @classmethod
    def init(cls):
        if cls._client is None:
            cls._client = MySQLClient(
                host='39.101.135.56',
                user='tsreshub_prod',
                password='Tr5h$Prod!92@TsRH',
                db='tsreshub_db',
                port=3306
            )

    @classmethod
    def insert_resume(cls, data: dict):
        cls.init()  # make sure the connection is initialized

        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}

        table = 'resumes_resumebasic'
        keys = ', '.join(safe_data.keys())
        placeholders = ', '.join(['%s'] * len(safe_data))
        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])

        sql = f"""
            INSERT INTO {table} ({keys}) VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """

        cls._client.execute(sql, list(safe_data.values()))

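Note (not part of the commit): the statement insert_resume builds for a small dict, assuming resumes_resumebasic has a UNIQUE key covering resume_id so the ON DUPLICATE KEY UPDATE branch takes effect.

    safe_data = {"resume_id": 123, "name": "张三", "age": 28}
    keys = ", ".join(safe_data)                          # resume_id, name, age
    placeholders = ", ".join(["%s"] * len(safe_data))    # %s, %s, %s
    update_clause = ", ".join(f"{k} = VALUES({k})" for k in safe_data if k != "resume_id")
    print(f"INSERT INTO resumes_resumebasic ({keys}) VALUES ({placeholders}) "
          f"ON DUPLICATE KEY UPDATE {update_clause}")
    # -> INSERT INTO resumes_resumebasic (resume_id, name, age) VALUES (%s, %s, %s)
    #    ON DUPLICATE KEY UPDATE name = VALUES(name), age = VALUES(age)
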
17  debug/Debug_yutian_top.py  (new file)
@@ -0,0 +1,17 @@
# debug/debug_spider.py
import sys
import os
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(project_root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider

def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.start()

if __name__ == '__main__':
    main()

0  debug/__init__.py  (new empty file)
11  scrapy.cfg  (new file)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = TS_resume_spider.settings

[deploy]
#url = http://localhost:6800/
project = TS_resume_spider