commit 90217778bee74b80765ac2b72e6272a630b419da Author: Franklin-F Date: Sun Apr 20 01:49:43 2025 +0800 a diff --git a/TS_resume_spider/__init__.py b/TS_resume_spider/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/TS_resume_spider/__pycache__/__init__.cpython-312.pyc b/TS_resume_spider/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..c5ac1a2 Binary files /dev/null and b/TS_resume_spider/__pycache__/__init__.cpython-312.pyc differ diff --git a/TS_resume_spider/__pycache__/__init__.cpython-39.pyc b/TS_resume_spider/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..fbfe5fe Binary files /dev/null and b/TS_resume_spider/__pycache__/__init__.cpython-39.pyc differ diff --git a/TS_resume_spider/__pycache__/pipelines.cpython-312.pyc b/TS_resume_spider/__pycache__/pipelines.cpython-312.pyc new file mode 100644 index 0000000..7b7b35e Binary files /dev/null and b/TS_resume_spider/__pycache__/pipelines.cpython-312.pyc differ diff --git a/TS_resume_spider/__pycache__/pipelines.cpython-39.pyc b/TS_resume_spider/__pycache__/pipelines.cpython-39.pyc new file mode 100644 index 0000000..3baa470 Binary files /dev/null and b/TS_resume_spider/__pycache__/pipelines.cpython-39.pyc differ diff --git a/TS_resume_spider/__pycache__/settings.cpython-312.pyc b/TS_resume_spider/__pycache__/settings.cpython-312.pyc new file mode 100644 index 0000000..5bbc25e Binary files /dev/null and b/TS_resume_spider/__pycache__/settings.cpython-312.pyc differ diff --git a/TS_resume_spider/__pycache__/settings.cpython-39.pyc b/TS_resume_spider/__pycache__/settings.cpython-39.pyc new file mode 100644 index 0000000..570a487 Binary files /dev/null and b/TS_resume_spider/__pycache__/settings.cpython-39.pyc differ diff --git a/TS_resume_spider/items.py b/TS_resume_spider/items.py new file mode 100644 index 0000000..7ff1068 --- /dev/null +++ b/TS_resume_spider/items.py @@ -0,0 +1,12 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class TsResumeSpiderItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/TS_resume_spider/middlewares.py b/TS_resume_spider/middlewares.py new file mode 100644 index 0000000..f29dd7b --- /dev/null +++ b/TS_resume_spider/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class TsResumeSpiderSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. 
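+        # The default template passes every request and item from the spider
+        # straight through to the next component unchanged.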
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class TsResumeSpiderDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py
new file mode 100644
index 0000000..8e08eb1
--- /dev/null
+++ b/TS_resume_spider/pipelines.py
@@ -0,0 +1,116 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from datetime import datetime
+import re
+from TS_resume_spider.utils.db import DB
+from scrapy.exceptions import DropItem
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class TsResumeSpiderPipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class YTSpiderPipeline:
+    reverse_field_map = {
+        'resume_id': 'resume_id',
+        'user_name': 'name',
+        'sex_show': 'gender',
+        'user_age': 'age',
+        'area_show': 'job_location',
+        'birthday': 'birthday',
+        'education_level_msg': 'education',
+        'expect_job': 'expected_position',
+        'last_edit_time': 'update_time',
+        'marry_status_show': 'marital_status',
+        'residence': 'current_location',
+        'phone_encrypt': 'phone',
+        'work_type_show': 'job_property',
+        'work_status_show': 'job_status',
+        'work_1_description': 'work_1_description',
+        'work_1_time': 'work_1_time',
+        'work_1_experience': 'work_1_experience',
+        'work_2_description': 'work_2_description',
+        'work_2_time': 'work_2_time',
+        'work_2_experience': 'work_2_experience',
+        'work_3_description': 'work_3_description',
+        'work_3_time': 'work_3_time',
+        'work_3_experience': 'work_3_experience',
+        'work_4_description': 'work_4_description',
+        'work_4_time': 'work_4_time',
+        'work_4_experience': 'work_4_experience',
+    }
+
+    def extract_int(self, s):
+        try:
+            return int(re.search(r'\d+', str(s)).group())
+        except (AttributeError, ValueError):
+            return None
+
+    def parse_datetime(self, s):
+        try:
+            return datetime.fromisoformat(s)
+        except (TypeError, ValueError):
+            return datetime(2019, 12, 12)
+
+    def process_item(self, item, spider):
+        if spider.name != 'yutian_top':
+            return item
+        experience = item.get("experience", [])
+        for j in range(4):
+            if j < len(experience):
+                company = experience[j].get("company", "")
+                time_line = experience[j].get("time_line", "")
+                content = experience[j].get("content", "")
+            else:
+                company = ''
+                time_line = ''
+                content = ''
+
+            item[f"work_{j + 1}_experience"] = company
+            item[f"work_{j + 1}_time"] = time_line
+            item[f"work_{j + 1}_description"] = content
+
+        item = {
+            self.reverse_field_map[k]: v
+            for k, v in item.items()
+            if k in self.reverse_field_map
+        }
+
+        if "age" in item:
+            item["age"] = self.extract_int(item["age"])
+
+        if "height" in item:
+            item["height"] = self.extract_int(item["height"])
+
+        if "weight" in item:
+            item["weight"] = self.extract_int(item["weight"])
+
+        if "update_time" in item:
+            item["update_time"] = self.parse_datetime(item["update_time"])
+
+        item["source_id"] = 2
+
+        return item
+
+
+class YTSavePipeline:
+    def process_item(self, item, spider):
+        if spider.name not in ['yutian_top', 'zhrczp_com']:
+            return item
+        resume_id = item.get("resume_id")
+        if not resume_id:
+            raise DropItem("resume_id missing, item dropped")
+
+        try:
+            DB.insert_resume(item)
+        except Exception as e:
+            spider.logger.warning(f"Insert failed: resume_id={resume_id}, error={e}")
+
+        return item
+
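For reference, YTSpiderPipeline above reshapes a raw yutian.top API record in two steps: it flattens the experience list into numbered work_N_* fields, then renames keys through reverse_field_map. A minimal sketch of that reshaping on a fabricated record (values illustrative, not captured data):

# Illustrative only: a fabricated API record run through the same
# flatten-then-rename steps as YTSpiderPipeline.process_item.
raw = {
    "resume_id": 1001,
    "user_name": "张三",
    "user_age": "28岁",
    "last_edit_time": "2025-04-19T20:00:00",
    "experience": [{"company": "某公司", "time_line": "2020-2023", "content": "运维"}],
}
for j, job in enumerate(raw.pop("experience")[:4], start=1):
    raw[f"work_{j}_experience"] = job.get("company", "")
    raw[f"work_{j}_time"] = job.get("time_line", "")
    raw[f"work_{j}_description"] = job.get("content", "")
# After the reverse_field_map renaming, extract_int, and parse_datetime, the
# item reaching YTSavePipeline looks roughly like:
# {"resume_id": 1001, "name": "张三", "age": 28,
#  "update_time": datetime(2025, 4, 19, 20, 0), "work_1_experience": "某公司", ...}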
diff --git 
a/TS_resume_spider/settings.py b/TS_resume_spider/settings.py new file mode 100644 index 0000000..6bab753 --- /dev/null +++ b/TS_resume_spider/settings.py @@ -0,0 +1,93 @@ +# Scrapy settings for TS_resume_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "TS_resume_spider" + +SPIDER_MODULES = ["TS_resume_spider.spiders"] +NEWSPIDER_MODULE = "TS_resume_spider.spiders" + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'TS_resume_spider.pipelines.YTSpiderPipeline': 300, + 'TS_resume_spider.pipelines.YTSavePipeline': 500, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value 
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" diff --git a/TS_resume_spider/spiders/__init__.py b/TS_resume_spider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/TS_resume_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc b/TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..083e29c Binary files /dev/null and b/TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc differ diff --git a/TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc b/TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..ccc770b Binary files /dev/null and b/TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc differ diff --git a/TS_resume_spider/spiders/__pycache__/yutian_top.cpython-312.pyc b/TS_resume_spider/spiders/__pycache__/yutian_top.cpython-312.pyc new file mode 100644 index 0000000..3a79f6a Binary files /dev/null and b/TS_resume_spider/spiders/__pycache__/yutian_top.cpython-312.pyc differ diff --git a/TS_resume_spider/spiders/__pycache__/yutian_top.cpython-39.pyc b/TS_resume_spider/spiders/__pycache__/yutian_top.cpython-39.pyc new file mode 100644 index 0000000..33741a8 Binary files /dev/null and b/TS_resume_spider/spiders/__pycache__/yutian_top.cpython-39.pyc differ diff --git a/TS_resume_spider/spiders/__pycache__/zhrczp_com.cpython-312.pyc b/TS_resume_spider/spiders/__pycache__/zhrczp_com.cpython-312.pyc new file mode 100644 index 0000000..71d098e Binary files /dev/null and b/TS_resume_spider/spiders/__pycache__/zhrczp_com.cpython-312.pyc differ diff --git a/TS_resume_spider/spiders/yutian_top.py b/TS_resume_spider/spiders/yutian_top.py new file mode 100644 index 0000000..8d85f95 --- /dev/null +++ b/TS_resume_spider/spiders/yutian_top.py @@ -0,0 +1,85 @@ +import scrapy +import json + + +class YutianTopSpider(scrapy.Spider): + name = 'yutian_top' + allowed_domains = ['yutian.top'] + start_urls = ['https://www.yutian.top/job/company/v1/resume/page'] + + def start_requests(self): + headers = { + 'accept': 'application/json, text/plain, */*', + 'accept-language': 'zh-CN,zh;q=0.9', + 'cache-control': 'no-cache', + 'content-type': 'application/json;charset=UTF-8', + 'origin': 'https://www.yutian.top', + 'pragma': 'no-cache', + 'priority': 'u=1, i', + 'referer': 'https://www.yutian.top/enterprise/resume_store/list', + 'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"', + 'sec-ch-ua-mobile': '?0', + 'sec-ch-ua-platform': '"Windows"', + 'sec-fetch-dest': 'empty', + 'sec-fetch-mode': 'cors', + 'sec-fetch-site': 'same-origin', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36', + } + + cookies = { + 'company_sign': '', + 'company_nonce': '', + 'cuid': '', + 'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33', + 'auth-token': 
'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
+        }
+
+        for i in range(1, 6):
+            payload = {
+                'step': 1000,
+                'page': i,
+                'education_level': [],
+                'arrival_time': [],
+                'work_time': [],
+                'area_id': [],
+                'keywords': '',
+                'work_status': '',
+                'work_status_show': '求职状态',
+                'category_id': '',
+                'work_type': '',
+                'work_type_show': '是否兼职',
+                'sex': '',
+                'sex_show': '性别',
+                'is_head': '',
+                'is_head_show': '有无照片',
+                'job_id': '',
+                'age': [],
+                'age_show': '年龄',
+                'refresh_time': 0,
+                'site_id': '',
+                'site_id2': '',
+                'province': '',
+                'city': '',
+                'county': '',
+                'provinceArr': [],
+                'cityArr': [],
+                'countyArr': [],
+                'only_job_category': 0,
+            }
+
+            yield scrapy.Request(
+                url=self.start_urls[0],
+                method='POST',
+                headers=headers,
+                cookies=cookies,
+                body=json.dumps(payload),
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        self.logger.info("Resume list page returned HTTP %s", response.status)
+        data = json.loads(response.text)
+        for item in data.get('data', []):
+            yield item
\ No newline at end of file
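YutianTopSpider above drives pagination by POSTing the same JSON filter payload with an incrementing page number and yielding the raw dicts from the response's data list. A standalone sketch for probing that endpoint outside Scrapy, assuming the requests package is available; the auth-token value is a placeholder, and trimming the payload to its two paging fields is an assumption about what the API tolerates:

# Standalone probe of the yutian.top resume endpoint (not part of this
# commit). The URL, payload keys, and cookie name mirror the spider above;
# AUTH_TOKEN is a placeholder you must supply yourself.
import requests

URL = "https://www.yutian.top/job/company/v1/resume/page"
AUTH_TOKEN = "<your auth-token cookie>"

def fetch_page(page: int) -> list:
    resp = requests.post(
        URL,
        json={"step": 1000, "page": page},  # assumption: omitted filters are tolerated
        cookies={"auth-token": AUTH_TOKEN},
        headers={"referer": "https://www.yutian.top/enterprise/resume_store/list"},
        timeout=15,
    )
    resp.raise_for_status()
    return resp.json().get("data", [])

if __name__ == "__main__":
    print(len(fetch_page(1)))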
diff --git a/TS_resume_spider/spiders/zhrczp_com.py b/TS_resume_spider/spiders/zhrczp_com.py
new file mode 100644
index 0000000..9946ca3
--- /dev/null
+++ b/TS_resume_spider/spiders/zhrczp_com.py
@@ -0,0 +1,148 @@
+import re
+import urllib.parse
+from typing import Iterable
+import scrapy
+from scrapy import Request
+
+
+class ZunHuaComSpider(scrapy.Spider):
+    name = 'zhrczp_com'
+    allowed_domains = ['zhrczp.com']
+    start_urls = ['https://www.zhrczp.com/member/index.php']
+    cookies = {
+        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
+        'HMACCOUNT': 'B05D7338A384928F',
+        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
+        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
+        'uid': '60531',
+        'shell': '9246a8c91784a3981081a37dd4bdcef9',
+        'usertype': '2',
+        'userdid': '0',
+        'amtype': '0',
+        'jobrefresh': '1',
+        'gzh': '1',
+        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
+        'pc_bannerFlag': '1',
+    }
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+    }
+
+    def start_requests(self) -> Iterable[Request]:
+        for page in range(1, 251):
+            params = {
+                'c': 'resume',
+                'page': str(page),
+            }
+            query_string = urllib.parse.urlencode(params)
+            full_url = f"{self.start_urls[0]}?{query_string}"
+            yield scrapy.Request(
+                url=full_url,
+                method='GET',
+                headers=self.headers,
+                cookies=self.cookies,
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        self.logger.info("Resume list page returned HTTP %s", response.status)
+        html = response.text
+        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
+        resume_id_list = list(set(res))
+        for item in resume_id_list:
+            params = {
+                'c': 'hr',
+                'act': 'resumeInfo',
+                'eid': item,
+                'state': 'undefined',
+                'from': '',
+            }
+            query_string = urllib.parse.urlencode(params)
+            full_url = f"{self.start_urls[0]}?{query_string}"
+            yield scrapy.Request(
+                url=full_url,
+                method='GET',
+                headers=self.headers,
+                cookies=self.cookies,
+                callback=self.parse2,
+                meta={'resume_id': item},
+            )
+
+    def parse2(self, response):
+        resume_id = response.meta.get('resume_id')
+
+        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
+        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()
+
+        parts = []
+        if parts_raw:
+            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
+            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
+        if extra_span:
+            parts.append(extra_span.strip())
+
+        current_location = ''
+        if parts and '现居' in parts[-1]:
+            current_location = parts[-1]
+            parts = parts[:-1]
+
+        text = " ".join(parts)
+        age = re.search(r'(\d{2})岁', text)
+        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
+        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
+        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
+        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
+        marital = re.search(r'(已婚|未婚)', text)
+        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)
+
+        # Page fields extracted via XPath
+        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
+        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
+        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''
+
+        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
+        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
+        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
+        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
+        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
+        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
+        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
+        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()
+        yield {
+            'resume_id': resume_id,
+            'name': name.strip() if name else None,
+            'age': age.group(1) if age else None,
+            'height': height.group(1) if height else None,
+            'weight': weight.group(1) if weight else None,
+            'work_years': experience.group(1) if experience else None,
+            'education': education.group(1) if education else None,
+            'marital_status': marital.group(1) if marital else None,
+            'ethnicity': ethnic.group(1) if ethnic else None,
+            'current_location': current_location.replace('现居', '').strip() if current_location else None,
+            'update_time': update_time if update_time else None,
+            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
+            'intended_position': job_titles.strip() if job_titles else None,
+            'industry': industry.strip() if industry else None,
+            'expected_salary': salary.strip() if salary else None,
+            'available_time': report_time.strip() if report_time else None,
+            'job_property': job_type.strip() if job_type else None,
+            'job_status': job_status.strip() if job_status else None,
+            'job_location': location.strip() if location else None,
+            'source_id': 1,
+        }
diff --git a/TS_resume_spider/utils/__init__.py b/TS_resume_spider/utils/__init__.py
new file mode 100644
index 0000000..e69de29
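parse2 above recovers structured fields by splitting the profile summary line on '·' and pattern-matching each fragment. A quick sanity sketch of those same regexes against a fabricated summary string (not captured data):

# Illustrative check of the parse2 regexes on a made-up profile line.
import re

text = "25岁 165cm 50kg 3-5年 大专 已婚 汉"
age = re.search(r'(\d{2})岁', text)
height = re.search(r'(\d{2,3})\s*cm', text, re.I)
weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)

assert age.group(1) == "25"
assert height.group(1) == "165"
assert weight.group(1) == "50"
assert experience.group(1) == "3-5年"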
diff --git a/TS_resume_spider/utils/__pycache__/__init__.cpython-312.pyc b/TS_resume_spider/utils/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..4254c5d
Binary files /dev/null and b/TS_resume_spider/utils/__pycache__/__init__.cpython-312.pyc differ
diff --git a/TS_resume_spider/utils/__pycache__/db.cpython-312.pyc b/TS_resume_spider/utils/__pycache__/db.cpython-312.pyc
new file mode 100644
index 0000000..9f0aef5
Binary files /dev/null and b/TS_resume_spider/utils/__pycache__/db.cpython-312.pyc differ
diff --git a/TS_resume_spider/utils/db.py b/TS_resume_spider/utils/db.py
new file mode 100644
index 0000000..834c6f9
--- /dev/null
+++ b/TS_resume_spider/utils/db.py
@@ -0,0 +1,66 @@
+from datetime import datetime
+
+import pymysql
+
+
+class MySQLClient:
+    def __init__(self, host, user, password, db, port=3306):
+        self.conn = pymysql.connect(
+            host=host,
+            user=user,
+            password=password,
+            db=db,
+            port=port,
+            charset='utf8mb4',
+            cursorclass=pymysql.cursors.DictCursor,
+            autocommit=True
+        )
+        self.cursor = self.conn.cursor()
+
+    def execute(self, sql, values=None):
+        try:
+            self.cursor.execute(sql, values or [])
+        except Exception as e:
+            print(f"[MySQL] execute failed: {e}")
+            self.conn.rollback()
+            raise  # re-raise so callers (e.g. YTSavePipeline) can log the failure
+
+    def __del__(self):
+        try:
+            self.cursor.close()
+            self.conn.close()
+        except Exception:
+            pass
+
+
+class DB:
+    _client: MySQLClient = None  # class attribute holding the shared connection
+
+    @classmethod
+    def init(cls):
+        if cls._client is None:
+            cls._client = MySQLClient(
+                host='39.101.135.56',
+                user='tsreshub_prod',
+                password='Tr5h$Prod!92@TsRH',
+                db='tsreshub_db',
+                port=3306
+            )
+
+    @classmethod
+    def insert_resume(cls, data: dict):
+        cls.init()  # make sure the connection is initialized
+
+        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
+
+        table = 'resumes_resumebasic'
+        keys = ', '.join(safe_data.keys())
+        placeholders = ', '.join(['%s'] * len(safe_data))
+        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
+
+        sql = f"""
+            INSERT INTO {table} ({keys}) VALUES ({placeholders})
+            ON DUPLICATE KEY UPDATE {update_clause}
+        """
+
+        cls._client.execute(sql, list(safe_data.values()))
diff --git a/debug/Debug_yutian_top.py b/debug/Debug_yutian_top.py
new file mode 100644
index 0000000..a190a37
--- /dev/null
+++ b/debug/Debug_yutian_top.py
@@ -0,0 +1,17 @@
+# debug/Debug_yutian_top.py
+import sys
+import os
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(project_root)
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from TS_resume_spider.spiders.yutian_top import YutianTopSpider
+from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
+
+def main():
+    process = CrawlerProcess(get_project_settings())
+    process.crawl(ZunHuaComSpider)
+    process.start()
+
+if __name__ == '__main__':
+    main()
diff --git a/debug/__init__.py b/debug/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..e3798f2
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = TS_resume_spider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = TS_resume_spider
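DB.insert_resume above builds its upsert dynamically from whichever scalar fields survive filtering, so the statement shape depends on the item. A sketch of the SQL it emits for a minimal item, assuming resumes_resumebasic carries a unique key on resume_id (the table schema is not part of this commit):

# Illustrative only: the statement DB.insert_resume would generate for a
# minimal three-field item.
data = {"resume_id": 123, "name": "张三", "source_id": 2}
keys = ", ".join(data)                        # "resume_id, name, source_id"
placeholders = ", ".join(["%s"] * len(data))  # "%s, %s, %s"
update = ", ".join(f"{k} = VALUES({k})" for k in data if k != "resume_id")
sql = (f"INSERT INTO resumes_resumebasic ({keys}) VALUES ({placeholders}) "
       f"ON DUPLICATE KEY UPDATE {update}")
# INSERT INTO resumes_resumebasic (resume_id, name, source_id)
# VALUES (%s, %s, %s)
# ON DUPLICATE KEY UPDATE name = VALUES(name), source_id = VALUES(source_id)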