晓丰 2025-04-20 01:49:43 +08:00
commit 90217778be
26 changed files with 655 additions and 0 deletions

6 binary files not shown.

12
TS_resume_spider/items.py Normal file
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class TsResumeSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

103
TS_resume_spider/middlewares.py Normal file
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class TsResumeSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TsResumeSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

116
TS_resume_spider/pipelines.py Normal file
@@ -0,0 +1,116 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class TsResumeSpiderPipeline:
    def process_item(self, item, spider):
        return item


class YTSpiderPipeline:
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        try:
            return int(re.search(r'\d+', str(s)).group())
        except (AttributeError, TypeError, ValueError):
            return None

    def parse_datetime(self, s):
        try:
            return datetime.fromisoformat(s)
        except (TypeError, ValueError):
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        if spider.name != 'yutian_top':
            return item

        # Flatten up to four work-experience entries into fixed columns.
        experience = item.get("experience", [])
        for j in range(4):
            if j < len(experience):
                company = experience[j].get("company", "")
                time_line = experience[j].get("time_line", "")
                content = experience[j].get("content", "")
            else:
                company = ''
                time_line = ''
                content = ''
            item[f"work_{j + 1}_experience"] = company
            item[f"work_{j + 1}_time"] = time_line
            item[f"work_{j + 1}_description"] = content

        # Keep only known fields, renamed to the database column names.
        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        if "age" in item:
            item["age"] = self.extract_int(item["age"])
        if "height" in item:
            item["height"] = self.extract_int(item["height"])
        if "weight" in item:
            item["weight"] = self.extract_int(item["weight"])
        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        item["source_id"] = 2
        return item


class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com']:
            return item

        resume_id = item.get("resume_id")
        if not resume_id:
            raise DropItem("⚠️ resume_id missing, item dropped")

        try:
            DB.insert_resume(item)
        except Exception as e:
            spider.logger.warning(f"❌ insert failed: resume_id={resume_id}, error={e}")

        return item
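For reference, a minimal sketch (not part of the commit) of how the two pipelines interact for a yutian_top item; the stand-in spider object and the sample field values are invented, only the key names come from reverse_field_map:

from types import SimpleNamespace

fake_spider = SimpleNamespace(name="yutian_top")
raw = {
    "resume_id": 123,
    "user_name": "张三",
    "user_age": "25岁",
    "last_edit_time": "2025-04-18T10:30:00",
    "experience": [{"company": "A公司", "time_line": "2020-2023", "content": "装配"}],
}
mapped = YTSpiderPipeline().process_item(raw, fake_spider)
# mapped == {"resume_id": 123, "name": "张三", "age": 25,
#            "update_time": datetime(2025, 4, 18, 10, 30),
#            "work_1_experience": "A公司", "work_1_time": "2020-2023",
#            "work_1_description": "装配", "work_2_experience": "", ...,
#            "source_id": 2}
# YTSavePipeline.process_item(mapped, fake_spider) would then upsert it via DB.insert_resume.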

93
TS_resume_spider/settings.py Normal file
@@ -0,0 +1,93 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "TS_resume_spider"
SPIDER_MODULES = ["TS_resume_spider.spiders"]
NEWSPIDER_MODULE = "TS_resume_spider.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

4
TS_resume_spider/spiders/__init__.py Normal file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

85
TS_resume_spider/spiders/yutian_top.py Normal file
@@ -0,0 +1,85 @@
import scrapy
import json
class YutianTopSpider(scrapy.Spider):
    name = 'yutian_top'
    allowed_domains = ['yutian.top']
    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']

    def start_requests(self):
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.yutian.top',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.yutian.top/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }
        cookies = {
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
            'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
        }
        for i in range(1, 6):
            payload = {
                'step': 1000,
                'page': i,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }
            yield scrapy.Request(
                url=self.start_urls[0],
                method='POST',
                headers=headers,
                cookies=cookies,
                body=json.dumps(payload),
                callback=self.parse,
            )

    def parse(self, response):
        self.logger.info("resume page status: %s", response.status)
        data = json.loads(response.text)
        for item in data.get('data', []):
            yield item
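parse() relays each entry of the JSON data list downstream without validation, so the response shape is only implied; judging from the keys YTSpiderPipeline maps, each entry presumably looks roughly like this (an assumption, not captured from the live API):

# Assumed shape of one entry in data["data"] (illustrative only):
# {
#     "resume_id": 123,
#     "user_name": "...",
#     "sex_show": "...",
#     "user_age": "25岁",
#     "area_show": "...",
#     "education_level_msg": "...",
#     "expect_job": "...",
#     "last_edit_time": "2025-04-18T10:30:00",
#     "marry_status_show": "...",
#     "residence": "...",
#     "phone_encrypt": "...",
#     "work_type_show": "...",
#     "work_status_show": "...",
#     "experience": [{"company": "...", "time_line": "...", "content": "..."}],
# }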

148
TS_resume_spider/spiders/zhrczp_com.py Normal file
@@ -0,0 +1,148 @@
import re
import urllib.parse
from typing import Iterable

import scrapy
from scrapy import Request
class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com'
    allowed_domains = ['zhrczp.com']
    start_urls = ['https://www.zhrczp.com/member/index.php']

    cookies = {
        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
        'HMACCOUNT': 'B05D7338A384928F',
        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
        'uid': '60531',
        'shell': '9246a8c91784a3981081a37dd4bdcef9',
        'usertype': '2',
        'userdid': '0',
        'amtype': '0',
        'jobrefresh': '1',
        'gzh': '1',
        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
        'pc_bannerFlag': '1',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[Request]:
        # Walk the resume list pages.
        for page in range(1, 251):
            params = {
                'c': 'resume',
                'page': str(page),
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response):
        self.logger.info("list page status: %s", response.status)
        html = response.text
        # Resume ids are embedded in onclick handlers: com_lookresume_check('<id>','1')
        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
        resume_id_list = list(set(res))
        for item in resume_id_list:
            params = {
                'c': 'hr',
                'act': 'resumeInfo',
                'eid': item,
                'state': 'undefined',
                'from': '',
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse2,
                meta={'resume_id': item},
            )

    def parse2(self, response):
        resume_id = response.meta.get('resume_id')
        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()

        parts = []
        if parts_raw:
            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
        if extra_span:
            parts.append(extra_span.strip())

        current_location = ''
        if parts and '现居' in parts[-1]:
            current_location = parts[-1]
            parts = parts[:-1]

        text = " ".join(parts)
        age = re.search(r'(\d{2})岁', text)
        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
        marital = re.search(r'(已婚|未婚)', text)
        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)

        # Extract page fields via XPath
        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
        # The "更新时间" prefix is already stripped here, so the value is used as-is below.
        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''
        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()

        yield {
            'resume_id': resume_id,
            'name': name.strip() if name else None,
            'age': age.group(1) if age else None,
            'height': height.group(1) if height else None,
            'weight': weight.group(1) if weight else None,
            'work_years': experience.group(1) if experience else None,
            'education': education.group(1) if education else None,
            'marital_status': marital.group(1) if marital else None,
            'ethnicity': ethnic.group(1) if ethnic else None,
            'current_location': current_location.replace('现居', '').strip() if current_location else None,
            'update_time': update_time if update_time else None,
            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
            'intended_position': job_titles.strip() if job_titles else None,
            'industry': industry.strip() if industry else None,
            'expected_salary': salary.strip() if salary else None,
            'available_time': report_time.strip() if report_time else None,
            'job_property': job_type.strip() if job_type else None,
            'job_status': job_status.strip() if job_status else None,
            'job_location': location.strip() if location else None,
            'source_id': 1,
        }

Binary file not shown.

66
TS_resume_spider/utils/db.py Normal file
@@ -0,0 +1,66 @@
from datetime import datetime
import pymysql
class MySQLClient:
    def __init__(self, host, user, password, db, port=3306):
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            db=db,
            port=port,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        self.cursor = self.conn.cursor()

    def execute(self, sql, values=None):
        try:
            self.cursor.execute(sql, values or [])
        except Exception as e:
            print(f"[MySQL] execute failed: {e}")
            self.conn.rollback()
            raise  # re-raise so callers (e.g. YTSavePipeline) can log the failing resume_id

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception:
            pass


class DB:
    _client: MySQLClient = None  # class attribute holding the shared connection

    @classmethod
    def init(cls):
        if cls._client is None:
            cls._client = MySQLClient(
                host='39.101.135.56',
                user='tsreshub_prod',
                password='Tr5h$Prod!92@TsRH',
                db='tsreshub_db',
                port=3306
            )

    @classmethod
    def insert_resume(cls, data: dict):
        cls.init()  # make sure the connection is initialized

        # Only plain scalar values are written; nested structures are dropped.
        safe_data = {
            k: v for k, v in data.items()
            if isinstance(v, (str, int, float, type(None), datetime))
        }

        table = 'resumes_resumebasic'
        keys = ', '.join(safe_data.keys())
        placeholders = ', '.join(['%s'] * len(safe_data))
        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])

        sql = f"""
            INSERT INTO {table} ({keys}) VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """
        cls._client.execute(sql, list(safe_data.values()))
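A minimal usage sketch for DB.insert_resume (values invented; it assumes resumes_resumebasic has a unique key on resume_id, which is what makes the ON DUPLICATE KEY UPDATE an upsert):

DB.insert_resume({
    "resume_id": 123,
    "name": "张三",
    "update_time": datetime(2025, 4, 18, 10, 30),
    "source_id": 2,
    "experience": [],   # non-scalar values are filtered out by safe_data
})
# Roughly executes:
#   INSERT INTO resumes_resumebasic (resume_id, name, update_time, source_id)
#   VALUES (%s, %s, %s, %s)
#   ON DUPLICATE KEY UPDATE name = VALUES(name),
#                           update_time = VALUES(update_time),
#                           source_id = VALUES(source_id)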

17
debug/Debug_yutian_top.py Normal file
@@ -0,0 +1,17 @@
# debug/Debug_yutian_top.py
import sys
import os
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(project_root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.start()


if __name__ == '__main__':
    main()
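The unused YutianTopSpider import suggests the runner is meant to be switched between spiders by hand; a hypothetical variant could also queue both in one process, equivalent to running scrapy crawl yutian_top and scrapy crawl zhrczp_com from the project root:

def main_all():
    # Sketch: queue both spiders and let a single CrawlerProcess run them.
    process = CrawlerProcess(get_project_settings())
    process.crawl(YutianTopSpider)
    process.crawl(ZunHuaComSpider)
    process.start()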

0
debug/__init__.py Normal file

11
scrapy.cfg Normal file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = TS_resume_spider.settings
[deploy]
#url = http://localhost:6800/
project = TS_resume_spider