commit 90217778be
TS_resume_spider/__init__.py  (new file, empty)
TS_resume_spider/__pycache__/__init__.cpython-312.pyc   (new binary file, not shown)
TS_resume_spider/__pycache__/__init__.cpython-39.pyc    (new binary file, not shown)
TS_resume_spider/__pycache__/pipelines.cpython-312.pyc  (new binary file, not shown)
TS_resume_spider/__pycache__/pipelines.cpython-39.pyc   (new binary file, not shown)
TS_resume_spider/__pycache__/settings.cpython-312.pyc   (new binary file, not shown)
TS_resume_spider/__pycache__/settings.cpython-39.pyc    (new binary file, not shown)
TS_resume_spider/items.py  (new file, 12 lines)
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TsResumeSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
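Note: items.py stays the untouched startproject stub; the spiders below yield plain dicts instead of declared Items. That works because itemadapter (shipped with Scrapy and already imported by the generated templates) treats dicts as valid item types. A minimal check, assuming itemadapter is installed alongside Scrapy:

from itemadapter import ItemAdapter

adapter = ItemAdapter({"resume_id": 1, "name": "example"})  # a plain dict is a valid item type
print(adapter.get("resume_id"))  # -> 1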
TS_resume_spider/middlewares.py  (new file, 103 lines)
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TsResumeSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TsResumeSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
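Note: both middleware classes are the unmodified startproject template and are not enabled anywhere in this commit; the SPIDER_MIDDLEWARES / DOWNLOADER_MIDDLEWARES blocks in settings.py remain commented out. If one were needed later, activating it would be a settings change along these lines (a sketch mirroring the commented block already present in settings.py):

# settings.py (not enabled in this commit)
DOWNLOADER_MIDDLEWARES = {
    "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
}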
TS_resume_spider/pipelines.py  (new file, 116 lines)
@@ -0,0 +1,116 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class TsResumeSpiderPipeline:
    def process_item(self, item, spider):
        return item


class YTSpiderPipeline:
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        try:
            return int(re.search(r'\d+', str(s)).group())
        except (AttributeError, TypeError, ValueError):
            return None

    def parse_datetime(self, s):
        try:
            return datetime.fromisoformat(s)
        except (TypeError, ValueError):
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        if spider.name != 'yutian_top':
            return item
        experience = item.get("experience", [])
        for j in range(4):
            if j < len(experience):
                company = experience[j].get("company", "")
                time_line = experience[j].get("time_line", "")
                content = experience[j].get("content", "")
            else:
                company = ''
                time_line = ''
                content = ''

            item[f"work_{j + 1}_experience"] = company
            item[f"work_{j + 1}_time"] = time_line
            item[f"work_{j + 1}_description"] = content

        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        if "age" in item:
            item["age"] = self.extract_int(item["age"])

        if "height" in item:
            item["height"] = self.extract_int(item["height"])

        if "weight" in item:
            item["weight"] = self.extract_int(item["weight"])

        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        item["source_id"] = 2

        return item


class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com']:
            return item
        resume_id = item.get("resume_id")
        if not resume_id:
            raise DropItem("resume_id missing, item dropped")

        try:
            DB.insert_resume(item)
        except Exception as e:
            spider.logger.warning(f"Insert failed: resume_id={resume_id}, error={e}")

        return item
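The two pipelines split responsibility: YTSpiderPipeline normalizes items coming from the yutian_top spider (flattening the experience list, renaming fields via reverse_field_map, coercing numeric and datetime fields), while YTSavePipeline persists items from either spider. A minimal sketch of that transformation, assuming the project package is importable; the input values are invented, not taken from the real API:

from TS_resume_spider.pipelines import YTSpiderPipeline

raw = {
    'resume_id': 12345,
    'user_name': '张三',
    'user_age': '28岁',
    'last_edit_time': '2025-04-18T10:30:00',
    'experience': [
        {'company': 'ACME', 'time_line': '2020-2023', 'content': 'QA engineer'},
    ],
}


class _FakeSpider:  # stand-in for the real spider object; only .name is used
    name = 'yutian_top'


normalized = YTSpiderPipeline().process_item(raw, _FakeSpider())
# normalized now looks like:
# {'resume_id': 12345, 'name': '张三', 'age': 28,
#  'update_time': datetime(2025, 4, 18, 10, 30),
#  'work_1_experience': 'ACME', 'work_1_time': '2020-2023',
#  'work_1_description': 'QA engineer', ..., 'source_id': 2}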
TS_resume_spider/settings.py  (new file, 93 lines)
@@ -0,0 +1,93 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "TS_resume_spider"

SPIDER_MODULES = ["TS_resume_spider.spiders"]
NEWSPIDER_MODULE = "TS_resume_spider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
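Pipeline priorities run lowest first, so every scraped item passes through YTSpiderPipeline (300) for normalization before YTSavePipeline (500) attempts the MySQL write. A quick sanity check of the effective configuration, a sketch that assumes it is run from the project root where scrapy.cfg lives:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.getdict("ITEM_PIPELINES"))
# {'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
#  'TS_resume_spider.pipelines.YTSavePipeline': 500}
print(settings.get("TWISTED_REACTOR"))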
TS_resume_spider/spiders/__init__.py  (new file, 4 lines)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc    (new binary file, not shown)
TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc     (new binary file, not shown)
TS_resume_spider/spiders/__pycache__/yutian_top.cpython-312.pyc  (new binary file, not shown)
TS_resume_spider/spiders/__pycache__/yutian_top.cpython-39.pyc   (new binary file, not shown)
TS_resume_spider/spiders/__pycache__/zhrczp_com.cpython-312.pyc  (new binary file, not shown)
TS_resume_spider/spiders/yutian_top.py  (new file, 85 lines)
@@ -0,0 +1,85 @@
import scrapy
import json


class YutianTopSpider(scrapy.Spider):
    name = 'yutian_top'
    allowed_domains = ['yutian.top']
    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']

    def start_requests(self):
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.yutian.top',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.yutian.top/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }

        cookies = {
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
            'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
        }

        for i in range(1, 6):
            payload = {
                'step': 1000,
                'page': i,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }

            yield scrapy.Request(
                url=self.start_urls[0],
                method='POST',
                headers=headers,
                cookies=cookies,
                body=json.dumps(payload),
                callback=self.parse,
            )

    def parse(self, response):
        self.logger.info("resume page status: %s", response.status)
        data = json.loads(response.text)
        for item in data.get('data', []):
            yield item
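parse() assumes the endpoint answers with a JSON object whose top-level "data" key holds a list of resume records; each record is yielded unchanged and only renamed later by YTSpiderPipeline. A rough sketch of the shape that mapping implies, with field names taken from reverse_field_map and all values invented; the real API very likely returns more fields per record:

# Illustrative only, not a captured response.
example_response = {
    "data": [
        {
            "resume_id": 12345,
            "user_name": "张三",
            "sex_show": "男",
            "user_age": "28岁",
            "education_level_msg": "本科",
            "expect_job": "测试工程师",
            "last_edit_time": "2025-04-18 10:30:00",
            "experience": [
                {"company": "某公司", "time_line": "2020-2023", "content": "QA work"},
            ],
        },
    ],
}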
TS_resume_spider/spiders/zhrczp_com.py  (new file, 148 lines)
@@ -0,0 +1,148 @@
import re
import urllib.parse
from typing import Iterable

import scrapy
from lxml import etree
from scrapy import Request


class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com'
    allowed_domains = ['zhrczp.com']
    start_urls = ['https://www.zhrczp.com/member/index.php']
    cookies = {
        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
        'HMACCOUNT': 'B05D7338A384928F',
        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
        'uid': '60531',
        'shell': '9246a8c91784a3981081a37dd4bdcef9',
        'usertype': '2',
        'userdid': '0',
        'amtype': '0',
        'jobrefresh': '1',
        'gzh': '1',
        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
        'pc_bannerFlag': '1',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[Request]:
        for page in range(1, 251):
            params = {
                'c': 'resume',
                'page': str(page),
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response):
        self.logger.info("resume list status: %s", response.status)
        html = response.text
        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
        resume_id_list = list(set(res))
        for item in resume_id_list:
            params = {
                'c': 'hr',
                'act': 'resumeInfo',
                'eid': item,
                'state': 'undefined',
                'from': '',
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse2,
                meta={'resume_id': item},
            )

    def parse2(self, response):
        resume_id = response.meta.get('resume_id')

        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()

        parts = []
        if parts_raw:
            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
        if extra_span:
            parts.append(extra_span.strip())

        current_location = ''
        if parts and '现居' in parts[-1]:
            current_location = parts[-1]
            parts = parts[:-1]

        text = " ".join(parts)
        age = re.search(r'(\d{2})岁', text)
        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
        marital = re.search(r'(已婚|未婚)', text)
        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)

        # Extract page fields via XPath
        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''

        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()
        yield {
            'resume_id': resume_id,
            'name': name.strip() if name else None,
            'age': age.group(1) if age else None,
            'height': height.group(1) if height else None,
            'weight': weight.group(1) if weight else None,
            'work_years': experience.group(1) if experience else None,
            'education': education.group(1) if education else None,
            'marital_status': marital.group(1) if marital else None,
            'ethnicity': ethnic.group(1) if ethnic else None,
            'current_location': current_location.replace('现居', '').strip() if current_location else None,
            'update_time': update_time if update_time else None,
            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
            'intended_position': job_titles.strip() if job_titles else None,
            'industry': industry.strip() if industry else None,
            'expected_salary': salary.strip() if salary else None,
            'available_time': report_time.strip() if report_time else None,
            'job_property': job_type.strip() if job_type else None,
            'job_status': job_status.strip() if job_status else None,
            'job_location': location.strip() if location else None,
            'source_id': 1,
        }
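parse2 rebuilds the profile summary from the hr_resume_item div, joins the "·"-separated tokens, and then runs the regexes above over the joined text. A standalone check of that extraction on an invented summary string; the live page markup may of course differ:

import re

# Invented summary text shaped like the '·'-joined tokens the spider builds.
text = "28岁 170 cm 65 kg 3-5年 本科 未婚 汉"

age = re.search(r'(\d{2})岁', text)
height = re.search(r'(\d{2,3})\s*cm', text, re.I)
weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
work_years = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)

print(age.group(1), height.group(1), weight.group(1), work_years.group(1), education.group(1))
# -> 28 170 65 3-5年 本科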
TS_resume_spider/utils/__init__.py  (new file, empty)
TS_resume_spider/utils/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
TS_resume_spider/utils/__pycache__/db.cpython-312.pyc        (new binary file, not shown)
TS_resume_spider/utils/db.py  (new file, 66 lines)
@@ -0,0 +1,66 @@
from datetime import datetime

import pymysql


class MySQLClient:
    def __init__(self, host, user, password, db, port=3306):
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            db=db,
            port=port,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        self.cursor = self.conn.cursor()

    def execute(self, sql, values=None):
        try:
            self.cursor.execute(sql, values or [])
        except Exception as e:
            print(f"[MySQL] execute failed: {e}")
            self.conn.rollback()

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception:
            pass


class DB:
    _client: MySQLClient = None  # class attribute holding the shared connection

    @classmethod
    def init(cls):
        if cls._client is None:
            cls._client = MySQLClient(
                host='39.101.135.56',
                user='tsreshub_prod',
                password='Tr5h$Prod!92@TsRH',
                db='tsreshub_db',
                port=3306
            )

    @classmethod
    def insert_resume(cls, data: dict):
        cls.init()  # make sure the connection has been initialized

        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}

        table = 'resumes_resumebasic'
        keys = ', '.join(safe_data.keys())
        placeholders = ', '.join(['%s'] * len(safe_data))
        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])

        sql = f"""
            INSERT INTO {table} ({keys}) VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """

        cls._client.execute(sql, list(safe_data.values()))
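insert_resume filters the item down to scalar values and builds an upsert; deduplication across repeated crawls relies on the assumption that resumes_resumebasic has a unique key on resume_id. A standalone sketch of the statement-building logic, using the same string construction as above with invented values and nothing executed against the database:

data = {'resume_id': 12345, 'name': '张三', 'age': 28, 'source_id': 2}

keys = ', '.join(data.keys())
placeholders = ', '.join(['%s'] * len(data))
update_clause = ', '.join(f"{k} = VALUES({k})" for k in data if k != 'resume_id')

print(f"INSERT INTO resumes_resumebasic ({keys}) VALUES ({placeholders}) "
      f"ON DUPLICATE KEY UPDATE {update_clause}")
# INSERT INTO resumes_resumebasic (resume_id, name, age, source_id) VALUES (%s, %s, %s, %s)
#   ON DUPLICATE KEY UPDATE name = VALUES(name), age = VALUES(age), source_id = VALUES(source_id)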
debug/Debug_yutian_top.py  (new file, 17 lines)
@@ -0,0 +1,17 @@
# debug/debug_spider.py
import sys
import os

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(project_root)

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider


def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.start()


if __name__ == '__main__':
    main()
debug/__init__.py  (new file, empty)
scrapy.cfg  (new file, 11 lines)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = TS_resume_spider.settings

[deploy]
#url = http://localhost:6800/
project = TS_resume_spider