commit 90217778be
0  TS_resume_spider/__init__.py  (new empty file)
BIN  TS_resume_spider/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/__init__.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/pipelines.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/pipelines.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/settings.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/__pycache__/settings.cpython-39.pyc  (new binary file, not shown)
12  TS_resume_spider/items.py  (new file)
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TsResumeSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

103  TS_resume_spider/middlewares.py  (new file)
@@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TsResumeSpiderSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TsResumeSpiderDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)

116  TS_resume_spider/pipelines.py  (new file)
@@ -0,0 +1,116 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class TsResumeSpiderPipeline:
    def process_item(self, item, spider):
        return item


class YTSpiderPipeline:
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        try:
            return int(re.search(r'\d+', str(s)).group())
        except Exception:
            return None

    def parse_datetime(self, s):
        try:
            return datetime.fromisoformat(s)
        except Exception:
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        if spider.name != 'yutian_top':
            return item
        experience = item.get("experience", [])
        for j in range(4):
            if j < len(experience):
                company = experience[j].get("company", "")
                time_line = experience[j].get("time_line", "")
                content = experience[j].get("content", "")
            else:
                company = ''
                time_line = ''
                content = ''

            item[f"work_{j + 1}_experience"] = company
            item[f"work_{j + 1}_time"] = time_line
            item[f"work_{j + 1}_description"] = content

        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        if "age" in item:
            item["age"] = self.extract_int(item["age"])

        if "height" in item:
            item["height"] = self.extract_int(item["height"])

        if "weight" in item:
            item["weight"] = self.extract_int(item["weight"])

        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        item["source_id"] = 2

        return item


class YTSavePipeline:
    def process_item(self, item, spider):
        if spider.name not in ['yutian_top', 'zhrczp_com']:
            return item
        resume_id = item.get("resume_id")
        if not resume_id:
            raise DropItem("⚠️ resume_id missing, item dropped")

        try:
            DB.insert_resume(item)
        except Exception as e:
            spider.logger.warning(f"❌ insert failed: resume_id={resume_id}, error={e}")

        return item

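Note (not part of the commit): a minimal sketch of what YTSpiderPipeline does to a raw API record. The field names come from reverse_field_map above; the fake spider and sample values are illustrative only.

    from TS_resume_spider.pipelines import YTSpiderPipeline

    class FakeSpider:          # stand-in: the pipeline only reads .name
        name = "yutian_top"

    raw = {
        "resume_id": 123,
        "user_name": "张三",
        "user_age": "28岁",
        "last_edit_time": "2024-05-01T10:00:00",
        "experience": [{"company": "ACME", "time_line": "2020-2023", "content": "测试"}],
    }

    out = YTSpiderPipeline().process_item(raw, FakeSpider())
    # out: {'resume_id': 123, 'name': '张三', 'age': 28,
    #       'update_time': datetime(2024, 5, 1, 10, 0),
    #       'work_1_experience': 'ACME', 'work_1_time': '2020-2023',
    #       'work_1_description': '测试', 'work_2_experience': '', ...,
    #       'source_id': 2}
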
93  TS_resume_spider/settings.py  (new file)
@@ -0,0 +1,93 @@
# Scrapy settings for TS_resume_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "TS_resume_spider"

SPIDER_MODULES = ["TS_resume_spider.spiders"]
NEWSPIDER_MODULE = "TS_resume_spider.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
    'TS_resume_spider.pipelines.YTSavePipeline': 500,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

4  TS_resume_spider/spiders/__init__.py  (new file)
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

BIN  TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/yutian_top.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/yutian_top.cpython-39.pyc  (new binary file, not shown)
BIN  TS_resume_spider/spiders/__pycache__/zhrczp_com.cpython-312.pyc  (new binary file, not shown)
85  TS_resume_spider/spiders/yutian_top.py  (new file)
@@ -0,0 +1,85 @@
import scrapy
import json


class YutianTopSpider(scrapy.Spider):
    name = 'yutian_top'
    allowed_domains = ['yutian.top']
    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']

    def start_requests(self):
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.yutian.top',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.yutian.top/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }

        cookies = {
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
            'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
        }

        for i in range(1, 6):

            payload = {
                'step': 1000,
                'page': i,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }

            yield scrapy.Request(
                url=self.start_urls[0],
                method='POST',
                headers=headers,
                cookies=cookies,
                body=json.dumps(payload),
                callback=self.parse,
            )

    def parse(self, response):
        status_code = response.status
        print(status_code)
        data = json.loads(response.text)
        for item in data.get('data', []):
            yield item

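Note (not part of the commit): parse() above only assumes the endpoint returns JSON with a top-level "data" list. A sketch of that shape; the record keys shown are the ones reverse_field_map in pipelines.py expects, the real payload is not confirmed here.

    import json

    sample_body = json.dumps({
        "data": [
            {"resume_id": 1, "user_name": "张三", "user_age": "28岁"},
            {"resume_id": 2, "user_name": "李四", "user_age": "31岁"},
        ]
    })
    for item in json.loads(sample_body).get("data", []):
        print(item)   # each dict is what the spider yields to the pipelines
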
148  TS_resume_spider/spiders/zhrczp_com.py  (new file)
@@ -0,0 +1,148 @@
import re
import urllib.parse
from typing import Iterable
import scrapy
from lxml import etree
from scrapy import Request


class ZunHuaComSpider(scrapy.Spider):
    name = 'zhrczp_com'
    allowed_domains = ['zhrczp.com']
    start_urls = ['https://www.zhrczp.com/member/index.php']
    cookies = {
        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
        'HMACCOUNT': 'B05D7338A384928F',
        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
        'uid': '60531',
        'shell': '9246a8c91784a3981081a37dd4bdcef9',
        'usertype': '2',
        'userdid': '0',
        'amtype': '0',
        'jobrefresh': '1',
        'gzh': '1',
        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
        'pc_bannerFlag': '1',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    def start_requests(self) -> Iterable[Request]:
        for page in range(1, 251):
            params = {
                'c': 'resume',
                'page': str(page),
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response):
        status_code = response.status
        print(status_code)
        html = response.text
        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
        resume_id_list = list(set(res))
        for item in resume_id_list:
            params = {
                'c': 'hr',
                'act': 'resumeInfo',
                'eid': item,
                'state': 'undefined',
                'from': '',
            }
            query_string = urllib.parse.urlencode(params)
            full_url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=full_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse2,
                meta={'resume_id': item},
            )

    def parse2(self, response):
        resume_id = response.meta.get('resume_id')

        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()

        parts = []
        if parts_raw:
            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
        if extra_span:
            parts.append(extra_span.strip())

        current_location = ''
        if parts and '现居' in parts[-1]:
            current_location = parts[-1]
            parts = parts[:-1]

        text = " ".join(parts)
        age = re.search(r'(\d{2})岁', text)
        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
        marital = re.search(r'(已婚|未婚)', text)
        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)

        # extract page fields via XPath
        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''

        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()
        yield {
            'resume_id': resume_id,
            'name': name.strip() if name else None,
            'age': age.group(1) if age else None,
            'height': height.group(1) if height else None,
            'weight': weight.group(1) if weight else None,
            'work_years': experience.group(1) if experience else None,
            'education': education.group(1) if education else None,
            'marital_status': marital.group(1) if marital else None,
            'ethnicity': ethnic.group(1) if ethnic else None,
            'current_location': current_location.replace('现居', '').strip() if current_location else None,
            'update_time': update_time or None,  # the "更新时间" prefix is already stripped above
            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
            'intended_position': job_titles.strip() if job_titles else None,
            'industry': industry.strip() if industry else None,
            'expected_salary': salary.strip() if salary else None,
            'available_time': report_time.strip() if report_time else None,
            'job_property': job_type.strip() if job_type else None,
            'job_status': job_status.strip() if job_status else None,
            'job_location': location.strip() if location else None,
            'source_id': 1,
        }

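Note (not part of the commit): a quick check of how parse2's regexes break down a "·"-separated summary line once the parts are joined with spaces; the sample string is invented to match the site's format.

    import re

    text = "28岁 170cm 60kg 3-5年 本科 未婚 汉"
    age = re.search(r'(\d{2})岁', text)
    height = re.search(r'(\d{2,3})\s*cm', text, re.I)
    weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
    experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
    print(age.group(1), height.group(1), weight.group(1), experience.group(1))
    # -> 28 170 60 3-5年
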
0  TS_resume_spider/utils/__init__.py  (new empty file)
BIN  TS_resume_spider/utils/__pycache__/__init__.cpython-312.pyc  (new binary file, not shown)
BIN  TS_resume_spider/utils/__pycache__/db.cpython-312.pyc  (new binary file, not shown)
66  TS_resume_spider/utils/db.py  (new file)
@@ -0,0 +1,66 @@
from datetime import datetime

import pymysql


class MySQLClient:
    def __init__(self, host, user, password, db, port=3306):
        self.conn = pymysql.connect(
            host=host,
            user=user,
            password=password,
            db=db,
            port=port,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        self.cursor = self.conn.cursor()

    def execute(self, sql, values=None):
        try:
            self.cursor.execute(sql, values or [])

        except Exception as e:
            print(f"[MySQL] execute failed: {e}")
            self.conn.rollback()

    def __del__(self):
        try:
            self.cursor.close()
            self.conn.close()
        except Exception:
            pass


class DB:
    _client: MySQLClient = None  # class attribute holding the shared connection

    @classmethod
    def init(cls):
        if cls._client is None:
            cls._client = MySQLClient(
                host='39.101.135.56',
                user='tsreshub_prod',
                password='Tr5h$Prod!92@TsRH',
                db='tsreshub_db',
                port=3306
            )

    @classmethod
    def insert_resume(cls, data: dict):
        cls.init()  # make sure the connection is initialized

        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}

        table = 'resumes_resumebasic'
        keys = ', '.join(safe_data.keys())
        placeholders = ', '.join(['%s'] * len(safe_data))
        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])

        sql = f"""
            INSERT INTO {table} ({keys}) VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_clause}
        """

        cls._client.execute(sql, list(safe_data.values()))

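Note (not part of the commit): the statement insert_resume builds for a small dict, assuming resumes_resumebasic has a UNIQUE key covering resume_id so the ON DUPLICATE KEY UPDATE branch takes effect.

    safe_data = {"resume_id": 123, "name": "张三", "age": 28}
    keys = ", ".join(safe_data)                          # resume_id, name, age
    placeholders = ", ".join(["%s"] * len(safe_data))    # %s, %s, %s
    update_clause = ", ".join(f"{k} = VALUES({k})" for k in safe_data if k != "resume_id")
    print(f"INSERT INTO resumes_resumebasic ({keys}) VALUES ({placeholders}) "
          f"ON DUPLICATE KEY UPDATE {update_clause}")
    # -> INSERT INTO resumes_resumebasic (resume_id, name, age) VALUES (%s, %s, %s)
    #    ON DUPLICATE KEY UPDATE name = VALUES(name), age = VALUES(age)
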
17  debug/Debug_yutian_top.py  (new file)
@@ -0,0 +1,17 @@
# debug/debug_spider.py
import sys
import os
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
sys.path.append(project_root)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TS_resume_spider.spiders.yutian_top import YutianTopSpider
from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider

def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ZunHuaComSpider)
    process.start()

if __name__ == '__main__':
    main()

0  debug/__init__.py  (new empty file)
11  scrapy.cfg  (new file)
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = TS_resume_spider.settings

[deploy]
#url = http://localhost:6800/
project = TS_resume_spider