Add FnrcVipSpider and QJ050ComSpider; update pipelines and db logic for new sources
parent 90217778be
commit f153c6d250
TS_resume_spider/pipelines.py
@@ -59,7 +59,7 @@ class YTSpiderPipeline:
         return datetime(2019, 12, 12)

     def process_item(self, item, spider):
-        if spider.name != 'yutian_top':
+        if spider.name not in ['yutian_top', 'fnrc_vip']:
             return item
         experience = item.get("experience", [])
         for j in range(4):
@@ -93,15 +93,18 @@ class YTSpiderPipeline:

         if "update_time" in item:
             item["update_time"] = self.parse_datetime(item["update_time"])
+        if spider.name == "yutian_top":
             item["source_id"] = 2
+        elif spider.name == "fnrc_vip":
+            item["source_id"] = 3
+        else:
+            item["source_id"] = None
         return item


 class YTSavePipeline:
     def process_item(self, item, spider):
-        if spider.name not in ['yutian_top', 'zhrczp_com']:
+        if spider.name not in ['yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com']:
             return item
         resume_id = item.get("resume_id")
         if not resume_id:
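Note on the change above: the source_id constants are now spread over an if/elif chain here (yutian_top → 2, fnrc_vip → 3), while qj050_com assigns 4 inside its own parse_detail. A table-driven lookup is one possible way to keep the mapping in a single place as more sources arrive; the snippet below is only a sketch that mirrors the values visible in this commit, and SOURCE_IDS / assign_source_id are illustrative names, not part of the code.

# Sketch only: central spider-name -> source_id mapping (names are hypothetical)
SOURCE_IDS = {
    'yutian_top': 2,
    'fnrc_vip': 3,
    'qj050_com': 4,   # currently set inside QJ050ComSpider.parse_detail instead
}

def assign_source_id(item, spider_name):
    # unknown spiders fall back to None, matching the else branch above
    item["source_id"] = SOURCE_IDS.get(spider_name)
    return item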
TS_resume_spider/settings.py
@@ -1,93 +1,75 @@
-# Scrapy settings for TS_resume_spider project
-#
-# For simplicity, this file contains only settings considered important or
-# commonly used. You can find more settings consulting the documentation:
-#
-#     https://docs.scrapy.org/en/latest/topics/settings.html
-#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# Configuration for the TS_resume_spider Scrapy project

+# Project name; used by default in the User-Agent and internally
 BOT_NAME = "TS_resume_spider"

+# Module path where the spider classes live
 SPIDER_MODULES = ["TS_resume_spider.spiders"]
+# Where newly generated spiders are created by default
 NEWSPIDER_MODULE = "TS_resume_spider.spiders"

-# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# Custom User-Agent; Scrapy's default is used unless replaced with a browser-like one
 # USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)"

-# Obey robots.txt rules
+# Whether to obey robots.txt (False recommended here)
 ROBOTSTXT_OBEY = False

-# Configure maximum concurrent requests performed by Scrapy (default: 16)
-# CONCURRENT_REQUESTS = 32
+# Maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 64  # overall concurrency; lower it to reduce server load and avoid dropped connections

-# Configure a delay for requests for the same website (default: 0)
-# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
-# See also autothrottle settings and docs
-# DOWNLOAD_DELAY = 3
-# The download delay setting will honor only one of:
-# CONCURRENT_REQUESTS_PER_DOMAIN = 16
-# CONCURRENT_REQUESTS_PER_IP = 16
+# Delay between requests to the same site (seconds); helps avoid being blocked
+DOWNLOAD_DELAY = 0.1

-# Disable cookies (enabled by default)
-# COOKIES_ENABLED = False
+# Maximum concurrent requests per domain
+CONCURRENT_REQUESTS_PER_DOMAIN = 64

-# Disable Telnet Console (enabled by default)
-# TELNETCONSOLE_ENABLED = False
+# Per-IP concurrency limit (left at the default, i.e. disabled)
+# CONCURRENT_REQUESTS_PER_IP = 8

-# Override the default request headers:
+# Whether cookies are enabled; turn on if session state must be tracked
+COOKIES_ENABLED = False

+# Default request headers; set them per request when something special is needed
 # DEFAULT_REQUEST_HEADERS = {
 #    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-#    "Accept-Language": "en",
+#    "Accept-Language": "zh-CN,zh;q=0.9",
 # }

-# Enable or disable spider middlewares
-# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-# SPIDER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543,
-# }
+# AutoThrottle (adjusts the download delay on the fly to help avoid blocking); currently disabled
+AUTOTHROTTLE_ENABLED = False
+# Initial download delay (seconds)
+AUTOTHROTTLE_START_DELAY = 0
+# Maximum download delay (seconds), useful for high-latency servers
+AUTOTHROTTLE_MAX_DELAY = 60
+# Target average concurrency; 1.0 means one request at a time
+AUTOTHROTTLE_TARGET_CONCURRENCY = 10
+# Whether to log every throttle adjustment
+AUTOTHROTTLE_DEBUG = False

-# Enable or disable downloader middlewares
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-# DOWNLOADER_MIDDLEWARES = {
-#    "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543,
-# }
+# Enable request retries to handle connection failures and network errors
+RETRY_ENABLED = True
+# Retry count (default 2); raised to tolerate an unstable network
+RETRY_TIMES = 5
+# Retry on these network/server errors and rate-limit responses
+RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]

-# Enable or disable extensions
-# See https://docs.scrapy.org/en/latest/topics/extensions.html
-# EXTENSIONS = {
-#    "scrapy.extensions.telnet.TelnetConsole": None,
-# }
-
-# Configure item pipelines
-# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+# Item pipelines in use; lower numbers run earlier
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
 }

-# Enable and configure the AutoThrottle extension (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-# AUTOTHROTTLE_ENABLED = True
-# The initial download delay
-# AUTOTHROTTLE_START_DELAY = 5
-# The maximum download delay to be set in case of high latencies
-# AUTOTHROTTLE_MAX_DELAY = 60
-# The average number of requests Scrapy should be sending in parallel to
-# each remote server
-# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-# Enable showing throttling stats for every response received:
-# AUTOTHROTTLE_DEBUG = False
+# Feed export encoding; prevents garbled Chinese output
+FEED_EXPORT_ENCODING = "utf-8"

-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# Future-proof request fingerprinting and asyncio reactor
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

+# HTTP caching can be added later if needed (not required)
 # HTTPCACHE_ENABLED = True
 # HTTPCACHE_EXPIRATION_SECS = 0
 # HTTPCACHE_DIR = "httpcache"
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+JOBDIR = 'job_info/ts_resume_spider'

-# Set settings whose default value is deprecated to a future-proof value
-REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
-TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
-FEED_EXPORT_ENCODING = "utf-8"
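These settings apply project-wide. If one of the new sources needs gentler pacing than CONCURRENT_REQUESTS = 64 with DOWNLOAD_DELAY = 0.1, Scrapy's standard custom_settings attribute can override values per spider; the spider name and numbers below are illustrative only, not part of this commit.

import scrapy

class SlowSourceSpider(scrapy.Spider):
    name = 'slow_source'  # hypothetical spider
    # per-spider overrides merged on top of the project settings above
    custom_settings = {
        'DOWNLOAD_DELAY': 1.0,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }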
TS_resume_spider/spiders/fnrc_vpi.py — new file, 112 lines
@@ -0,0 +1,112 @@
import requests
import scrapy

# Avoid running this spider if possible; it is very slow, because every page is
# fetched serially with a blocking requests.Session before anything is yielded.


class FnrcVipSpider(scrapy.Spider):
    name = 'fnrc_vip'

    def start_requests(self):
        url = "https://www.fnrc.vip/job/company/v1/resume/page"
        cookies = {
            'PHPSESSID': 'ca613ae99706037e356a247500acb97b',
            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDczNzA1ODUsImp0aSI6IjBlZDI0NTM0LWE0NjEtNDkxNC1iNDU1LWQxZGEzYzQ5N2U0NiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIxYTJkODFjMTFkM2MzMmVhYmVlNWFkM2E3NGFmYWViNyIsInRlbmFudF90b2tlbiI6ImQzNWVjMmEzNjAxODM1NWE4MTg3ZTEyODI3MzE3ZGRjIn0.HoaWksDiMxtkbBJ8jVPlKLKzd1UqNHo4KfecS2uVUaM',
            'company_sign': '',
            'company_nonce': '',
            'cuid': '',
        }

        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'no-cache',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.fnrc.vip',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': 'https://www.fnrc.vip/enterprise/resume_store/list',
            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        }

        session = requests.Session()
        session.headers.update(headers)
        session.cookies.update(cookies)

        for page in range(1, 6):
            # search payload; only the page number changes between requests
            payload = {
                'step': 1000,
                'page': page,
                'education_level': [],
                'arrival_time': [],
                'work_time': [],
                'area_id': [],
                'keywords': '',
                'work_status': '',
                'work_status_show': '求职状态',
                'category_id': '',
                'work_type': '',
                'work_type_show': '是否兼职',
                'sex': '',
                'sex_show': '性别',
                'is_head': '',
                'is_head_show': '有无照片',
                'job_id': '',
                'age': [],
                'age_show': '年龄',
                'refresh_time': 0,
                'site_id': '',
                'site_id2': '',
                'province': '',
                'city': '',
                'county': '',
                'provinceArr': [],
                'cityArr': [],
                'countyArr': [],
                'only_job_category': 0,
            }

            response = session.post(url, json=payload)
            data = response.json()

            for item in data.get('data', []):
                yield item
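FnrcVipSpider fetches all five pages with a blocking requests.Session inside start_requests, so the concurrency, retry, and throttling settings above never apply to these requests, which is why the file warns that it is slow. A non-blocking variant could hand the POSTs to Scrapy via JsonRequest instead; the sketch below is only an illustration, assuming the headers/cookies/payload dictionaries shown above are moved to class attributes named headers, cookies, and base_payload, and parse_page is a hypothetical callback.

import scrapy
from scrapy.http import JsonRequest

class FnrcVipAsyncSpider(scrapy.Spider):
    # Hypothetical non-blocking variant; headers/cookies/base_payload are assumed
    # to hold the same dictionaries defined in fnrc_vpi.py above.
    name = 'fnrc_vip_async'
    headers: dict = {}
    cookies: dict = {}
    base_payload: dict = {}

    def start_requests(self):
        for page in range(1, 6):
            yield JsonRequest(
                url="https://www.fnrc.vip/job/company/v1/resume/page",
                data=dict(self.base_payload, page=page),  # JSON body, POST by default
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_page,
            )

    def parse_page(self, response):
        # same item shape as before: each element of data[] is yielded as-is
        for item in response.json().get('data', []):
            yield item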
TS_resume_spider/spiders/qj050_com.py — new file, 108 lines
@@ -0,0 +1,108 @@
import json
import time
import urllib.parse
from datetime import datetime

import scrapy
from scrapy.http import Response


class QJ050ComSpider(scrapy.Spider):
    name = 'qj050_com'
    allowed_domains = ['qj050.com']
    start_urls = ['https://www.qj050.com/api/v1/resumes']

    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'authorization': 'Bearer <your token>',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
        'x-platform': '1',
        'x-site-id': 'undefined',
    }
    cookies = {
        'x-trace-id': '7d60110f6a7a4df595db14e54ee772dd',
        'has_login_log': 'yes',
        'HMACCOUNT': '52014CC932A93E9B',
        'token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpZCI6NDgxNTMsInVzZXJuYW1lIjoi55yf6LSkODg4OCIsInB3ZCI6IjFiYmJjNzc5OGRkMTFiNTI2YWQ4ZTVmYTYyNWY5MjVkIiwiaWF0IjoxNzQ1NzU3MjM4LCJleHAiOjE3NzcyOTMyMzh9.uU9G81yizRRUYCyJymit4n9vuysXCT-2V9PLdmdohgA',
        'token.sig': 'Zta83bKMN9mPlsm9ZVnv7PaA7MwJZrLYHYrQK4Ft1rY',
        'logged': '1',
    }

    def start_requests(self):
        for page in range(1, 5):
            params = {
                '_': str(int(time.time() * 1000)),
                'tab': 'resume',
                'pageSize': '1000',
                'pageIndex': str(page),
                'showStatus': 'true',
            }
            query_string = urllib.parse.urlencode(params)
            url = f"{self.start_urls[0]}?{query_string}"
            yield scrapy.Request(
                url=url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse,
            )

    def parse(self, response: Response, **kwargs):
        data = json.loads(response.text)
        for item in data['data']['items']:
            resume_id = item['id']
            detail_url = f"https://www.qj050.com/api/v1/resume/{resume_id}?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
            yield scrapy.Request(
                url=detail_url,
                method='GET',
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_detail,
            )

    def parse_detail(self, response):
        info = json.loads(response.text).get('data', {})

        data = {}

        # Basic fields
        data['resume_id'] = info.get('id')
        data['name'] = info.get('name') or None
        data['age'] = int(info.get('age')) if info.get('age') else None
        data['birthday'] = info.get('birthday') or None
        data['work_years'] = info.get('work_exp_value') or None
        data['highest_education'] = info.get('edu_value') or None
        data['marital_status'] = info.get('marriage_value') or None
        data['phone'] = info.get('phone') or None
        data['intended_position'] = ','.join(
            [item.get('name') for item in info.get('infoCateforyArrObj', [])]) if info.get(
            'infoCateforyArrObj') else None
        data['expected_salary'] = info.get('salaryDesc') or None
        data['job_property'] = info.get('work_type_value') or None
        data['job_status'] = info.get('job_instant_value') or None
        data['job_location'] = info.get('job_region_value') or None

        # Update time (convert last_edit_time to datetime when present)
        if 'last_edit_time' in info and info.get('last_edit_time'):
            try:
                data['update_time'] = datetime.strptime(info['last_edit_time'], "%Y-%m-%d %H:%M:%S")
            except Exception:
                data['update_time'] = None
        else:
            data['update_time'] = None

        # Work experience: keep at most four entries, padding missing ones with None
        works = info.get('works', [])
        for i in range(4):
            if i < len(works):
                company = works[i].get('company', '')
                content = works[i].get('content', '')
                combined = f"{company}:{content}" if company or content else ''
                data[f'work_{i + 1}_experience'] = combined or None
            else:
                data[f'work_{i + 1}_experience'] = None

            data[f'work_{i + 1}_time'] = None
            data[f'work_{i + 1}_description'] = None

        data['source_id'] = 4
        data['crawl_keywords'] = ''
        yield data
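QJ050ComSpider always requests four list pages of 1,000 resumes each and then one detail request per hit. If the account has fewer visible resumes, later pages simply come back empty; a small guard in parse can skip the detail fan-out for those pages. A sketch, assuming the same imports and response shape as in the file above:

    def parse(self, response, **kwargs):
        items = json.loads(response.text)['data']['items']
        if not items:
            return  # past the last page, nothing to follow
        for item in items:
            detail_url = (
                f"https://www.qj050.com/api/v1/resume/{item['id']}"
                f"?_={int(time.time() * 1000)}&view_type=resumeLibrary&privacy_description=1"
            )
            yield scrapy.Request(detail_url, headers=self.headers,
                                 cookies=self.cookies, callback=self.parse_detail)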
@@ -51,16 +51,23 @@ class DB:
     def insert_resume(cls, data: dict):
         cls.init()  # make sure the connection is initialized

+        # keep only basic data types
         safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}

+        if 'resume_id' not in safe_data or 'source_id' not in safe_data:
+            # both source_id and resume_id are required
+            return
+
         table = 'resumes_resumebasic'
         keys = ', '.join(safe_data.keys())
         placeholders = ', '.join(['%s'] * len(safe_data))
-        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
+        # note: exclude source_id and resume_id from the ON DUPLICATE KEY UPDATE clause
+        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k not in ('source_id', 'resume_id')])

         sql = f"""
             INSERT INTO {table} ({keys}) VALUES ({placeholders})
             ON DUPLICATE KEY UPDATE {update_clause}
         """

         cls._client.execute(sql, list(safe_data.values()))
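For a concrete picture of what insert_resume now builds: given a filtered dict with a resume_id, a source_id, and one other column, the generated statement upserts on the duplicate key while leaving resume_id and source_id untouched. The values below are purely illustrative.

# Illustrative walk-through of insert_resume (values are made up)
data = {'resume_id': 1001, 'source_id': 3, 'name': 'Zhang San'}
# keys          -> "resume_id, source_id, name"
# placeholders  -> "%s, %s, %s"
# update_clause -> "name = VALUES(name)"   (resume_id and source_id excluded)
# resulting SQL:
#   INSERT INTO resumes_resumebasic (resume_id, source_id, name) VALUES (%s, %s, %s)
#   ON DUPLICATE KEY UPDATE name = VALUES(name)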
@@ -7,10 +7,11 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 from TS_resume_spider.spiders.yutian_top import YutianTopSpider
 from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
+from TS_resume_spider.spiders.fnrc_vpi import FnrcVipSpider


 def main():
     process = CrawlerProcess(get_project_settings())
-    process.crawl(ZunHuaComSpider)
+    process.crawl(FnrcVipSpider)
     process.start()


 if __name__ == '__main__':
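main() currently runs only FnrcVipSpider. CrawlerProcess accepts several crawl() calls before start(), so all sources could be queued in one run if desired; a sketch, not part of this commit (QJ050ComSpider would additionally need an import from TS_resume_spider.spiders.qj050_com):

def main():
    process = CrawlerProcess(get_project_settings())
    # queue every source, then start the reactor once
    process.crawl(YutianTopSpider)
    process.crawl(ZunHuaComSpider)
    process.crawl(FnrcVipSpider)
    process.start()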