# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class TsResumeSpiderPipeline:
    """Default no-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        """Return *item* untouched so later pipelines receive it."""
        return item
|
||
|
||
|
||
class YTSpiderPipeline:
    """Normalize raw resume items from the ``yutian_top`` / ``fnrc_vip`` spiders.

    Steps, in order:
      1. flatten up to four ``experience`` entries into ``work_N_*`` fields;
      2. rename raw spider keys to canonical DB column names via
         ``reverse_field_map`` (unknown keys are dropped);
      3. coerce numeric and datetime fields;
      4. tag the item with a ``source_id`` identifying the spider.

    Items from any other spider pass through unchanged.
    """

    # Raw spider field name -> canonical DB column name.
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        """Return the first run of digits in *s* as an int, or None.

        Accepts any value (it is stringified first), e.g. "25岁" -> 25.
        """
        match = re.search(r'\d+', str(s))
        return int(match.group()) if match else None

    def parse_datetime(self, s):
        """Parse an ISO-8601 string into a datetime.

        Falls back to the sentinel datetime(2019, 12, 12) on missing or
        malformed input, so downstream DB writes always get a valid value.
        """
        try:
            return datetime.fromisoformat(s)
        except (TypeError, ValueError):
            # TypeError: s is not a str; ValueError: not valid ISO format.
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        """Map a raw resume dict to DB-ready fields (yutian_top / fnrc_vip only)."""
        if spider.name not in ['yutian_top', 'fnrc_vip']:
            return item

        # Flatten up to four work-experience entries; pad absent slots with ''.
        experience = item.get("experience", [])
        for j in range(4):
            entry = experience[j] if j < len(experience) else {}
            item[f"work_{j + 1}_experience"] = entry.get("company", "")
            item[f"work_{j + 1}_time"] = entry.get("time_line", "")
            item[f"work_{j + 1}_description"] = entry.get("content", "")

        # Keep only known fields, renamed to their DB column names.
        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        # Coerce free-text numeric fields ("25岁", "170cm", ...) to ints.
        for field in ("age", "height", "weight"):
            if field in item:
                item[field] = self.extract_int(item[field])

        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        # Tag the originating site; the else branch is unreachable given the
        # guard above but kept as a defensive default.
        if spider.name == "yutian_top":
            item["source_id"] = 2
        elif spider.name == "fnrc_vip":
            item["source_id"] = 3
        else:
            item["source_id"] = None

        return item
|
||
|
||
|
||
class YTSavePipeline:
    """Persist resume items coming from the supported job-board spiders."""

    def process_item(self, item, spider):
        """Insert *item* into the resume table; drop it if resume_id is missing.

        Insert failures are logged and swallowed so one bad row does not
        abort the crawl.
        """
        supported = {'yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com'}
        if spider.name not in supported:
            return item

        rid = item.get("resume_id")
        if not rid:
            raise DropItem("⚠️ resume_id 缺失,已丢弃")

        try:
            DB.insert_resume(item)
        except Exception as exc:
            # Best-effort write: log and continue with the next item.
            spider.logger.warning(f"❌ 写入失败:resume_id={rid}, 错误={exc}")

        return item
|
||
|