# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
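#
# A hypothetical ITEM_PIPELINES entry for settings.py (the module path assumes
# this file lives at TS_resume_spider/pipelines.py; the priorities are
# illustrative, but YTSpiderPipeline must run before YTSavePipeline so the
# mapped fields reach the DB):
#
#   ITEM_PIPELINES = {
#       "TS_resume_spider.pipelines.YTSpiderPipeline": 300,
#       "TS_resume_spider.pipelines.YTSavePipeline": 400,
#   }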
import re
from datetime import datetime

from itemadapter import ItemAdapter  # useful for handling different item types with a single interface
from scrapy.exceptions import DropItem

from TS_resume_spider.utils.db import DB


class TsResumeSpiderPipeline:
    """Default pass-through pipeline left over from the Scrapy project template."""

    def process_item(self, item, spider):
        return item


class YTSpiderPipeline:
    # Maps raw field names coming from the source sites' APIs to the
    # normalized names used when the resume is stored.
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        """Return the first integer found in s (e.g. an age or height string), or None."""
        try:
            return int(re.search(r'\d+', str(s)).group())
        except (AttributeError, ValueError):
            return None

    def parse_datetime(self, s):
        """Parse an ISO-8601 timestamp; fall back to a fixed default date on bad input."""
        try:
            return datetime.fromisoformat(s)
        except (TypeError, ValueError):
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        # Only items from these spiders go through the field-mapping step.
        if spider.name not in ['yutian_top', 'fnrc_vip']:
            return item

        # Flatten up to four work-experience entries into work_N_* fields,
        # padding with empty strings when fewer entries are present.
        experience = item.get("experience", [])
        for j in range(4):
            if j < len(experience):
                company = experience[j].get("company", "")
                time_line = experience[j].get("time_line", "")
                content = experience[j].get("content", "")
            else:
                company = ''
                time_line = ''
                content = ''
            item[f"work_{j + 1}_experience"] = company
            item[f"work_{j + 1}_time"] = time_line
            item[f"work_{j + 1}_description"] = content

        # Keep only the known fields and rename them to their storage names.
        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        # Normalize numeric and datetime fields.
        if "age" in item:
            item["age"] = self.extract_int(item["age"])
        if "height" in item:
            item["height"] = self.extract_int(item["height"])
        if "weight" in item:
            item["weight"] = self.extract_int(item["weight"])
        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        # Tag the record with the site it was scraped from.
        if spider.name == "yutian_top":
            item["source_id"] = 2
        elif spider.name == "fnrc_vip":
            item["source_id"] = 3
        else:
            item["source_id"] = None
        return item
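
# A rough sketch of what YTSpiderPipeline produces for one raw item (values are
# illustrative only; a spider whose name is "yutian_top" is assumed):
#
#     raw = {
#         "user_name": "Zhang San",
#         "user_age": "25",
#         "last_edit_time": "2024-01-05 10:00:00",
#         "experience": [{"company": "ACME", "time_line": "2020-2023", "content": "QA"}],
#     }
#     mapped = YTSpiderPipeline().process_item(raw, spider)
#     # mapped == {"name": "Zhang San", "age": 25,
#     #            "update_time": datetime(2024, 1, 5, 10, 0),
#     #            "work_1_experience": "ACME", "work_1_time": "2020-2023",
#     #            "work_1_description": "QA", "source_id": 2,
#     #            plus empty work_2..work_4 fields}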


class YTSavePipeline:
    def process_item(self, item, spider):
        # Only persist items coming from the supported spiders.
        if spider.name not in ['yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com']:
            return item

        # An item without a resume_id cannot be stored, so drop it.
        resume_id = item.get("resume_id")
        if not resume_id:
            raise DropItem("⚠️ resume_id missing, item dropped")

        # Failed inserts are logged as warnings but do not abort the crawl.
        try:
            DB.insert_resume(item)
        except Exception as e:
            spider.logger.warning(f"❌ insert failed: resume_id={resume_id}, error={e}")
        return item
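
# A minimal sketch of exercising the drop behaviour outside a crawl (the spider
# stub below is an assumption for illustration, not part of this project):
#
#     import logging, types
#     spider = types.SimpleNamespace(name="yutian_top", logger=logging.getLogger("test"))
#     YTSavePipeline().process_item({"name": "no id"}, spider)  # raises DropItem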