# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from datetime import datetime
import re
from TS_resume_spider.utils.db import DB
from scrapy.exceptions import DropItem

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class TsResumeSpiderPipeline:
    """Default no-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        """Return *item* untouched so later pipelines receive it."""
        return item
|
||
|
||
|
||
class YTSpiderPipeline:
    """Normalize raw resume items from the ``yutian_top`` / ``fnrc_vip`` spiders.

    Steps, in order:
      1. flatten up to four ``experience`` entries into ``work_N_*`` fields;
      2. rename raw spider keys to canonical DB column names via
         ``reverse_field_map`` (unknown keys are dropped);
      3. coerce numeric and datetime fields;
      4. tag the item with a ``source_id`` identifying the spider.

    Items from any other spider pass through unchanged.
    """

    # Raw spider field name -> canonical DB column name.
    reverse_field_map = {
        'resume_id': 'resume_id',
        'user_name': 'name',
        'sex_show': 'gender',
        'user_age': 'age',
        'area_show': 'job_location',
        'birthday': 'birthday',
        'education_level_msg': 'education',
        'expect_job': 'expected_position',
        'last_edit_time': 'update_time',
        'marry_status_show': 'marital_status',
        'residence': 'current_location',
        'phone_encrypt': 'phone',
        'work_type_show': 'job_property',
        'work_status_show': 'job_status',
        'work_1_description': 'work_1_description',
        'work_1_time': 'work_1_time',
        'work_1_experience': 'work_1_experience',
        'work_2_description': 'work_2_description',
        'work_2_time': 'work_2_time',
        'work_2_experience': 'work_2_experience',
        'work_3_description': 'work_3_description',
        'work_3_time': 'work_3_time',
        'work_3_experience': 'work_3_experience',
        'work_4_description': 'work_4_description',
        'work_4_time': 'work_4_time',
        'work_4_experience': 'work_4_experience',
    }

    def extract_int(self, s):
        """Return the first run of digits in *s* as an int, or None.

        Accepts any value (it is stringified first), e.g. "25岁" -> 25.
        """
        match = re.search(r'\d+', str(s))
        return int(match.group()) if match else None

    def parse_datetime(self, s):
        """Parse an ISO-8601 string into a datetime.

        Falls back to the sentinel datetime(2019, 12, 12) on missing or
        malformed input, so downstream DB writes always get a valid value.
        """
        try:
            return datetime.fromisoformat(s)
        except (TypeError, ValueError):
            # TypeError: s is not a str; ValueError: not valid ISO format.
            return datetime(2019, 12, 12)

    def process_item(self, item, spider):
        """Map a raw resume dict to DB-ready fields (yutian_top / fnrc_vip only)."""
        if spider.name not in ['yutian_top', 'fnrc_vip']:
            return item

        # Flatten up to four work-experience entries; pad absent slots with ''.
        experience = item.get("experience", [])
        for j in range(4):
            entry = experience[j] if j < len(experience) else {}
            item[f"work_{j + 1}_experience"] = entry.get("company", "")
            item[f"work_{j + 1}_time"] = entry.get("time_line", "")
            item[f"work_{j + 1}_description"] = entry.get("content", "")

        # Keep only known fields, renamed to their DB column names.
        item = {
            self.reverse_field_map[k]: v
            for k, v in item.items()
            if k in self.reverse_field_map
        }

        # Coerce free-text numeric fields ("25岁", "170cm", ...) to ints.
        for field in ("age", "height", "weight"):
            if field in item:
                item[field] = self.extract_int(item[field])

        if "update_time" in item:
            item["update_time"] = self.parse_datetime(item["update_time"])

        # Tag the originating site; the else branch is unreachable given the
        # guard above but kept as a defensive default.
        if spider.name == "yutian_top":
            item["source_id"] = 2
        elif spider.name == "fnrc_vip":
            item["source_id"] = 3
        else:
            item["source_id"] = None

        return item
|
||
|
||
|
||
class YTSavePipeline:
    """Persist resume items coming from the supported job-board spiders."""

    def process_item(self, item, spider):
        """Insert *item* into the resume table; drop it if resume_id is missing.

        Insert failures are logged and swallowed so one bad row does not
        abort the crawl.
        """
        supported = {'yutian_top', 'zhrczp_com', 'fnrc_vip', 'qj050_com'}
        if spider.name not in supported:
            return item

        rid = item.get("resume_id")
        if not rid:
            raise DropItem("⚠️ resume_id 缺失,已丢弃")

        try:
            DB.insert_resume(item)
        except Exception as exc:
            # Best-effort write: log and continue with the next item.
            spider.logger.warning(f"❌ 写入失败:resume_id={rid}, 错误={exc}")

        return item
|
||
|