更新YutianTopSpider爬虫以处理公司和职位信息的保存逻辑,增加错误处理和公司ID回写功能

This commit is contained in:
晓丰 2025-06-08 16:54:00 +08:00
parent 5e4ff46c80
commit f864ea6bb3

View File

@ -120,6 +120,24 @@ class YTSavePipeline:
class CompanySavePipeline:
def process_item(self, item, spider):
if spider.name == 'yutian_top_compary':
company = item.get("company")
if not company:
return item
if 'website' in company:
company['website_id'] = company.pop('website')
company_name = company.get("name")
website_id = company.get("website_id")
if not company_name or not website_id:
return item
try:
DB.insert_company(company)
# 设置 company_id 回写
item["position"]["company_id"] = DB.get_company_id(company_name)
except Exception as e:
spider.logger.warning(f"❌ 写入失败company_name={company_name}, 错误={e}")
return item
if spider.name not in ['zhrczp_com_compary']:
return item
@ -138,6 +156,19 @@ class CompanySavePipeline:
class PositionSavePipeline:
def process_item(self, item, spider):
if spider.name == 'yutian_top_compary':
position = item.get("position")
if not position:
return item
title = position.get("title")
if not title:
return item
try:
DB.insert_position(position)
except Exception as e:
spider.logger.warning(f"❌ 写入失败title={title}, company_id={position.get('company_id')}, 错误={e}")
return item
if spider.name not in ['zhrczp_com_position']:
return item