From f864ea6bb3418fdf30f27d3f1509ab38d32ebcbd Mon Sep 17 00:00:00 2001 From: Franklin-F Date: Sun, 8 Jun 2025 16:54:00 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0YutianTopSpider=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E4=BB=A5=E5=A4=84=E7=90=86=E5=85=AC=E5=8F=B8=E5=92=8C?= =?UTF-8?q?=E8=81=8C=E4=BD=8D=E4=BF=A1=E6=81=AF=E7=9A=84=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E9=80=BB=E8=BE=91=EF=BC=8C=E5=A2=9E=E5=8A=A0=E9=94=99=E8=AF=AF?= =?UTF-8?q?=E5=A4=84=E7=90=86=E5=92=8C=E5=85=AC=E5=8F=B8ID=E5=9B=9E?= =?UTF-8?q?=E5=86=99=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TS_resume_spider/pipelines.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py index b872d0c..76ab4bb 100644 --- a/TS_resume_spider/pipelines.py +++ b/TS_resume_spider/pipelines.py @@ -120,6 +120,24 @@ class YTSavePipeline: class CompanySavePipeline: def process_item(self, item, spider): + if spider.name == 'yutian_top_compary': + company = item.get("company") + if not company: + return item + if 'website' in company: + company['website_id'] = company.pop('website') + company_name = company.get("name") + website_id = company.get("website_id") + if not company_name or not website_id: + return item + try: + DB.insert_company(company) + # 设置 company_id 回写 + item["position"]["company_id"] = DB.get_company_id(company_name) + except Exception as e: + spider.logger.warning(f"❌ 写入失败:company_name={company_name}, 错误={e}") + return item + if spider.name not in ['zhrczp_com_compary']: return item @@ -138,6 +156,19 @@ class CompanySavePipeline: class PositionSavePipeline: def process_item(self, item, spider): + if spider.name == 'yutian_top_compary': + position = item.get("position") + if not position: + return item + title = position.get("title") + if not title: + return item + try: + DB.insert_position(position) + except Exception as e: + spider.logger.warning(f"❌ 写入失败:title={title}, company_id={position.get('company_id')}, 错误={e}") + return item + if spider.name not in ['zhrczp_com_position']: return item