diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py
index 25dd6c0..b872d0c 100644
--- a/TS_resume_spider/pipelines.py
+++ b/TS_resume_spider/pipelines.py
@@ -123,11 +123,8 @@ class CompanySavePipeline:
         if spider.name not in ['zhrczp_com_compary']:
             return item
 
-        # Field mapping
         if 'website' in item:
             item['website_id'] = item.pop('website')
-
-        # Check required fields
         company_name = item.get("name")
         website_id = item.get("website_id")
         if not company_name or not website_id:
@@ -136,5 +133,23 @@ class CompanySavePipeline:
             DB.insert_company(item)
         except Exception as e:
             spider.logger.warning(f"❌ Insert failed: company_name={company_name}, error={e}")
+        return item
+
+
+class PositionSavePipeline:
+    def process_item(self, item, spider):
+        if spider.name not in ['zhrczp_com_position']:
+            return item
+
+        title = item.get("title")
+        company_name = item.pop("company_name", None)
+        if not title or not company_name:
+            return None
+        item['company_id'] = DB.get_company_id(company_name)
+
+        try:
+            DB.insert_position(item)
+        except Exception as e:
+            spider.logger.warning(f"❌ Insert failed: title={title}, company_name={company_name}, error={e}")
 
         return item
diff --git a/TS_resume_spider/settings.py b/TS_resume_spider/settings.py
index 19e6cad..4426fe9 100644
--- a/TS_resume_spider/settings.py
+++ b/TS_resume_spider/settings.py
@@ -1,6 +1,6 @@
 # Scrapy settings for the TS_resume_spider project
-
 # Project name; used by default in the User-Agent and internally
+
 BOT_NAME = "TS_resume_spider"
 
 # Module where the spider classes live
@@ -13,7 +13,10 @@ NEWSPIDER_MODULE = "TS_resume_spider.spiders"
 
 # Whether to obey robots.txt rules (False recommended here)
 ROBOTSTXT_OBEY = False
-
+# Cut down noise: disable offsite filtering and the telnet console, log at INFO
+OFFSITE_ENABLED = False
+LOG_LEVEL = "INFO"  # INFO keeps log output volume down
+TELNETCONSOLE_ENABLED = False
 # Maximum concurrent requests Scrapy performs (default 16)
 CONCURRENT_REQUESTS = 64  # lower this to reduce server load and avoid dropped connections
@@ -57,7 +60,8 @@ RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
 ITEM_PIPELINES = {
     'TS_resume_spider.pipelines.YTSpiderPipeline': 300,
     'TS_resume_spider.pipelines.YTSavePipeline': 500,
-    'TS_resume_spider.pipelines.CompanySavePipeline': 600,
+    'TS_resume_spider.pipelines.CompanySavePipeline': 501,
+    'TS_resume_spider.pipelines.PositionSavePipeline': 502,
 }
 
 # Output file encoding, prevents garbled Chinese characters
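A note on the pipeline wiring above: Scrapy runs `ITEM_PIPELINES` in ascending priority, so each item flows through `YTSavePipeline` (500), then `CompanySavePipeline` (501), then `PositionSavePipeline` (502), and every stage must return the item for the next stage to see it. `PositionSavePipeline` drops incomplete items by returning `None`; the more idiomatic Scrapy pattern is to raise `DropItem`, roughly as sketched below (an illustration only, not part of the patch):

```python
from scrapy.exceptions import DropItem


class PositionSavePipeline:
    def process_item(self, item, spider):
        # Items from other spiders pass through untouched.
        if spider.name != 'zhrczp_com_position':
            return item

        if not item.get("title") or not item.get("company_name"):
            # DropItem aborts the rest of the pipeline chain and is
            # counted in Scrapy's stats, unlike returning None.
            raise DropItem(f"missing title/company_name: {item!r}")

        return item
```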
"company_type": company_type, "founded_date": founded_date, "introduction": introduction, - "address": address, - "benefits": benefits_str, - "website": 1, + "address": address, + "benefits": benefits_str, + "website": 1, } +def get_company_href(xpathobj): + hrefs = xpathobj.xpath('//div[@class="firm_name"]/span/a/@href') + return [href.strip() for href in hrefs if href.strip()] + class ZunHuaComSpider(scrapy.Spider): name = 'zhrczp_com_compary' @@ -55,7 +57,6 @@ class ZunHuaComSpider(scrapy.Spider): 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'Pragma': 'no-cache', - 'Referer': 'https://www.zhrczp.com/member/index.php?c=resume&jobin=76&jobclass_search=76&cityin=&cityclass_search=&keyword=&minsalary=&maxsalary=&minage=&maxage=&exp=&edu=&uptime=&sex=&type=', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'same-origin', @@ -67,17 +68,40 @@ class ZunHuaComSpider(scrapy.Spider): 'sec-ch-ua-platform': '"Windows"', } - async def start(self) -> Iterable[scrapy.Request]: - for page in range(1000, 100_000): + def start_requests(self) -> Iterable[scrapy.Request]: + + for page in range(1, 186): + self.logger.info(f"Fetching company list page: {page}") yield scrapy.Request( - url=f"https://www.zhrczp.com/company/{page}.html", + url=f"https://www.zhrczp.com/company/list/0-0-0-0-0-0-{page}.html", headers=self.headers, callback=self.parse, - dont_filter=True, # 如果需要关闭重复过滤 + dont_filter=True, + ) + for page in range(1, 10): + self.logger.info(f"Fetching company list page: {page}") + yield scrapy.Request( + url=f"https://www.zhrczp.com/company/list/0-0-0-0-0-1-{page}.html", + headers=self.headers, + callback=self.parse, + dont_filter=True, ) def parse(self, response): + xpathobj = etree.HTML(response.text) + company_href = get_company_href(xpathobj) + if company_href: + for href in company_href: + self.logger.debug(href) + yield scrapy.Request( + url=href, + headers=self.headers, + callback=self.parse_company, + dont_filter=True, + ) + + def parse_company(self, response): xpathobj = etree.HTML(response.text) company_data = extract_company_data(xpathobj) if company_data: - yield company_data \ No newline at end of file + yield company_data diff --git a/TS_resume_spider/utils/db.py b/TS_resume_spider/utils/db.py index bf6564c..ffb4332 100644 --- a/TS_resume_spider/utils/db.py +++ b/TS_resume_spider/utils/db.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- import os from datetime import datetime - +from sqlalchemy.sql import select from sqlalchemy import ( - create_engine, MetaData, Table, Column,Integer, + create_engine, MetaData, Table, Column, Integer, BigInteger, String, Text, DateTime, text # <-- 导入 text ) from sqlalchemy.dialects.mysql import insert as mysql_insert @@ -92,6 +92,26 @@ resumes = Table( Column('updated_at', DateTime, default=datetime.utcnow, onupdate=datetime.utcnow), ) +table_positions = Table( + 'positions_position', metadata, + Column('id', BigInteger, primary_key=True, autoincrement=True), + Column('title', String(200), nullable=False), + Column('nature', String(50)), + Column('category', String(100)), + Column('region', String(100)), + Column('experience', String(100)), + Column('education', String(100)), + Column('salary', String(100)), + Column('company_id', BigInteger, nullable=False), + Column('website_id', BigInteger, nullable=False), + Column('benefits', Text), + Column('contact_info', String(200)), + Column('contact_name', String(100)), + Column('description', Text), + Column('openings', Integer), + Column('position_status', Integer), +) + 
diff --git a/TS_resume_spider/utils/db.py b/TS_resume_spider/utils/db.py
index bf6564c..ffb4332 100644
--- a/TS_resume_spider/utils/db.py
+++ b/TS_resume_spider/utils/db.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 import os
 from datetime import datetime
-
+from sqlalchemy.sql import select
 from sqlalchemy import (
-    create_engine, MetaData, Table, Column,Integer,
+    create_engine, MetaData, Table, Column, Integer,
     BigInteger, String, Text, DateTime, text  # <-- import text
 )
 from sqlalchemy.dialects.mysql import insert as mysql_insert
@@ -92,6 +92,26 @@ resumes = Table(
     Column('updated_at', DateTime, default=datetime.utcnow, onupdate=datetime.utcnow),
 )
 
+table_positions = Table(
+    'positions_position', metadata,
+    Column('id', BigInteger, primary_key=True, autoincrement=True),
+    Column('title', String(200), nullable=False),
+    Column('nature', String(50)),
+    Column('category', String(100)),
+    Column('region', String(100)),
+    Column('experience', String(100)),
+    Column('education', String(100)),
+    Column('salary', String(100)),
+    Column('company_id', BigInteger, nullable=False),
+    Column('website_id', BigInteger, nullable=False),
+    Column('benefits', Text),
+    Column('contact_info', String(200)),
+    Column('contact_name', String(100)),
+    Column('description', Text),
+    Column('openings', Integer),
+    Column('position_status', Integer),
+)
+
 
 class DB:
     @classmethod
@@ -139,7 +159,28 @@ class DB:
 
         with engine.begin() as conn:
             conn.execute(stmt)
-        print(f"✅ Insert/update succeeded: {safe['name']}")
+
+    @classmethod
+    def get_company_id(cls, company_name: str):
+        stmt = select(companies.c.id).where(companies.c.name == company_name)
+        with engine.connect() as conn:
+            result = conn.execute(stmt).scalar()
+        return result
+
+    @classmethod
+    def insert_position(cls, data: dict):
+        title = data.get('title')
+        website_id = data.get('website_id')
+        company_id = data.get('company_id')
+        if not title or website_id is None or company_id is None:
+            return
+        # Keep only keys that are real columns of positions_position
+        safe = {k: v for k, v in data.items() if k in table_positions.c}
+        stmt = mysql_insert(table_positions).values(**safe)
+        update_cols = {col.name: stmt.inserted[col.name] for col in table_positions.c if col.name != 'id'}
+        stmt = stmt.on_duplicate_key_update(**update_cols)
+        with engine.begin() as conn:
+            conn.execute(stmt)
 
 
 if __name__ == '__main__':
@@ -152,15 +193,16 @@ if __name__ == '__main__':
         print(f"❌ Could not connect to the database: {e}")
         exit(1)
 
-    test_data = {
-        'name': '河北遵一建设工程有限公司',
-        'category': '房地产/建筑/工程',
-        'size': '20-100人',
-        'company_type': '民营',
-        'founded_date': '',
-        'introduction': '河北遵一建设工程有限公司是一家诚信经营、具有良好口碑的建设工程公司……',
-        'address': '领袖嘉园西门口对面',
-        'benefits': '',
-        'website_id': 1,
-    }
-    DB.insert_company(test_data)
+    # test_data = {
+    #     'name': '河北遵一建设工程有限公司',
+    #     'category': '房地产/建筑/工程',
+    #     'size': '20-100人',
+    #     'company_type': '民营',
+    #     'founded_date': '',
+    #     'introduction': '河北遵一建设工程有限公司是一家诚信经营、具有良好口碑的建设工程公司……',
+    #     'address': '领袖嘉园西门口对面',
+    #     'benefits': '',
+    #     'website_id': 1,
+    # }
+    # DB.insert_company(test_data)
+    print(DB.get_company_id("托管教育"))
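One caveat on the upsert pattern used by `insert_company` and `insert_position`: MySQL's `INSERT ... ON DUPLICATE KEY UPDATE` only performs an update when the new row collides with the primary key or a unique index. As declared above, `positions_position` has only the auto-increment `id`, so a re-crawled position will always be inserted anew; a unique constraint over something like `(title, company_id, website_id)` would make the upsert real. A self-contained sketch of the pattern under that assumption (table name, constraint, and DSN are illustrative, not part of the patch):

```python
from sqlalchemy import (
    Column, Integer, MetaData, String, Table, UniqueConstraint, create_engine,
)
from sqlalchemy.dialects.mysql import insert as mysql_insert

metadata = MetaData()

# Illustrative table; the UniqueConstraint is what lets ON DUPLICATE KEY fire.
positions_demo = Table(
    'positions_demo', metadata,
    Column('id', Integer, primary_key=True, autoincrement=True),
    Column('title', String(200), nullable=False),
    Column('company_id', Integer, nullable=False),
    Column('salary', String(100)),
    UniqueConstraint('title', 'company_id', name='uq_title_company'),
)


def upsert_position(engine, row: dict) -> None:
    stmt = mysql_insert(positions_demo).values(**row)
    # On a (title, company_id) collision, overwrite every non-PK column.
    stmt = stmt.on_duplicate_key_update(
        {c.name: stmt.inserted[c.name] for c in positions_demo.c if c.name != 'id'}
    )
    with engine.begin() as conn:
        conn.execute(stmt)


if __name__ == '__main__':
    engine = create_engine("mysql+pymysql://user:pass@localhost/demo")  # placeholder DSN
    metadata.create_all(engine)
    upsert_position(engine, {'title': '电焊工', 'company_id': 1, 'salary': '5000-8000'})
    upsert_position(engine, {'title': '电焊工', 'company_id': 1, 'salary': '6000-9000'})  # updates salary
```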