重构数据库架构;添加简历表并更新公司插入逻辑

This commit is contained in:
晓丰 2025-05-25 06:02:11 +08:00
parent 0ed52acfb9
commit efb8632d39
3 changed files with 102 additions and 43 deletions

View File

@ -117,8 +117,8 @@ class YTSavePipeline:
return item return item
class CompanySavePipeline: class CompanySavePipeline:
def process_item(self, item, spider):
def process_item(self, item, spider): def process_item(self, item, spider):
if spider.name not in ['zhrczp_com_compary']: if spider.name not in ['zhrczp_com_compary']:
return item return item
@ -131,12 +131,10 @@ class CompanySavePipeline:
company_name = item.get("name") company_name = item.get("name")
website_id = item.get("website_id") website_id = item.get("website_id")
if not company_name or not website_id: if not company_name or not website_id:
raise DropItem(f"⚠️ 缺少必要字段,已丢弃: name={company_name}, website_id={website_id}") return None
try: try:
DB.insert_company(item) DB.insert_company(item)
except Exception as e: except Exception as e:
spider.logger.warning(f"❌ 写入失败company_name={company_name}, 错误={e}") spider.logger.warning(f"❌ 写入失败company_name={company_name}, 错误={e}")
return item return item

View File

@ -68,7 +68,7 @@ class ZunHuaComSpider(scrapy.Spider):
} }
async def start(self) -> Iterable[scrapy.Request]: async def start(self) -> Iterable[scrapy.Request]:
for page in range(1, 100000): for page in range(1000, 100_000):
yield scrapy.Request( yield scrapy.Request(
url=f"https://www.zhrczp.com/company/{page}.html", url=f"https://www.zhrczp.com/company/{page}.html",
headers=self.headers, headers=self.headers,
@ -77,9 +77,7 @@ class ZunHuaComSpider(scrapy.Spider):
) )
def parse(self, response): def parse(self, response):
# 使用 lxml 解析
xpathobj = etree.HTML(response.text) xpathobj = etree.HTML(response.text)
# 调用公共提取函数
company_data = extract_company_data(xpathobj) company_data = extract_company_data(xpathobj)
if company_data: if company_data:
yield company_data yield company_data

View File

@ -3,20 +3,18 @@ import os
from datetime import datetime from datetime import datetime
from sqlalchemy import ( from sqlalchemy import (
create_engine, MetaData, Table, Column, create_engine, MetaData, Table, Column,Integer,
BigInteger, String, Text, DateTime, text # <-- 导入 text BigInteger, String, Text, DateTime, text # <-- 导入 text
) )
from sqlalchemy.dialects.mysql import insert as mysql_insert from sqlalchemy.dialects.mysql import insert as mysql_insert
from sqlalchemy.engine.url import URL from sqlalchemy.engine.url import URL
# —— 数据库配置 —— #
DB_USER = os.getenv('DB_USER', 'tsreshub_prod') DB_USER = os.getenv('DB_USER', 'tsreshub_prod')
DB_PASS = os.getenv('DB_PASS', 'Tr5h$Prod!92@TsRH') DB_PASS = os.getenv('DB_PASS', 'Tr5h$Prod!92@TsRH')
DB_HOST = os.getenv('DB_HOST', '39.101.135.56') DB_HOST = os.getenv('DB_HOST', '39.101.135.56')
DB_PORT = int(os.getenv('DB_PORT', 3306)) DB_PORT = int(os.getenv('DB_PORT', 3306))
DB_NAME = os.getenv('DB_NAME', 'tsreshub_db') DB_NAME = os.getenv('DB_NAME', 'tsreshub_db')
# —— 生成安全的 URL —— #
db_url = URL.create( db_url = URL.create(
drivername="mysql+pymysql", drivername="mysql+pymysql",
username=DB_USER, username=DB_USER,
@ -26,11 +24,8 @@ db_url = URL.create(
database=DB_NAME, database=DB_NAME,
query={"charset": "utf8mb4"} query={"charset": "utf8mb4"}
) )
# —— 创建 Engine —— #
engine = create_engine(db_url, echo=False, pool_pre_ping=True) engine = create_engine(db_url, echo=False, pool_pre_ping=True)
# —— 定义元数据与表 —— #
metadata = MetaData() metadata = MetaData()
companies = Table( companies = Table(
@ -48,11 +43,80 @@ companies = Table(
Column('created_at', DateTime, default=datetime.utcnow), Column('created_at', DateTime, default=datetime.utcnow),
Column('updated_at', DateTime, default=datetime.utcnow, onupdate=datetime.utcnow), Column('updated_at', DateTime, default=datetime.utcnow, onupdate=datetime.utcnow),
) )
# SQLAlchemy mapping for the `resumes_resumebasic` table.
# Most fields are String(255)/Text because values are scraped verbatim from
# job-board resume pages; only a few are numeric. NOTE(review): column order
# is preserved exactly — it determines DDL emitted by metadata.create_all.
resumes = Table(
    'resumes_resumebasic', metadata,
    # Surrogate primary key.
    Column('id', BigInteger, primary_key=True, autoincrement=True),
    # Resume identifier from the source site; indexed for upsert lookups.
    Column('resume_id', Integer, nullable=False, index=True),
    Column('name', String(255)),
    Column('job_region', String(255)),
    Column('birthday', String(255)),
    Column('education', String(255)),
    Column('school', String(255)),
    Column('expected_position', String(255)),
    Column('last_active_time', String(255)),
    Column('marital_status', String(255)),
    Column('current_location', String(255)),
    Column('age', Integer),
    Column('phone', String(255)),
    Column('gender', String(255)),
    Column('job_type', String(255)),
    Column('job_status', String(255)),
    # Up to four work-history entries, flattened into numbered columns.
    Column('work_1_experience', Text),
    Column('work_1_time', String(255)),
    Column('work_1_description', Text),
    Column('work_2_experience', Text),
    Column('work_2_time', String(255)),
    Column('work_2_description', Text),
    Column('work_3_experience', Text),
    Column('work_3_time', String(255)),
    Column('work_3_description', Text),
    Column('work_4_experience', Text),
    Column('work_4_time', String(255)),
    Column('work_4_description', Text),
    Column('height', Integer),
    Column('weight', Integer),
    Column('work_years', String(255)),
    Column('highest_education', String(255)),
    Column('ethnicity', String(255)),
    Column('update_time', DateTime),
    Column('job_function', String(255)),
    Column('intended_position', String(255)),
    Column('industry', String(255)),
    Column('expected_salary', String(255)),
    Column('available_time', String(255)),
    Column('job_property', String(255)),
    Column('job_location', String(255)),
    Column('crawl_keywords', String(255)),
    Column('source_id', BigInteger),  # FK (by convention) to websites_website(id)
    # Bookkeeping timestamps; updated_at refreshes automatically on UPDATE.
    Column('created_at', DateTime, default=datetime.utcnow),
    Column('updated_at', DateTime, default=datetime.utcnow, onupdate=datetime.utcnow),
)
# (Optional) create the table structure on first run:
# metadata.create_all(engine)
class DB: class DB:
@classmethod
def insert_resume(cls, data: dict):
    """Upsert one scraped resume row into `resumes_resumebasic`.

    Only keys that match a known column and hold a plain scalar value are
    kept. Both ``resume_id`` and ``source_id`` are mandatory; rows missing
    either are skipped. Uses MySQL ``INSERT ... ON DUPLICATE KEY UPDATE``
    so a re-crawled resume refreshes the existing row instead of failing.
    """
    scalar_types = (str, int, float, type(None), datetime)
    # Whitelist: known columns carrying simple scalar values only.
    payload = {}
    for key, value in data.items():
        if key in resumes.c and isinstance(value, scalar_types):
            payload[key] = value

    rid = payload.get('resume_id')
    sid = payload.get('source_id')
    if rid is None or sid is None:
        # Both identifiers are required to dedupe/upsert; skip otherwise.
        print("❌ 必须提供 resume_id 和 source_id才可插入")
        return

    insert_stmt = mysql_insert(resumes).values(**payload)
    # On duplicate key, refresh every column except the surrogate id and
    # the original creation timestamp.
    refresh = {
        column.name: insert_stmt.inserted[column.name]
        for column in resumes.c
        if column.name not in ('id', 'created_at')
    }
    upsert_stmt = insert_stmt.on_duplicate_key_update(**refresh)
    with engine.begin() as conn:
        conn.execute(upsert_stmt)
    print(f"✅ 简历插入/更新成功resume_id={rid}, source_id={sid}")
@classmethod @classmethod
def insert_company(cls, data: dict): def insert_company(cls, data: dict):
safe = { safe = {
@ -77,18 +141,17 @@ class DB:
conn.execute(stmt) conn.execute(stmt)
print(f"✅ 插入/更新成功:{safe['name']}") print(f"✅ 插入/更新成功:{safe['name']}")
if __name__ == '__main__': if __name__ == '__main__':
# 测试连接:用 text() 包装 SQL 字符串
print("→ 尝试连接数据库…") print("→ 尝试连接数据库…")
try: try:
with engine.connect() as conn: with engine.connect() as conn:
conn.execute(text("SELECT 1")) # <-- 使用 text() conn.execute(text("SELECT 1"))
print("✅ 数据库连接成功") print("✅ 数据库连接成功")
except Exception as e: except Exception as e:
print(f"❌ 无法连接数据库:{e}") print(f"❌ 无法连接数据库:{e}")
exit(1) exit(1)
# 测试插入数据
test_data = { test_data = {
'name': '河北遵一建设工程有限公司', 'name': '河北遵一建设工程有限公司',
'category': '房地产/建筑/工程', 'category': '房地产/建筑/工程',