添加.gitignore文件以排除不必要的文件和目录;更新爬虫代码以处理公司介绍的提取逻辑

This commit is contained in:
晓丰 2025-05-25 22:59:19 +08:00
parent 03b56ae1b2
commit 688e7fb3f7
3 changed files with 58 additions and 4 deletions

56
.gitignore vendored Normal file
View File

@ -0,0 +1,56 @@
# === Python bytecode caches ===
__pycache__/
*.py[cod]
*$py.class
# === Environment variable files ===
.env
.env.*
# === Virtual environment directories ===
venv/
.venv/
env/
ENV/
env.bak/
venv.bak/
# === Packaging / build artifacts ===
*.egg
*.egg-info/
.eggs/
dist/
build/
pip-log.txt
# === Test and coverage artifacts ===
.coverage
.tox/
nosetests.xml
coverage.xml
*.cover
*.py,cover
# === Databases ===
*.sqlite3
db.sqlite3
# === Log files ===
*.log
logs/
# === Static and media files (Django) ===
media/
static/
staticfiles/
# === IDE configuration ===
# NOTE: gitignore only treats "#" as a comment at the start of a line; a
# trailing "# ..." becomes part of the pattern, so every comment below sits
# on its own line.
# PyCharm
.idea/
*.iml
*.ipr
*.iws
# VS Code
.vscode/
# === OS-generated files ===
# macOS
.DS_Store
# Windows
Thumbs.db

View File

@ -11,6 +11,8 @@ def extract_company_data(xpathobj):
name = first_or_empty('//h1/a/text()')
# 公司介绍段落
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
if not intro_list:
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
introduction = "\r\n".join(intro_list)
# 如果没有名称或介绍,直接忽略

View File

@ -8,9 +8,7 @@ def first_or_empty(xpobj,path):
return lst[0].strip() if lst else ""
def extract_position_data(xpathobj):
print("aaa")
title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
print(title)
if not title:
return None
nature = "全职"
@ -64,7 +62,6 @@ def extract_position_data(xpathobj):
def get_position_href(xpathobj):
    """Return the cleaned job-detail links found on a listing page.

    Args:
        xpathobj: Any object exposing an ``xpath(query)`` method that returns
            an iterable of strings (the callers in this file pass the result
            of ``etree.HTML(...)``).

    Returns:
        A list of href strings with surrounding whitespace removed; entries
        that are empty or whitespace-only are dropped.
    """
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    # Strip each href once, then drop the blanks. The former debug print of
    # the raw list was removed so the helper no longer writes to stdout.
    stripped = (href.strip() for href in hrefs)
    return [href for href in stripped if href]
@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider):
)
def parse(self, response):
self.logger.info(f"Parsing page: {response.url}")
xpathobj = etree.HTML(response.text)
position_hrefs = get_position_href(xpathobj)
if position_hrefs: