添加.gitignore文件以排除不必要的文件和目录;更新爬虫代码以处理公司介绍的提取逻辑
This commit is contained in:
parent
03b56ae1b2
commit
688e7fb3f7
56
.gitignore
vendored
Normal file
56
.gitignore
vendored
Normal file
@ -0,0 +1,56 @@
|
||||
# === Python 缓存 ===
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# === 环境变量文件 ===
|
||||
.env
|
||||
.env.*
|
||||
|
||||
# === 虚拟环境目录 ===
|
||||
venv/
|
||||
.venv/
|
||||
env/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# === 安装构建缓存 ===
|
||||
*.egg
|
||||
*.egg-info/
|
||||
.eggs/
|
||||
dist/
|
||||
build/
|
||||
pip-log.txt
|
||||
|
||||
# === 测试相关缓存文件 ===
|
||||
.coverage
|
||||
.tox/
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
|
||||
# === 数据库相关 ===
|
||||
*.sqlite3
|
||||
db.sqlite3
|
||||
|
||||
# === 日志文件 ===
|
||||
*.log
|
||||
logs/
|
||||
|
||||
# === 静态与媒体文件(Django) ===
|
||||
media/
|
||||
static/
|
||||
staticfiles/
|
||||
|
||||
# === IDE 配置 ===
|
||||
# PyCharm
.idea/
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
# VS Code
.vscode/
|
||||
|
||||
# === 系统自动生成文件 ===
|
||||
# macOS
.DS_Store
|
||||
# Windows
Thumbs.db
|
@ -11,6 +11,8 @@ def extract_company_data(xpathobj):
|
||||
name = first_or_empty('//h1/a/text()')
|
||||
# 公司介绍段落
|
||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
|
||||
if not intro_list:
|
||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
|
||||
introduction = "\r\n".join(intro_list)
|
||||
|
||||
# 如果没有名称或介绍,直接忽略
|
||||
|
@ -8,9 +8,7 @@ def first_or_empty(xpobj,path):
|
||||
return lst[0].strip() if lst else ""
|
||||
|
||||
def extract_position_data(xpathobj):
|
||||
print("aaa")
|
||||
title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
|
||||
print(title)
|
||||
if not title:
|
||||
return None
|
||||
nature = "全职"
|
||||
@ -64,7 +62,6 @@ def extract_position_data(xpathobj):
|
||||
|
||||
def get_position_href(xpathobj):
    """Collect the job-detail links found on a listing page.

    Args:
        xpathobj: Parsed document exposing an ``xpath(expr)`` method; the
            visible caller builds it with ``etree.HTML(response.text)``.

    Returns:
        list: The ``href`` value of every ``<a>`` that is a direct child of
        a ``<div>`` whose ``class`` attribute is exactly
        ``yunjoblist_newname``, stripped of surrounding whitespace, with
        empty and whitespace-only values dropped.
    """
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    # Strip each value once, then discard the ones that end up empty.
    # The leftover debug print of the raw list is intentionally gone.
    return [cleaned for cleaned in (href.strip() for href in hrefs) if cleaned]
|
||||
|
||||
|
||||
@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider):
|
||||
)
|
||||
|
||||
def parse(self, response):
|
||||
self.logger.info(f"Parsing page: {response.url}")
|
||||
xpathobj = etree.HTML(response.text)
|
||||
position_hrefs = get_position_href(xpathobj)
|
||||
if position_hrefs:
|
||||
|
Loading…
x
Reference in New Issue
Block a user