添加.gitignore文件以排除不必要的文件和目录;更新爬虫代码以处理公司介绍的提取逻辑
This commit is contained in:
parent
03b56ae1b2
commit
688e7fb3f7
56
.gitignore
vendored
Normal file
56
.gitignore
vendored
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
# === Python 缓存 ===
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# === 环境变量文件 ===
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
|
||||||
|
# === 虚拟环境目录 ===
|
||||||
|
venv/
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# === 安装构建缓存 ===
|
||||||
|
*.egg
|
||||||
|
*.egg-info/
|
||||||
|
.eggs/
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
pip-log.txt
|
||||||
|
|
||||||
|
# === 测试相关缓存文件 ===
|
||||||
|
.coverage
|
||||||
|
.tox/
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
|
||||||
|
# === 数据库相关 ===
|
||||||
|
*.sqlite3
|
||||||
|
db.sqlite3
|
||||||
|
|
||||||
|
# === 日志文件 ===
|
||||||
|
*.log
|
||||||
|
logs/
|
||||||
|
|
||||||
|
# === 静态与媒体文件(Django) ===
|
||||||
|
media/
|
||||||
|
static/
|
||||||
|
staticfiles/
|
||||||
|
|
||||||
|
# === IDE 配置 ===
|
||||||
|
# PyCharm
.idea/
|
||||||
|
*.iml
|
||||||
|
*.ipr
|
||||||
|
*.iws
|
||||||
|
# VS Code
.vscode/
|
||||||
|
|
||||||
|
# === 系统自动生成文件 ===
|
||||||
|
# macOS
.DS_Store
|
||||||
|
# Windows
Thumbs.db
|
@ -11,6 +11,8 @@ def extract_company_data(xpathobj):
|
|||||||
name = first_or_empty('//h1/a/text()')
|
name = first_or_empty('//h1/a/text()')
|
||||||
# 公司介绍段落
|
# 公司介绍段落
|
||||||
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
|
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
|
||||||
|
if not intro_list:
|
||||||
|
intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
|
||||||
introduction = "\r\n".join(intro_list)
|
introduction = "\r\n".join(intro_list)
|
||||||
|
|
||||||
# 如果没有名称或介绍,直接忽略
|
# 如果没有名称或介绍,直接忽略
|
||||||
|
@ -8,9 +8,7 @@ def first_or_empty(xpobj,path):
|
|||||||
return lst[0].strip() if lst else ""
|
return lst[0].strip() if lst else ""
|
||||||
|
|
||||||
def extract_position_data(xpathobj):
|
def extract_position_data(xpathobj):
|
||||||
print("aaa")
|
|
||||||
title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
|
title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
|
||||||
print(title)
|
|
||||||
if not title:
|
if not title:
|
||||||
return None
|
return None
|
||||||
nature = "全职"
|
nature = "全职"
|
||||||
@ -64,7 +62,6 @@ def extract_position_data(xpathobj):
|
|||||||
|
|
||||||
def get_position_href(xpathobj):
|
def get_position_href(xpathobj):
|
||||||
hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
|
hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
|
||||||
print(hrefs)
|
|
||||||
return [href.strip() for href in hrefs if href.strip()]
|
return [href.strip() for href in hrefs if href.strip()]
|
||||||
|
|
||||||
|
|
||||||
@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def parse(self, response):
|
def parse(self, response):
|
||||||
self.logger.info(f"Parsing page: {response.url}")
|
|
||||||
xpathobj = etree.HTML(response.text)
|
xpathobj = etree.HTML(response.text)
|
||||||
position_hrefs = get_position_href(xpathobj)
|
position_hrefs = get_position_href(xpathobj)
|
||||||
if position_hrefs:
|
if position_hrefs:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user