添加.gitignore文件以排除不必要的文件和目录；更新爬虫代码以处理公司介绍的提取逻辑

2025-05-25 22:59:19 +08:00 · 2025-05-25 22:59:19 +08:00 · 688e7fb3f7
commit 688e7fb3f7
parent 03b56ae1b2
3 changed files with 58 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,56 @@
+# === Python 缓存 ===
+__pycache__/
+*.py[cod]
+*$py.class
+
+# === 环境变量文件 ===
+.env
+.env.*
+
+# === 虚拟环境目录 ===
+venv/
+.venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+
+# === 安装构建缓存 ===
+*.egg
+*.egg-info/
+.eggs/
+dist/
+build/
+pip-log.txt
+
+# === 测试相关缓存文件 ===
+.coverage
+.tox/
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+
+# === 数据库相关 ===
+*.sqlite3
+db.sqlite3
+
+# === 日志文件 ===
+*.log
+logs/
+
+# === 静态与媒体文件（Django） ===
+media/
+static/
+staticfiles/
+
+# === IDE 配置（.idea/ 为 PyCharm，.vscode/ 为 VS Code） ===
+.idea/
+*.iml
+*.ipr
+*.iws
+.vscode/
+
+# === 系统自动生成文件（.DS_Store 为 macOS，Thumbs.db 为 Windows） ===
+.DS_Store
+Thumbs.db
--- a/TS_resume_spider/spiders/zhrczp_com_compary.py
+++ b/TS_resume_spider/spiders/zhrczp_com_compary.py
@@ -11,6 +11,8 @@ def extract_company_data(xpathobj):
    name = first_or_empty('//h1/a/text()')
    # 公司介绍段落
    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
+    if not intro_list:
+        intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
    introduction = "\r\n".join(intro_list)

    # 如果没有名称或介绍，直接忽略
--- a/TS_resume_spider/spiders/zhrczp_com_position.py
+++ b/TS_resume_spider/spiders/zhrczp_com_position.py
@@ -8,9 +8,7 @@ def first_or_empty(xpobj,path):
    return lst[0].strip() if lst else ""

 def extract_position_data(xpathobj):
-    print("aaa")
    title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
-    print(title)
    if not title:
        return None
    nature = "全职"
@@ -64,7 +62,6 @@ def extract_position_data(xpathobj):

 def get_position_href(xpathobj):
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
-    print(hrefs)
    return [href.strip() for href in hrefs if href.strip()]


@@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider):
            )

    def parse(self, response):
-        self.logger.info(f"Parsing page: {response.url}")
        xpathobj = etree.HTML(response.text)
        position_hrefs = get_position_href(xpathobj)
        if position_hrefs: