添加.gitignore文件以排除不必要的文件和目录；更新爬虫代码以处理公司介绍的提取逻辑

2025-05-25 22:59:19 +08:00 · 2025-05-25 22:59:19 +08:00 · 688e7fb3f7
commit 688e7fb3f7
parent 03b56ae1b2
3 changed files with 58 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,56 @@
 # === Python 缓存 ===
 __pycache__/
 *.py[cod]
 *$py.class
 # === 环境变量文件 ===
 .env
 .env.*
 # === 虚拟环境目录 ===
 venv/
 .venv/
 env/
 ENV/
 env.bak/
 venv.bak/
 # === 安装构建缓存 ===
 *.egg
 *.egg-info/
 .eggs/
 dist/
 build/
 pip-log.txt
 # === 测试相关缓存文件 ===
 .coverage
 .tox/
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 # === 数据库相关 ===
 *.sqlite3
 db.sqlite3
 # === 日志文件 ===
 *.log
 logs/
 # === 静态与媒体文件（Django） ===
 media/
 static/
 staticfiles/
 # === IDE 配置 ===
 # PyCharm
 .idea/
 *.iml
 *.ipr
 *.iws
 # VS Code
 .vscode/
 # === 系统自动生成文件 ===
 # macOS
 .DS_Store
 # Windows
 Thumbs.db
--- a/TS_resume_spider/spiders/zhrczp_com_compary.py
+++ b/TS_resume_spider/spiders/zhrczp_com_compary.py
@ -11,6 +11,8 @@ def extract_company_data(xpathobj):
    name = first_or_empty('//h1/a/text()')
    # 公司介绍段落
    intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()]
    if not intro_list:
        intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()]
    introduction = "\r\n".join(intro_list)
    # 如果没有名称或介绍，直接忽略
--- a/TS_resume_spider/spiders/zhrczp_com_position.py
+++ b/TS_resume_spider/spiders/zhrczp_com_position.py
@ -8,9 +8,7 @@ def first_or_empty(xpobj,path):
    return lst[0].strip() if lst else ""
 def extract_position_data(xpathobj):
    print("aaa")
    title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()')
    print(title)
    if not title:
        return None
    nature = "全职"
@ -64,7 +62,6 @@ def extract_position_data(xpathobj):
 def get_position_href(xpathobj):
    hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href")
    print(hrefs)
    return [href.strip() for href in hrefs if href.strip()]
@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider):
            )
    def parse(self, response):
        self.logger.info(f"Parsing page: {response.url}")
        xpathobj = etree.HTML(response.text)
        position_hrefs = get_position_href(xpathobj)
        if position_hrefs: