From 688e7fb3f7daa8f09bf7fe41d5416d497ea1e4bf Mon Sep 17 00:00:00 2001 From: Franklin-F Date: Sun, 25 May 2025 22:59:19 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0.gitignore=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E4=BB=A5=E6=8E=92=E9=99=A4=E4=B8=8D=E5=BF=85=E8=A6=81=E7=9A=84?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=92=8C=E7=9B=AE=E5=BD=95=EF=BC=9B=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E7=88=AC=E8=99=AB=E4=BB=A3=E7=A0=81=E4=BB=A5=E5=A4=84?= =?UTF-8?q?=E7=90=86=E5=85=AC=E5=8F=B8=E4=BB=8B=E7=BB=8D=E7=9A=84=E6=8F=90?= =?UTF-8?q?=E5=8F=96=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 56 +++++++++++++++++++ .../spiders/zhrczp_com_compary.py | 2 + .../spiders/zhrczp_com_position.py | 4 -- 3 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..59fa06a --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# === Python 缓存 === +__pycache__/ +*.py[cod] +*$py.class + +# === 环境变量文件 === +.env +.env.* + +# === 虚拟环境目录 === +venv/ +.venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# === 安装构建缓存 === +*.egg +*.egg-info/ +.eggs/ +dist/ +build/ +pip-log.txt + +# === 测试相关缓存文件 === +.coverage +.tox/ +nosetests.xml +coverage.xml +*.cover +*.py,cover + +# === 数据库相关 === +*.sqlite3 +db.sqlite3 + +# === 日志文件 === +*.log +logs/ + +# === 静态与媒体文件(Django) === +media/ +static/ +staticfiles/ + +# === IDE 配置 (PyCharm / VS Code) === +.idea/ +*.iml +*.ipr +*.iws +.vscode/ + +# === 系统自动生成文件 (macOS / Windows) === +.DS_Store +Thumbs.db diff --git a/TS_resume_spider/spiders/zhrczp_com_compary.py b/TS_resume_spider/spiders/zhrczp_com_compary.py index 96f85d6..ac5c505 100644 --- a/TS_resume_spider/spiders/zhrczp_com_compary.py +++ b/TS_resume_spider/spiders/zhrczp_com_compary.py @@ -11,6 +11,8 @@ def extract_company_data(xpathobj): name = first_or_empty('//h1/a/text()') # 公司介绍段落 intro_list = [t.strip() for t in 
xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()] + if not intro_list: + intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()] introduction = "\r\n".join(intro_list) # 如果没有名称或介绍,直接忽略 diff --git a/TS_resume_spider/spiders/zhrczp_com_position.py b/TS_resume_spider/spiders/zhrczp_com_position.py index 075253e..561c518 100644 --- a/TS_resume_spider/spiders/zhrczp_com_position.py +++ b/TS_resume_spider/spiders/zhrczp_com_position.py @@ -8,9 +8,7 @@ def first_or_empty(xpobj,path): return lst[0].strip() if lst else "" def extract_position_data(xpathobj): - print("aaa") title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()') - print(title) if not title: return None nature = "全职" @@ -64,7 +62,6 @@ def extract_position_data(xpathobj): def get_position_href(xpathobj): hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href") - print(hrefs) return [href.strip() for href in hrefs if href.strip()] @@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider): ) def parse(self, response): - self.logger.info(f"Parsing page: {response.url}") xpathobj = etree.HTML(response.text) position_hrefs = get_position_href(xpathobj) if position_hrefs: