diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..59fa06a --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# === Python 缓存 === +__pycache__/ +*.py[cod] +*$py.class + +# === 环境变量文件 === +.env +.env.* + +# === 虚拟环境目录 === +venv/ +.venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# === 安装构建缓存 === +*.egg +*.egg-info/ +.eggs/ +dist/ +build/ +pip-log.txt + +# === 测试相关缓存文件 === +.coverage +.tox/ +nosetests.xml +coverage.xml +*.cover +*.py,cover + +# === 数据库相关 === +*.sqlite3 +db.sqlite3 + +# === 日志文件 === +*.log +logs/ + +# === 静态与媒体文件(Django) === +media/ +static/ +staticfiles/ + +# === IDE 配置 === +.idea/ # PyCharm +*.iml +*.ipr +*.iws +.vscode/ # VS Code + +# === 系统自动生成文件 === +.DS_Store # macOS +Thumbs.db # Windows diff --git a/TS_resume_spider/spiders/zhrczp_com_compary.py b/TS_resume_spider/spiders/zhrczp_com_compary.py index 96f85d6..ac5c505 100644 --- a/TS_resume_spider/spiders/zhrczp_com_compary.py +++ b/TS_resume_spider/spiders/zhrczp_com_compary.py @@ -11,6 +11,8 @@ def extract_company_data(xpathobj): name = first_or_empty('//h1/a/text()') # 公司介绍段落 intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/p/text()') if t.strip()] + if not intro_list: + intro_list = [t.strip() for t in xpathobj.xpath('//div[@class="company_img_auto"]/text()') if t.strip()] introduction = "\r\n".join(intro_list) # 如果没有名称或介绍,直接忽略 diff --git a/TS_resume_spider/spiders/zhrczp_com_position.py b/TS_resume_spider/spiders/zhrczp_com_position.py index 075253e..561c518 100644 --- a/TS_resume_spider/spiders/zhrczp_com_position.py +++ b/TS_resume_spider/spiders/zhrczp_com_position.py @@ -8,9 +8,7 @@ def first_or_empty(xpobj,path): return lst[0].strip() if lst else "" def extract_position_data(xpathobj): - print("aaa") title = first_or_empty(xpathobj, '//h1[@class="job_details_name"]/text()') - print(title) if not title: return None nature = "全职" @@ -64,7 +62,6 @@ def extract_position_data(xpathobj): def get_position_href(xpathobj): hrefs = xpathobj.xpath("//div[@class='yunjoblist_newname']/a/@href") - print(hrefs) return [href.strip() for href in hrefs if href.strip()] @@ -98,7 +95,6 @@ class ZunHuaComSpider(scrapy.Spider): ) def parse(self, response): - self.logger.info(f"Parsing page: {response.url}") xpathobj = etree.HTML(response.text) position_hrefs = get_position_href(xpathobj) if position_hrefs: