From 90217778bee74b80765ac2b72e6272a630b419da Mon Sep 17 00:00:00 2001
From: Franklin-F
Date: Sun, 20 Apr 2025 01:49:43 +0800
Subject: [PATCH] a

---
 TS_resume_spider/__init__.py           |   0
 TS_resume_spider/items.py              |  12 ++
 TS_resume_spider/middlewares.py        | 103 ++++++++++++
 TS_resume_spider/pipelines.py          | 116 ++++++++++++++
 TS_resume_spider/settings.py           |  93 +++++++++++
 TS_resume_spider/spiders/__init__.py   |   4 +
 TS_resume_spider/spiders/yutian_top.py |  85 ++++++++++
 TS_resume_spider/spiders/zhrczp_com.py | 148 ++++++++++++++++++
 TS_resume_spider/utils/__init__.py     |   0
 TS_resume_spider/utils/db.py           |  66 ++++++++
 debug/Debug_yutian_top.py              |  17 ++
 debug/__init__.py                      |   0
 scrapy.cfg                             |  11 ++
 13 files changed, 655 insertions(+)
 create mode 100644 TS_resume_spider/__init__.py
 create mode 100644 TS_resume_spider/items.py
 create mode 100644 TS_resume_spider/middlewares.py
 create mode 100644 TS_resume_spider/pipelines.py
 create mode 100644 TS_resume_spider/settings.py
 create mode 100644 TS_resume_spider/spiders/__init__.py
 create mode 100644 TS_resume_spider/spiders/yutian_top.py
 create mode 100644 TS_resume_spider/spiders/zhrczp_com.py
 create mode 100644 TS_resume_spider/utils/__init__.py
 create mode 100644 TS_resume_spider/utils/db.py
 create mode 100644 debug/Debug_yutian_top.py
 create mode 100644 debug/__init__.py
 create mode 100644 scrapy.cfg

diff --git a/TS_resume_spider/__init__.py b/TS_resume_spider/__init__.py
new file mode 100644
index 0000000..e69de29
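The file list above is the standard Scrapy project skeleton (settings, middlewares, pipelines, a spiders package) plus a small utils package for database access and a debug harness. As a minimal sketch, not part of the patch, of how the two new spiders are launched from the project root once it is applied, assuming Scrapy is installed:

```python
# Minimal sketch: launch one of the spiders added by this patch from the
# project root (the directory containing scrapy.cfg).
# scrapy.cmdline.execute is the same entry point as the `scrapy` CLI.
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "yutian_top"])  # or "zhrczp_com"
```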
diff --git a/TS_resume_spider/items.py b/TS_resume_spider/items.py
new file mode 100644
index 0000000..7ff1068
--- /dev/null
+++ b/TS_resume_spider/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class TsResumeSpiderItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/TS_resume_spider/middlewares.py b/TS_resume_spider/middlewares.py
new file mode 100644
index 0000000..f29dd7b
--- /dev/null
+++ b/TS_resume_spider/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class TsResumeSpiderSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class TsResumeSpiderDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/TS_resume_spider/pipelines.py b/TS_resume_spider/pipelines.py
new file mode 100644
index 0000000..8e08eb1
--- /dev/null
+++ b/TS_resume_spider/pipelines.py
@@ -0,0 +1,116 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from datetime import datetime
+import re
+from TS_resume_spider.utils.db import DB
+from scrapy.exceptions import DropItem
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+
+
+class TsResumeSpiderPipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class YTSpiderPipeline:
+    reverse_field_map = {
+        'resume_id': 'resume_id',
+        'user_name': 'name',
+        'sex_show': 'gender',
+        'user_age': 'age',
+        'area_show': 'job_location',
+        'birthday': 'birthday',
+        'education_level_msg': 'education',
+        'expect_job': 'expected_position',
+        'last_edit_time': 'update_time',
+        'marry_status_show': 'marital_status',
+        'residence': 'current_location',
+        'phone_encrypt': 'phone',
+        'work_type_show': 'job_property',
+        'work_status_show': 'job_status',
+        'work_1_description': 'work_1_description',
+        'work_1_time': 'work_1_time',
+        'work_1_experience': 'work_1_experience',
+        'work_2_description': 'work_2_description',
+        'work_2_time': 'work_2_time',
+        'work_2_experience': 'work_2_experience',
+        'work_3_description': 'work_3_description',
+        'work_3_time': 'work_3_time',
+        'work_3_experience': 'work_3_experience',
+        'work_4_description': 'work_4_description',
+        'work_4_time': 'work_4_time',
+        'work_4_experience': 'work_4_experience',
+    }
+
+    def extract_int(self, s):
+        try:
+            return int(re.search(r'\d+', str(s)).group())
+        except (TypeError, ValueError, AttributeError):
+            return None
+
+    def parse_datetime(self, s):
+        try:
+            return datetime.fromisoformat(s)
+        except (TypeError, ValueError):
+            return datetime(2019, 12, 12)
+
+    def process_item(self, item, spider):
+        if spider.name != 'yutian_top':
+            return item
+        experience = item.get("experience", [])
+        for j in range(4):
+            if j < len(experience):
+                company = experience[j].get("company", "")
+                time_line = experience[j].get("time_line", "")
+                content = experience[j].get("content", "")
+            else:
+                company = ''
+                time_line = ''
+                content = ''
+
+            item[f"work_{j + 1}_experience"] = company
+            item[f"work_{j + 1}_time"] = time_line
+            item[f"work_{j + 1}_description"] = content
+
+        item = {
+            self.reverse_field_map[k]: v
+            for k, v in item.items()
+            if k in self.reverse_field_map
+        }
+
+        if "age" in item:
+            item["age"] = self.extract_int(item["age"])
+
+        if "height" in item:
+            item["height"] = self.extract_int(item["height"])
+
+        if "weight" in item:
+            item["weight"] = self.extract_int(item["weight"])
+
+        if "update_time" in item:
+            item["update_time"] = self.parse_datetime(item["update_time"])
+
+        item["source_id"] = 2
+
+        return item
+
+
+class YTSavePipeline:
+    def process_item(self, item, spider):
+        if spider.name not in ['yutian_top', 'zhrczp_com']:
+            return item
+        resume_id = item.get("resume_id")
+        if not resume_id:
+            raise DropItem("resume_id missing, item dropped")
+
+        try:
+            DB.insert_resume(item)
+        except Exception as e:
+            spider.logger.warning(f"Failed to save resume_id={resume_id}: {e}")
+
+        return item
+
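YTSpiderPipeline renames the raw API field names through reverse_field_map and flattens the nested experience list into the work_N_* columns; YTSavePipeline then guards on resume_id and hands the row to DB.insert_resume. A minimal smoke-test sketch of the first stage (the input values are hypothetical, made up to mirror one record from the yutian.top API):

```python
# Hypothetical smoke test for YTSpiderPipeline.process_item; requires the
# project on sys.path. The input dict mimics one resume record.
from types import SimpleNamespace
from TS_resume_spider.pipelines import YTSpiderPipeline

raw = {
    "resume_id": 12345,                          # illustrative values only
    "user_name": "Zhang San",
    "user_age": "28岁",
    "last_edit_time": "2025-04-19T21:30:00",
    "experience": [{"company": "Acme", "time_line": "2020-2023", "content": "QA"}],
}
out = YTSpiderPipeline().process_item(raw, SimpleNamespace(name="yutian_top"))
print(out["name"], out["age"], out["work_1_experience"])  # Zhang San 28 Acme
```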
self.extract_int(item["age"]) + + if "height" in item: + item["height"] = self.extract_int(item["height"]) + + if "weight" in item: + item["weight"] = self.extract_int(item["weight"]) + + if "update_time" in item: + item["update_time"] = self.parse_datetime(item["update_time"]) + + item["source_id"] = 2 + + return item + + +class YTSavePipeline: + def process_item(self, item, spider): + if spider.name not in ['yutian_top' ,'zhrczp_com']: + return item + resume_id = item.get("resume_id") + if not resume_id: + raise DropItem("⚠️ resume_id 缺失,已丢弃") + + try: + DB.insert_resume(item) + except Exception as e: + spider.logger.warning(f"❌ 写入失败:resume_id={resume_id}, 错误={e}") + + return item + diff --git a/TS_resume_spider/settings.py b/TS_resume_spider/settings.py new file mode 100644 index 0000000..6bab753 --- /dev/null +++ b/TS_resume_spider/settings.py @@ -0,0 +1,93 @@ +# Scrapy settings for TS_resume_spider project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "TS_resume_spider" + +SPIDER_MODULES = ["TS_resume_spider.spiders"] +NEWSPIDER_MODULE = "TS_resume_spider.spiders" + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +# USER_AGENT = "TS_resume_spider (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "TS_resume_spider.middlewares.TsResumeSpiderSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "TS_resume_spider.middlewares.TsResumeSpiderDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'TS_resume_spider.pipelines.YTSpiderPipeline': 300, + 'TS_resume_spider.pipelines.YTSavePipeline': 500, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# 
AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" diff --git a/TS_resume_spider/spiders/__init__.py b/TS_resume_spider/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/TS_resume_spider/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc b/TS_resume_spider/spiders/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..083e29c17c35f8ff7c1bcbb1ec7c0534df366bd4 GIT binary patch literal 174 zcmX@j%ge<81iwGuNL>h|AA>mP;4=$QWI977gC?WjN`@jPAn!9s%`aaUs~G3}l+@yw zfXb4L{Ja?FqQvr?)S{S>;P|4{;?msI_~L@h6l?+z@#2{H_{_Y_lK6PNg34bUHo5sJ br8%i~MXW&68G*PM#Q4a}$jDg43}gWStAa34 literal 0 HcmV?d00001 diff --git a/TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc b/TS_resume_spider/spiders/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccc770b1765a92df5649f10a130840c3f5c1de11 GIT binary patch literal 168 zcmYe~<>g`kg5RHSq%H)~k3k${zzF0x0C6!3kVs(&X3%8xTggxa5=Iced|j+!obyvs zi(>*ROEU8FVw{T-%X3nTVnTxBi&BeAb5rAs3o=u%2|&b)W8&j8^D;}~u(#!5#QrmZ_%ZZ10vL$CG(q7$P@rg0py)r)FD=p!hf9D4C~U{y=tqGzpE`S_Y}N6R zocry}&g|^$&hGH9ZEej6o|S)EbAN0_=wGz4``8=c*}DL?5Js30P^$iBQj7)7B5b*Z zu$7>^ZCB~#`@Xe~Q`RcuvpuGb823_)5YuQp8wRk22ud*+r7W09S+OO{q-@v<)Q)Yp zP|AVrK%G^G&uMfd%PP+n6H;lZ#A6~G%~g|;R;7~GzyXTGIaMtwZwG^wN+nS5L;1nH zlnHW@SYnIS;Pr4&CQ4Z(!4jJ#+^$d5wNCgQf#0)r;9dZCAqEV#U@Nv^J5UF9Vi#`0 z&A0`(;x^olJMMMdU~s4TapQfs>z@6FMSWot@5cx3Tc~M-BIv!JA#}saBixO9K4tL1 z4J**zECSkR(nBWgH|YTG1pc5&hp-3mVH0~zdITQ@&M^}o$8P{0Hn9(%0Q@G!S%gnw zf5SKMFpKC1IDvbuzhG|I)W&L|*RJIT$~)?Q@1@w^3r!r{U^Xmj)2<|M?x@{X{k>tk z--rTOf_?Vt^Q+MPhW(#2TMUCf{=|l==-1W?;|s^M!)&P}@Eoi1QZblUq~b}BO4ik2 zNnm-vCxR!mb}i?hj-AwUlh@9K0u!36DET>-%MoqxRpm)8$I1#(&nzd-`%i1#uiqq^ zqkJZO(!*<`ua^f&Q6+Lo<`vS|KxsvlWD*p3Mb*xFXC)~s5YKc@mPF#6yneE>Yoanli4CR*{3Pf?8cYH7acXrk3w3Lca<+!-$io+$_;StXeZyc!%IeJe0} z$}@anE)iWk=@Iw>@yrsgAo&_T2f^O}m;kFw>>4Zc4HaL9;b1wAlvJ&w{`L!OFs~Gi#>_;ZBp&YipU%_*jU=qoeFu*tMj(5F%Hm_&C3CVI_QtUz}dZWo9|r zo?q7H!!dqgA^@EPG6$XVGo!Icax9vdUymgU%Dh+#u}kx(=JVM~YIY?b)skb2tIL&m zq@c!Tlk3T)a4xpGkY8M#S&v1cS~QZ>qIoPP`FYSC&t+DZVW9c-cqF+V)3T${{LENv z31r0;Ep=sKja|J&<(23Ne8lyW=pQgH( zg^YNSTPOV{USXJ--i*py3W|o%bBU&?CR&#z;Dv;u1dTKnTM}uKgR8r!mOinI2IOfA_(6?|=O0Cx3W&^L?WQMj}~Bu2M4%({xj}*GB=kM{)bN5AJ;N z;KM(Ix?xqwy5Ry+m(!^{y!qvW&HILvSJFAcVxzSI_E@-g`{CBkqYuA)bo(R20oN6b zTBfQG{`C9Df4L>oJH*#&SZNqsKuk@e719XqvRq@*biFsx-+FdQOLa}qY3bUs8@z=$Lj zc%m4#jD)L(i(>^L!*T_(5LE{i3_ zu$Sm@4ZFyu5MfO|1#_3*0%9<{;ViLLL1M8&YZVWDzK=)VzB&!LuUx%Y1*eKvr)9Px 
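ITEM_PIPELINES is the only non-default setting that shapes the data flow: lower numbers run closer to the spider, so every yielded item passes through YTSpiderPipeline (300) before YTSavePipeline (500) writes it out. A small stand-alone sketch of that ordering contract, with stand-in classes (hypothetical, not from the patch):

```python
# Sketch of how Scrapy drives ITEM_PIPELINES: components are invoked in
# ascending priority order, each process_item feeding the next stage.
class NormalizeStage:            # stands in for YTSpiderPipeline (300)
    def process_item(self, item, spider):
        item["age"] = int(item["age"])
        return item

class SaveStage:                 # stands in for YTSavePipeline (500)
    def process_item(self, item, spider):
        print("would persist:", item)
        return item

item = {"age": "28"}
for _, stage in sorted([(300, NormalizeStage()), (500, SaveStage())]):
    item = stage.process_item(item, spider=None)
```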
diff --git a/TS_resume_spider/spiders/yutian_top.py b/TS_resume_spider/spiders/yutian_top.py
new file mode 100644
index 0000000..8d85f95
--- /dev/null
+++ b/TS_resume_spider/spiders/yutian_top.py
@@ -0,0 +1,85 @@
+import scrapy
+import json
+
+
+class YutianTopSpider(scrapy.Spider):
+    name = 'yutian_top'
+    allowed_domains = ['yutian.top']
+    start_urls = ['https://www.yutian.top/job/company/v1/resume/page']
+
+    def start_requests(self):
+        headers = {
+            'accept': 'application/json, text/plain, */*',
+            'accept-language': 'zh-CN,zh;q=0.9',
+            'cache-control': 'no-cache',
+            'content-type': 'application/json;charset=UTF-8',
+            'origin': 'https://www.yutian.top',
+            'pragma': 'no-cache',
+            'priority': 'u=1, i',
+            'referer': 'https://www.yutian.top/enterprise/resume_store/list',
+            'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
+            'sec-ch-ua-mobile': '?0',
+            'sec-ch-ua-platform': '"Windows"',
+            'sec-fetch-dest': 'empty',
+            'sec-fetch-mode': 'cors',
+            'sec-fetch-site': 'same-origin',
+            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+        }
+
+        cookies = {
+            'company_sign': '',
+            'company_nonce': '',
+            'cuid': '',
+            'PHPSESSID': '210b19c9d51dbf8eec8e8ffb0540ad33',
+            'auth-token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NDY4MTIxNTksImp0aSI6IjgwZGVjMzY4LWUwODktNGYxYS1hNWJjLWExNDMzMDYzMjdmYiIsIm5hbWUiOiIxODYxNzg3MjE4NSIsInVzZXJfaWQiOiIwM2M2MmI5ODM4Yjk3Y2UzYmQxZTQwNDllZGVlNmI0OCIsInRlbmFudF90b2tlbiI6IjY1OTAxM2RlNjAxZmJmNjg1MzZmYTU0OTc4ODVkMTA2In0.0rXFe1iQClJ33rgXnTjhmye3zqVEZkJQvHGGET9dsz0',
+        }
+
+        for i in range(1, 6):
+            payload = {
+                'step': 1000,
+                'page': i,
+                'education_level': [],
+                'arrival_time': [],
+                'work_time': [],
+                'area_id': [],
+                'keywords': '',
+                'work_status': '',
+                'work_status_show': '求职状态',
+                'category_id': '',
+                'work_type': '',
+                'work_type_show': '是否兼职',
+                'sex': '',
+                'sex_show': '性别',
+                'is_head': '',
+                'is_head_show': '有无照片',
+                'job_id': '',
+                'age': [],
+                'age_show': '年龄',
+                'refresh_time': 0,
+                'site_id': '',
+                'site_id2': '',
+                'province': '',
+                'city': '',
+                'county': '',
+                'provinceArr': [],
+                'cityArr': [],
+                'countyArr': [],
+                'only_job_category': 0,
+            }
+
+            yield scrapy.Request(
+                url=self.start_urls[0],
+                method='POST',
+                headers=headers,
+                cookies=cookies,
+                body=json.dumps(payload),
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        self.logger.debug("resume page status: %s", response.status)
+        data = json.loads(response.text)
+        for item in data.get('data', []):
+            yield item
\ No newline at end of file
diff --git a/TS_resume_spider/spiders/zhrczp_com.py b/TS_resume_spider/spiders/zhrczp_com.py
new file mode 100644
index 0000000..9946ca3
--- /dev/null
+++ b/TS_resume_spider/spiders/zhrczp_com.py
@@ -0,0 +1,148 @@
+import re
+import urllib.parse
+from typing import Iterable
+import scrapy
+from scrapy import Request
+
+
+class ZunHuaComSpider(scrapy.Spider):
+    name = 'zhrczp_com'
+    allowed_domains = ['zhrczp.com']
+    start_urls = ['https://www.zhrczp.com/member/index.php']
+    cookies = {
+        'Hm_lvt_115013d5b34e45eb09d0baedeb1c845a': '1745062179',
+        'HMACCOUNT': 'B05D7338A384928F',
+        'Hm_lpvt_115013d5b34e45eb09d0baedeb1c845a': '1745062980',
+        'PHPSESSID': 'f2o89gakk79jl43hcl4ptnea3r',
+        'uid': '60531',
+        'shell': '9246a8c91784a3981081a37dd4bdcef9',
+        'usertype': '2',
+        'userdid': '0',
+        'amtype': '0',
+        'jobrefresh': '1',
+        'gzh': '1',
+        'acw_tc': '1a0c63d517450682931821154e003e6b210262ee0f2d393aa4e3b2a163053b',
+        'pc_bannerFlag': '1',
+    }
+    headers = {
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Pragma': 'no-cache',
+        'Sec-Fetch-Dest': 'document',
+        'Sec-Fetch-Mode': 'navigate',
+        'Sec-Fetch-Site': 'none',
+        'Sec-Fetch-User': '?1',
+        'Upgrade-Insecure-Requests': '1',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36',
+        'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+    }
+
+    def start_requests(self) -> Iterable[Request]:
+        for page in range(1, 251):
+            params = {
+                'c': 'resume',
+                'page': str(page),
+            }
+            query_string = urllib.parse.urlencode(params)
+            full_url = f"{self.start_urls[0]}?{query_string}"
+            yield scrapy.Request(
+                url=full_url,
+                method='GET',
+                headers=self.headers,
+                cookies=self.cookies,
+                callback=self.parse,
+            )
+
+    def parse(self, response):
+        self.logger.debug("resume list status: %s", response.status)
+        html = response.text
+        res = re.findall(r"com_lookresume_check\('(.+?)','1'\)", html)
+        resume_id_list = list(set(res))
+        for item in resume_id_list:
+            params = {
+                'c': 'hr',
+                'act': 'resumeInfo',
+                'eid': item,
+                'state': 'undefined',
+                'from': '',
+            }
+            query_string = urllib.parse.urlencode(params)
+            full_url = f"{self.start_urls[0]}?{query_string}"
+            yield scrapy.Request(
+                url=full_url,
+                method='GET',
+                headers=self.headers,
+                cookies=self.cookies,
+                callback=self.parse2,
+                meta={'resume_id': item},
+            )
+
+    def parse2(self, response):
+        resume_id = response.meta.get('resume_id')
+
+        parts_raw = response.xpath('//div[@class="hr_resume_item"]/text()').get()
+        extra_span = response.xpath('//div[@class="hr_resume_item"]/span/text()').get()
+
+        parts = []
+        if parts_raw:
+            cleaned = re.sub(r'\s+', ' ', parts_raw).strip()
+            parts = [p.strip() for p in cleaned.split('·') if p.strip()]
+        if extra_span:
+            parts.append(extra_span.strip())
+
+        current_location = ''
+        if parts and '现居' in parts[-1]:
+            current_location = parts[-1]
+            parts = parts[:-1]
+
+        text = " ".join(parts)
+        age = re.search(r'(\d{2})岁', text)
+        height = re.search(r'(\d{2,3})\s*cm', text, re.I)
+        weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
+        experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)
+        education = re.search(r'(初中|高中|中专|大专|本科|硕士|博士)', text)
+        marital = re.search(r'(已婚|未婚)', text)
+        ethnic = re.search(r'(汉|满|回|壮|蒙古)', text)
+
+        # XPath extraction of the labelled page fields
+        name = response.xpath('//span[@class="hr_resume_username"]/text()').get()
+        update_time_raw = response.xpath('//span[@class="hr_resume_time_l "]/text()').get()
+        update_time = re.sub(r'^更新时间[::]?', '', update_time_raw).strip() if update_time_raw else ''
+
+        job_funcs = response.xpath('//span[@class="yun_newedition_yx_job"]/text()').getall()
+        job_titles = response.xpath('//li[span[contains(text(),"意向岗位")]]/text()').get()
+        industry = response.xpath('//li[span[contains(text(),"从事行业")]]/text()').get()
+        salary = response.xpath('//li[span[contains(text(),"期望薪资")]]/text()').get()
+        report_time = response.xpath('//li[span[contains(text(),"到岗时间")]]/text()').get()
+        job_type = response.xpath('//li[span[contains(text(),"工作性质")]]/text()').get()
+        job_status = response.xpath('//li[span[contains(text(),"求职状态")]]/text()').get()
+        location = response.xpath('//li[span[contains(text(),"工作地点")]]/text()').get()
+        yield {
+            'resume_id': resume_id,
+            'name': name.strip() if name else None,
+            'age': age.group(1) if age else None,
+            'height': height.group(1) if height else None,
+            'weight': weight.group(1) if weight else None,
+            'work_years': experience.group(1) if experience else None,
+            'education': education.group(1) if education else None,
+            'marital_status': marital.group(1) if marital else None,
+            'ethnicity': ethnic.group(1) if ethnic else None,
+            'current_location': current_location.replace('现居', '').strip() if current_location else None,
+            'update_time': update_time or None,
+            'job_function': ', '.join([j.strip() for j in job_funcs]) if job_funcs else None,
+            'intended_position': job_titles.strip() if job_titles else None,
+            'industry': industry.strip() if industry else None,
+            'expected_salary': salary.strip() if salary else None,
+            'available_time': report_time.strip() if report_time else None,
+            'job_property': job_type.strip() if job_type else None,
+            'job_status': job_status.strip() if job_status else None,
+            'job_location': location.strip() if location else None,
+            'source_id': 1,
+        }
diff --git a/TS_resume_spider/utils/__init__.py b/TS_resume_spider/utils/__init__.py
new file mode 100644
index 0000000..e69de29
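parse2 in zhrczp_com.py above pulls the '·'-separated summary line apart with small regexes before reading the labelled XPath fields. A sketch of those patterns against an assumed sample string in the page's format (illustrative text, not scraped data):

```python
# Illustrative check of the summary-line regexes used in parse2.
import re

text = "25岁 170cm 60kg 3-5年 本科 未婚"  # assumed sample in the page's format
age = re.search(r'(\d{2})岁', text)
height = re.search(r'(\d{2,3})\s*cm', text, re.I)
weight = re.search(r'(\d{2,3})\s*(kg|公斤)', text, re.I)
experience = re.search(r'(无经验|1年以下|\d{1,2}-\d{1,2}年|(?:\d{1,2})年以上|(?:\d{1,2})年经验)', text)

print(age.group(1), height.group(1), weight.group(1), experience.group(1))
# -> 25 170 60 3-5年
```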
diff --git a/TS_resume_spider/utils/db.py b/TS_resume_spider/utils/db.py
new file mode 100644
index 0000000..834c6f9
--- /dev/null
+++ b/TS_resume_spider/utils/db.py
@@ -0,0 +1,66 @@
+from datetime import datetime
+
+import pymysql
+
+
+class MySQLClient:
+    def __init__(self, host, user, password, db, port=3306):
+        self.conn = pymysql.connect(
+            host=host,
+            user=user,
+            password=password,
+            db=db,
+            port=port,
+            charset='utf8mb4',
+            cursorclass=pymysql.cursors.DictCursor,
+            autocommit=True
+        )
+        self.cursor = self.conn.cursor()
+
+    def execute(self, sql, values=None):
+        try:
+            self.cursor.execute(sql, values or [])
+
+        except Exception as e:
+            print(f"[MySQL] execution failed: {e}")
+            self.conn.rollback()
+
+    def __del__(self):
+        try:
+            self.cursor.close()
+            self.conn.close()
+        except Exception:
+            pass
+
+
+class DB:
+    _client: MySQLClient = None  # class attribute holding the shared connection
+
+    @classmethod
+    def init(cls):
+        if cls._client is None:
+            cls._client = MySQLClient(
+                host='39.101.135.56',
+                user='tsreshub_prod',
+                password='Tr5h$Prod!92@TsRH',
+                db='tsreshub_db',
+                port=3306
+            )
+
+    @classmethod
+    def insert_resume(cls, data: dict):
+        cls.init()  # make sure the connection is initialized
+
+        safe_data = {k: v for k, v in data.items() if isinstance(v, (str, int, float, type(None), datetime))}
+
+        table = 'resumes_resumebasic'
+        keys = ', '.join(safe_data.keys())
+        placeholders = ', '.join(['%s'] * len(safe_data))
+        update_clause = ', '.join([f"{k} = VALUES({k})" for k in safe_data if k != 'resume_id'])
+
+        sql = f"""
+            INSERT INTO {table} ({keys}) VALUES ({placeholders})
+            ON DUPLICATE KEY UPDATE {update_clause}
+        """
+
+        cls._client.execute(sql, list(safe_data.values()))
diff --git a/debug/Debug_yutian_top.py b/debug/Debug_yutian_top.py
new file mode 100644
index 0000000..a190a37
--- /dev/null
+++ b/debug/Debug_yutian_top.py
@@ -0,0 +1,17 @@
+# debug/Debug_yutian_top.py
+import sys
+import os
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(project_root)
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+from TS_resume_spider.spiders.yutian_top import YutianTopSpider
+from TS_resume_spider.spiders.zhrczp_com import ZunHuaComSpider
+
+def main():
+    process = CrawlerProcess(get_project_settings())
+    process.crawl(ZunHuaComSpider)
+    process.start()
+
+if __name__ == '__main__':
+    main()
diff --git a/debug/__init__.py b/debug/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..e3798f2
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = TS_resume_spider.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = TS_resume_spider
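DB.insert_resume builds a MySQL upsert, so a re-crawled resume updates its existing row instead of inserting a duplicate. This assumes resumes_resumebasic has a unique index on resume_id (or another column set); without one, ON DUPLICATE KEY UPDATE never fires. A sketch of the statement generated for a minimal two-field item:

```python
# Sketch: the upsert DB.insert_resume would generate for a minimal item.
safe_data = {"resume_id": 12345, "name": "Zhang San"}  # illustrative values
keys = ", ".join(safe_data)
placeholders = ", ".join(["%s"] * len(safe_data))
update_clause = ", ".join(f"{k} = VALUES({k})" for k in safe_data if k != "resume_id")

print(f"INSERT INTO resumes_resumebasic ({keys}) VALUES ({placeholders}) "
      f"ON DUPLICATE KEY UPDATE {update_clause}")
# INSERT INTO resumes_resumebasic (resume_id, name) VALUES (%s, %s)
#   ON DUPLICATE KEY UPDATE name = VALUES(name)
```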