import csv
import json
import os
import re
import time

import requests

# Browser fingerprint values shared by every request this scraper sends.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36'
)
_SEC_CH_UA = '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"'

# Captures the value of "gdpClientCache" embedded in a home-details page.
_GDP_CACHE_RE = re.compile(r'\"gdpClientCache\":(.*?),\"composedGraphQLQuery\"')


class Zillow:
    """Scrape Zillow search results and append one CSV row per home.

    Typical use is ``get_search_page_info()`` followed by ``json_parsing()``,
    which calls ``get_homedetails()`` for every listing and appends the
    extracted fields to ``data.csv`` in the current working directory.

    Attributes:
        baseurl: Scheme and host prepended to every relative URL.
        proxies: Proxy mapping passed to ``requests``.
        cookies: Cookie dict sent with every request; refreshed in place.
        timeout: Per-request timeout in seconds.
        csvdata: List whose first element is the CSV header row.
    """

    def __init__(self, cookies=None, proxies=None):
        """Set up session state and create ``data.csv`` with a header row.

        Args:
            cookies: Optional cookie dict to use instead of the built-in one.
            proxies: Optional ``requests`` proxy mapping to use instead of
                the built-in local proxy on port 7890.
        """
        self.baseurl = "https://www.zillow.com"
        self.timeout = 30  # seconds; without it a stalled request blocks forever
        if proxies is not None:
            self.proxies = proxies
        else:
            self.proxies = {
                "http": "http://127.0.0.1:7890",
                "https": "http://127.0.0.1:7890",
            }
        # NOTE(security): the default below is a captured, logged-in browser
        # session committed to source control. Prefer passing ``cookies``
        # loaded from a private location, and rotate this session.
        if cookies is not None:
            self.cookies = cookies
        else:
            self.cookies = {
                'zguid': '24|%24e835e7de-1e03-40a2-9e0b-ea6557fce975',
                'zgsession': '1|5a92d272-1f0a-4eab-aad9-0b01bfc4e12e',
                'pxcts': '4efbe577-f671-11ef-944c-1da0cdd7af45',
                '_pxvid': '4efbd04b-f671-11ef-944a-e6989223c886',
                'zjs_anonymous_id': '%22e835e7de-1e03-40a2-9e0b-ea6557fce975%22',
                'zg_anonymous_id': '%224b8a74fb-12d3-4dd5-b453-91d2f059cc5c%22',
                '_ga': 'GA1.2.1116450831.1740815202',
                '_gid': 'GA1.2.1210142068.1740815202',
                '_gcl_au': '1.1.1930799100.1740815207',
                '_scid': '3cxthBM96c4ODxO1v0Al3ufRKQFwF0wr',
                'DoubleClickSession': 'true',
                '_ScCbts': '%5B%5D',
                '_pin_unauth': 'dWlkPU4yRmtNVGhtTmpVdE5tWmtaUzAwWXpnNUxUbGtPRGt0WkRrek1UTm1OREUyTTJJdw',
                '_tt_enable_cookie': '1',
                '_ttp': '01JN8AR8CC889QQ2N91WJNGVRG_.tt.1',
                '_fbp': 'fb.1.1740815213095.801729201273859650',
                '_sctr': '1%7C1740758400000',
                '_clck': '1gi6cgn%7C2%7Cftu%7C0%7C1886',
                '_lr_env_src_ats': 'false',
                'g_state': '{"i_l":0}',
                'loginmemento': '1|38bd958cf99c0efa6fcdf2d8b4ba656e18214f94b1d0b8482177e5821d485cb5',
                'userid': 'X|3|516ce03fb1857a73%7C9%7CtUJB3zsjokPFJzXVcdK9kYKiV6BT9Hom',
                'zjs_user_id': '%22X1-ZUqbce6dl8y0w9_1oeg0%22',
                '_derived_epik': 'dj0yJnU9YzhHUUI3UjgwVTYyektsUXlURWZ5UnNPZGpPTDBwUi0mbj01MkNxOF9nSm1udHEwczFUVGoyM0p3Jm09NCZ0PUFBQUFBR2ZDeDJRJnJtPTQmcnQ9QUFBQUFHZkN4MlEmc3A9Mg',
                'tfpsi': 'cf8acdc2-7676-40ea-810c-a9d9613cd3ac',
                'ZILLOW_SID': '1|AAAAAVVbFRIBVVsVEuEBjRdhLdPgArRt9zTF9A9GUy9n6qhArfRCvbRpWcUyvOX17mDKQCRGzog4qedbe0aFqFnGaQzFa2AVHA',
                'JSESSIONID': '2AB46AF163E850A6A93A55EEAFBFA961',
                '_px3': '2b2ceb1e9473516a5889d6b089882f2df95563f843864b748551a222e3a93bd6:Re6tLyXWJyuSifQZrCVdgU+HEPF8Ih1TmBAyFOt2qBZwjYXKAdScUmpmzzTUHclqZwD1vCAkP6Pow8uPxHGEBw==:1000:8rliApJL6+buJDD0QJN5jVM3zTksVkovJZ9PzmBfy8r178UA1kmnQ3o8GCOS0s/wmN0v5TATeetgtPpn8ZgJH/nA3DeEY9emAKW8bwwCOzP2xxF8rOEN4IYvM48vCdaEhFnLMTOWUR5ZhrMtnuinXvnFPpcsoTdiVsjPiPUunfMbpjRVUSEeRGfrO2BvQBnqh1bPhqq4CXReAdRoEbUdIoO+jZ250vTxbnHyWHDnnsk=',
                '_rdt_uuid': '1740815209620.81e8d67a-5a8f-489d-844a-6eb51bea084d',
                '_scid_r': '7ExthBM96c4ODxO1v0Al3ufRKQFwF0wrejgyEw',
                'search': '6|1743432649336%7Crect%3D40.945233052704474%2C-73.83509060791016%2C40.845317795985295%2C-73.98271939208985%26rid%3D17182%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26singlestory%3D0%26housing-connector%3D0%26parking-spots%3Dnull-%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26showcase%3D0%26featuredMultiFamilyBuilding%3D0%26onlyRentalStudentHousingType%3D0%26onlyRentalIncomeRestrictedHousingType%3D0%26onlyRentalMilitaryHousingType%3D0%26onlyRentalDisabledHousingType%3D0%26onlyRentalSeniorHousingType%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0917182%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
                '_clsk': 'uwbs56%7C1740840661345%7C2%7C0%7Cr.clarity.ms%2Fcollect',
                'AWSALB': 'SVlf4eVEwg9mRsnpIwRyN+xS9wd8r/tNawixFSnBEf7jkmDoA1qS5ygg/3YxMalClDa1hkwxgRsZsha0aX46GGoXENyABk7jsy4YM0+YnjOiQ4CnF/ZFkf377PDR',
                'AWSALBCORS': 'SVlf4eVEwg9mRsnpIwRyN+xS9wd8r/tNawixFSnBEf7jkmDoA1qS5ygg/3YxMalClDa1hkwxgRsZsha0aX46GGoXENyABk7jsy4YM0+YnjOiQ4CnF/ZFkf377PDR',
                '_uetsid': '566607f0f67111ef9e8f315a86932718',
                '_uetvid': '5665f370f67111efae377d2348e45d1f',
            }
        self.csvdata = [
            [
                'zpid',
                'price',  # listing price
                'yearBuilt',  # year built
                'streetAddress',  # street address
                'bedrooms',  # number of bedrooms
                'bathrooms',  # number of bathrooms
                'responsivePhotos',  # photo URLs, '|'-separated
                'timeOnZillow',  # time listed on Zillow
                'pageViewCount',  # page views
                'favoriteCount',  # number of saves
                'phrases',  # insight tags, '|'-separated
                'description',  # listing description
            ],
        ]
        # Only write the header when starting a fresh file; rows are appended.
        if not os.path.exists('data.csv'):
            with open('data.csv', 'w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(self.csvdata[0])
        # If the session cookie has expired, call
        # update_cookies_ZILLOW_SID() before scraping.

    @staticmethod
    def _ajax_headers(referer=None, json_body=False):
        """Build the header set used for same-origin XHR-style requests.

        Args:
            referer: Optional ``referer`` header value.
            json_body: When true, add the ``content-type`` and ``origin``
                headers sent with the JSON search request.
        """
        headers = {
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
        }
        if json_body:
            headers['content-type'] = 'application/json'
            headers['origin'] = 'https://www.zillow.com'
        headers['pragma'] = 'no-cache'
        headers['priority'] = 'u=1, i'
        if referer:
            headers['referer'] = referer
        headers.update({
            'sec-ch-ua': _SEC_CH_UA,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': _USER_AGENT,
        })
        return headers

    def _store_zillow_sid(self, response):
        """Copy a fresh ``ZILLOW_SID`` from *response* into ``self.cookies``.

        Reads the parsed cookie jar rather than splitting the raw
        ``set-cookie`` header, so an unrelated cookie is never stored under
        the ``ZILLOW_SID`` key and values containing ``=`` are not truncated.

        Returns:
            The new session id, or ``None`` if the response did not set one.
        """
        sid = response.cookies.get_dict().get('ZILLOW_SID')
        if sid:
            self.cookies['ZILLOW_SID'] = sid
        return sid

    def get_JSESSIONID(self):
        """Refresh ``self.cookies['JSESSIONID']`` from the server.

        Calls the "mark property viewed" endpoint and stores the
        ``x-requested-session`` response header as the new ``JSESSIONID``.
        The cookie is left untouched when the header is absent.

        Raises:
            requests.RequestException: If the HTTP request fails.
        """
        headers = self._ajax_headers(
            referer='https://www.zillow.com/homedetails/4705-Henry-Hudson-Pkwy-APT-6B-Bronx-NY-10471/244446711_zpid/',
        )
        params = {
            'zpid': '244446711',
        }
        response = requests.get(
            'https://www.zillow.com/ajax/homedetail/MarkPropertyViewed.htm',
            params=params,
            cookies=self.cookies,
            headers=headers,
            proxies=self.proxies,  # consistent with every other request
            timeout=self.timeout,
        )
        session_id = response.headers.get('x-requested-session')
        print(session_id)
        if session_id:
            self.cookies['JSESSIONID'] = session_id
        print(response.headers.get('set-cookie'))

    def update_cookies_ZILLOW_SID(self):
        """Refresh ``self.cookies['ZILLOW_SID']`` via the feature-flags API.

        The cookie is left untouched when the response sets no
        ``ZILLOW_SID``.

        Raises:
            requests.RequestException: If the HTTP request fails.
        """
        headers = self._ajax_headers(
            referer='https://www.zillow.com/homedetails/3531-Bronxwood-Ave-APT-3F-Bronx-NY-10469/2053005716_zpid/',
        )
        params = {
            'featureFlags': [
                'SHOPPING_ZOAM_MIGRATION_FOR_SALE_HDP',
                'CIAM_ZOAM_MIGRATION_GLOBAL',
            ],
        }
        response = requests.get(
            'https://www.zillow.com/api/user/featureFlags',
            params=params,
            cookies=self.cookies,
            headers=headers,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        print(self._store_zillow_sid(response))

    def get_search_page_info(self):
        """Request map search results for the hard-coded search area.

        Returns:
            The decoded JSON body of the search response.

        Raises:
            requests.RequestException: If the request fails or the body is
                not valid JSON.
        """
        url = "/async-create-search-page-state"
        # No referer is sent with this request.
        headers = self._ajax_headers(json_body=True)
        payload = {
            'searchQueryState': {
                'pagination': {},
                'isMapVisible': True,
                'mapBounds': {
                    'west': -73.99487916636572,
                    'east': -73.69962159800635,
                    'south': 40.79945099712579,
                    'north': 40.97620399390861,
                },
                'mapZoom': 12,
                'regionSelection': [
                    {
                        'regionId': 6181,
                        'regionType': 6,
                    },
                ],
                'filterState': {
                    'sortSelection': {
                        'value': 'globalrelevanceex',
                    },
                },
                'isListVisible': True,
            },
            'wants': {
                'cat1': [
                    'mapResults',
                ],
            },
            'requestId': 2,
            'isDebugRequest': False,
        }
        response = requests.put(
            url=self.baseurl + url,
            cookies=self.cookies,
            headers=headers,
            json=payload,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        return response.json()

    def json_parsing(self, search_json):
        """Scrape every home-details listing found in a search response.

        Args:
            search_json: Value returned by ``get_search_page_info()``.
                Missing or empty sections are treated as "no results".
        """
        cat1 = (search_json or {}).get("cat1") or {}
        search_results = cat1.get("searchResults") or {}
        for listing in search_results.get("mapResults") or []:
            detail_url = listing.get("detailUrl")
            # Only URLs containing "homedetails" are scraped.
            if detail_url and "homedetails" in detail_url:
                self.get_homedetails(detail_url)

    @staticmethod
    def _parse_property(raw_cache):
        """Extract the ``property`` dict from the captured cache text.

        The captured text appears to be a JSON string literal whose content
        is itself a JSON object mapping a query key to ``{"property": ...}``.
        Decoding it with ``json.loads`` keeps non-ASCII characters intact,
        unlike a ``unicode_escape`` round-trip. If that layout does not
        hold, the legacy text-splitting extraction is used as a fallback.

        Raises:
            ValueError: If neither strategy yields valid JSON.
            AttributeError: If the fallback JSON lacks the expected shape.
        """
        try:
            cache = json.loads(raw_cache)
            if isinstance(cache, str):
                cache = json.loads(cache)
        except ValueError:
            cache = None
        if isinstance(cache, dict):
            found = [
                entry["property"]
                for entry in cache.values()
                if isinstance(entry, dict) and isinstance(entry.get("property"), dict)
            ]
            if found:
                # The legacy code used the last occurrence; keep that choice.
                return found[-1]
        # Legacy fallback: strip one escaping level, rebuild the JSON around
        # the last "property" object and drop the trailing quote.
        unescaped = raw_cache.encode('utf-8').decode('unicode_escape')
        rebuilt = ('{"data":{"property":' + unescaped.split(':{"property":')[-1])[:-1]
        return json.loads(rebuilt).get("data").get("property")

    @staticmethod
    def _build_row(jd):
        """Turn a ``property`` dict into a row matching the CSV header."""
        photo_urls = []
        for photo in jd.get("responsivePhotos") or []:
            jpegs = (photo.get("mixedSources") or {}).get("jpeg") or []
            # The last JPEG variant of each photo is the one recorded.
            if jpegs and jpegs[-1].get("url"):
                photo_urls.append(jpegs[-1]["url"])

        phrases = []
        insight_groups = jd.get("homeInsights") or []
        if insight_groups:
            insights = insight_groups[0].get("insights") or []
            if insights:
                phrases = insights[0].get("phrases") or []

        return [
            jd.get("zpid"),
            jd.get("price"),
            jd.get("yearBuilt"),
            (jd.get("address") or {}).get("streetAddress"),
            jd.get("bedrooms"),
            jd.get("bathrooms"),
            "|".join(photo_urls),
            jd.get('timeOnZillow'),
            jd.get('pageViewCount'),
            jd.get('favoriteCount'),
            "|".join(phrases),
            jd.get("description"),
        ]

    def get_homedetails(self, detailUrl, r=3):
        """Fetch one home-details page and append its data to ``data.csv``.

        Each failed attempt (request error, or page without the embedded
        cache) waits two seconds, refreshes ``JSESSIONID`` and retries. The
        URL is appended to ``error_list.txt`` on the first failure.

        Args:
            detailUrl: Path of the details page, relative to ``baseurl``.
            r: Maximum number of attempts.

        Returns:
            None. The result is written to ``data.csv``.
        """
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'priority': 'u=0, i',
            'sec-ch-ua': _SEC_CH_UA,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': _USER_AGENT,
        }
        page_url = self.baseurl + detailUrl
        response = None
        raw_cache = None

        for attempt in range(max(r, 0)):
            response = None
            try:
                response = requests.get(
                    url=page_url,
                    cookies=self.cookies,
                    headers=headers,
                    proxies=self.proxies,
                    timeout=self.timeout,
                )
            except requests.RequestException as exc:
                print(exc)
            else:
                match = _GDP_CACHE_RE.search(response.text)
                if match:
                    raw_cache = match.group(1)
                    break

            # Failure path: back off, report, refresh the session, retry.
            time.sleep(2)
            if response is not None:
                print(response.status_code)
            print(page_url)
            try:
                self.get_JSESSIONID()
            except requests.RequestException as exc:
                print(exc)
            if attempt == 0:
                # Append so earlier failures are not overwritten.
                with open('error_list.txt', 'a', encoding='utf-8') as f:
                    f.write(page_url + '\n')
        else:
            # Runs when every attempt failed (or r <= 0).
            print("重试最大次数!")
            return

        self._store_zillow_sid(response)

        try:
            jd = self._parse_property(raw_cache)
        except (ValueError, AttributeError) as exc:
            print(f"Failed to parse property data for {page_url}: {exc}")
            return
        if not jd:
            print(f"No property data found for {page_url}")
            return

        new_row = self._build_row(jd)
        with open('data.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(new_row)
        print(f"成功保存:{new_row[0]}")


if __name__ == '__main__':
    Z = Zillow()
    search_data = Z.get_search_page_info()
    Z.json_parsing(search_data)