# File-viewer metadata captured with this source: 284 lines, 13 KiB, Python.
import csv
import json
import os
import re
import time
from urllib.parse import urljoin

import requests


class Zillow:
    """Scrape Zillow listing details for a hard-coded map search into a CSV file.

    Typical use: ``get_search_page_info()`` fetches the map search results
    (region id 6181 with fixed map bounds), and ``json_parsing()`` walks
    them, calling ``get_homedetails()`` for every listing; each successful
    call appends one row to ``data.csv``.

    Attributes:
        baseurl: Scheme and host that relative listing paths are resolved
            against.
        proxies: ``requests``-style proxy mapping used for every request.
        cookies: Cookie mapping sent with every request; ``JSESSIONID`` and
            ``ZILLOW_SID`` are refreshed in place.
        csvdata: One-element list holding the CSV header row.
    """

    REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled proxy hangs forever
    RETRY_DELAY = 2  # seconds to wait between two attempts on the same listing
    DATA_FILE = 'data.csv'
    ERROR_FILE = 'error_list.txt'

    DEFAULT_PROXIES = {
        "http": "http://127.0.0.1:7890",
        "https": "http://127.0.0.1:7890",
    }

    # NOTE(review): these are captured browser session cookies (including
    # login tokens). They are credentials and they expire; prefer passing
    # fresh values through the ``cookies`` constructor argument.
    DEFAULT_COOKIES = {
        'zguid': '24|%24e835e7de-1e03-40a2-9e0b-ea6557fce975',
        'zgsession': '1|5a92d272-1f0a-4eab-aad9-0b01bfc4e12e',
        'pxcts': '4efbe577-f671-11ef-944c-1da0cdd7af45',
        '_pxvid': '4efbd04b-f671-11ef-944a-e6989223c886',
        'zjs_anonymous_id': '%22e835e7de-1e03-40a2-9e0b-ea6557fce975%22',
        'zg_anonymous_id': '%224b8a74fb-12d3-4dd5-b453-91d2f059cc5c%22',
        '_ga': 'GA1.2.1116450831.1740815202',
        '_gid': 'GA1.2.1210142068.1740815202',
        '_gcl_au': '1.1.1930799100.1740815207',
        '_scid': '3cxthBM96c4ODxO1v0Al3ufRKQFwF0wr',
        'DoubleClickSession': 'true',
        '_ScCbts': '%5B%5D',
        '_pin_unauth': 'dWlkPU4yRmtNVGhtTmpVdE5tWmtaUzAwWXpnNUxUbGtPRGt0WkRrek1UTm1OREUyTTJJdw',
        '_tt_enable_cookie': '1',
        '_ttp': '01JN8AR8CC889QQ2N91WJNGVRG_.tt.1',
        '_fbp': 'fb.1.1740815213095.801729201273859650',
        '_sctr': '1%7C1740758400000',
        '_clck': '1gi6cgn%7C2%7Cftu%7C0%7C1886',
        '_lr_env_src_ats': 'false',
        'g_state': '{"i_l":0}',
        'loginmemento': '1|38bd958cf99c0efa6fcdf2d8b4ba656e18214f94b1d0b8482177e5821d485cb5',
        'userid': 'X|3|516ce03fb1857a73%7C9%7CtUJB3zsjokPFJzXVcdK9kYKiV6BT9Hom',
        'zjs_user_id': '%22X1-ZUqbce6dl8y0w9_1oeg0%22',
        '_derived_epik': 'dj0yJnU9YzhHUUI3UjgwVTYyektsUXlURWZ5UnNPZGpPTDBwUi0mbj01MkNxOF9nSm1udHEwczFUVGoyM0p3Jm09NCZ0PUFBQUFBR2ZDeDJRJnJtPTQmcnQ9QUFBQUFHZkN4MlEmc3A9Mg',
        'tfpsi': 'cf8acdc2-7676-40ea-810c-a9d9613cd3ac',
        'ZILLOW_SID': '1|AAAAAVVbFRIBVVsVEuEBjRdhLdPgArRt9zTF9A9GUy9n6qhArfRCvbRpWcUyvOX17mDKQCRGzog4qedbe0aFqFnGaQzFa2AVHA',
        'JSESSIONID': '2AB46AF163E850A6A93A55EEAFBFA961',
        '_px3': '2b2ceb1e9473516a5889d6b089882f2df95563f843864b748551a222e3a93bd6:Re6tLyXWJyuSifQZrCVdgU+HEPF8Ih1TmBAyFOt2qBZwjYXKAdScUmpmzzTUHclqZwD1vCAkP6Pow8uPxHGEBw==:1000:8rliApJL6+buJDD0QJN5jVM3zTksVkovJZ9PzmBfy8r178UA1kmnQ3o8GCOS0s/wmN0v5TATeetgtPpn8ZgJH/nA3DeEY9emAKW8bwwCOzP2xxF8rOEN4IYvM48vCdaEhFnLMTOWUR5ZhrMtnuinXvnFPpcsoTdiVsjPiPUunfMbpjRVUSEeRGfrO2BvQBnqh1bPhqq4CXReAdRoEbUdIoO+jZ250vTxbnHyWHDnnsk=',
        '_rdt_uuid': '1740815209620.81e8d67a-5a8f-489d-844a-6eb51bea084d',
        '_scid_r': '7ExthBM96c4ODxO1v0Al3ufRKQFwF0wrejgyEw',
        'search': '6|1743432649336%7Crect%3D40.945233052704474%2C-73.83509060791016%2C40.845317795985295%2C-73.98271939208985%26rid%3D17182%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26singlestory%3D0%26housing-connector%3D0%26parking-spots%3Dnull-%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26showcase%3D0%26featuredMultiFamilyBuilding%3D0%26onlyRentalStudentHousingType%3D0%26onlyRentalIncomeRestrictedHousingType%3D0%26onlyRentalMilitaryHousingType%3D0%26onlyRentalDisabledHousingType%3D0%26onlyRentalSeniorHousingType%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0917182%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
        '_clsk': 'uwbs56%7C1740840661345%7C2%7C0%7Cr.clarity.ms%2Fcollect',
        'AWSALB': 'SVlf4eVEwg9mRsnpIwRyN+xS9wd8r/tNawixFSnBEf7jkmDoA1qS5ygg/3YxMalClDa1hkwxgRsZsha0aX46GGoXENyABk7jsy4YM0+YnjOiQ4CnF/ZFkf377PDR',
        'AWSALBCORS': 'SVlf4eVEwg9mRsnpIwRyN+xS9wd8r/tNawixFSnBEf7jkmDoA1qS5ygg/3YxMalClDa1hkwxgRsZsha0aX46GGoXENyABk7jsy4YM0+YnjOiQ4CnF/ZFkf377PDR',
        '_uetsid': '566607f0f67111ef9e8f315a86932718',
        '_uetvid': '5665f370f67111efae377d2348e45d1f',
    }

    CSV_HEADER = [
        'zpid',
        'price',  # price
        'yearBuilt',  # year built
        'streetAddress',  # address
        'bedrooms',  # number of bedrooms
        'bathrooms',  # number of bathrooms
        'responsivePhotos',  # photos
        'timeOnZillow',  # days on display
        'pageViewCount',  # page views
        'favoriteCount',  # number of saves
        'phrases',  # tags
        'description',  # description
    ]

    _USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36')

    # Headers shared by the three XHR-style calls; per-call extras
    # (referer, content-type, origin) are merged on top by _xhr_headers().
    _XHR_HEADERS = {
        'accept': '*/*',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=1, i',
        'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': _USER_AGENT,
    }

    # Headers for the listing detail page, which is requested as a document.
    _DOCUMENT_HEADERS = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': _USER_AGENT,
    }

    # Captures the value stored under "gdpClientCache" in the detail page.
    _GDP_CACHE_RE = re.compile(r'\"gdpClientCache\":(.*?),\"composedGraphQLQuery\"')

    def __init__(self, proxies=None, cookies=None):
        """Set up request state and create the CSV file with its header row.

        Args:
            proxies: Optional ``requests`` proxy mapping. ``None`` selects the
                local proxy on port 7890; pass ``{}`` to configure no proxy.
            cookies: Optional cookie mapping. ``None`` selects the captured
                browser cookies in ``DEFAULT_COOKIES``.
        """
        self.baseurl = "https://www.zillow.com"
        # Copy so per-instance refreshes never mutate the class-level defaults.
        self.proxies = dict(self.DEFAULT_PROXIES if proxies is None else proxies)
        self.cookies = dict(self.DEFAULT_COOKIES if cookies is None else cookies)
        self.csvdata = [list(self.CSV_HEADER)]

        # Write the header row only once; later runs append to the same file.
        if not os.path.exists(self.DATA_FILE):
            with open(self.DATA_FILE, 'w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(self.csvdata[0])
        # ZILLOW_SID is not refreshed here; call update_cookies_ZILLOW_SID()
        # explicitly when a fresh value is needed.

    # ------------------------------------------------------------------
    # Request plumbing
    # ------------------------------------------------------------------
    def _xhr_headers(self, extra=None) -> dict:
        """Return a fresh copy of the XHR headers with ``extra`` merged in."""
        headers = dict(self._XHR_HEADERS)
        headers.update(extra or {})
        return headers

    def _request(self, method: str, url: str, headers: dict, **kwargs):
        """Send one request with the shared cookies, proxies and timeout."""
        return requests.request(
            method,
            url,
            cookies=self.cookies,
            headers=headers,
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
            **kwargs
        )

    def _store_zillow_sid(self, response):
        """Copy a ``ZILLOW_SID`` cookie set by ``response`` into ``self.cookies``.

        The cookie is looked up by name in the parsed cookie jar rather than
        by slicing the raw ``set-cookie`` header, which would pick up
        whichever cookie happens to come first and cut values at ``=``.

        Returns:
            The new cookie value, or ``None`` when the response set none.
        """
        sid = response.cookies.get('ZILLOW_SID')
        if sid:
            self.cookies['ZILLOW_SID'] = sid
        return sid

    # ------------------------------------------------------------------
    # Cookie refresh
    # ------------------------------------------------------------------
    def get_JSESSIONID(self) -> None:
        """Refresh the ``JSESSIONID`` cookie via ``MarkPropertyViewed.htm``.

        Reads the ``x-requested-session`` response header and stores it as
        the ``JSESSIONID`` cookie. The stored cookie is left untouched when
        the header is missing, so a failed refresh cannot replace it with
        ``None``.

        Raises:
            requests.RequestException: If the request itself fails.
        """
        headers = self._xhr_headers({
            'referer': 'https://www.zillow.com/homedetails/4705-Henry-Hudson-Pkwy-APT-6B-Bronx-NY-10471/244446711_zpid/',
        })
        response = self._request(
            'get',
            'https://www.zillow.com/ajax/homedetail/MarkPropertyViewed.htm',
            headers,
            params={'zpid': '244446711'},
        )

        session_id = response.headers.get('x-requested-session')
        print(session_id)
        if session_id:
            self.cookies['JSESSIONID'] = session_id
        # The response header is named "set-cookie" (singular).
        print(response.headers.get('set-cookie'))

    def update_cookies_ZILLOW_SID(self) -> None:
        """Refresh the ``ZILLOW_SID`` cookie via the feature-flags endpoint.

        The stored cookie is only replaced when the response actually sets
        a ``ZILLOW_SID`` cookie.

        Raises:
            requests.RequestException: If the request itself fails.
        """
        headers = self._xhr_headers({
            'referer': 'https://www.zillow.com/homedetails/3531-Bronxwood-Ave-APT-3F-Bronx-NY-10469/2053005716_zpid/',
        })
        params = {
            'featureFlags': [
                'SHOPPING_ZOAM_MIGRATION_FOR_SALE_HDP',
                'CIAM_ZOAM_MIGRATION_GLOBAL',
            ],
        }
        response = self._request(
            'get', 'https://www.zillow.com/api/user/featureFlags', headers, params=params)
        print(self._store_zillow_sid(response))

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------
    def get_search_page_info(self):
        """Fetch the map search results for the hard-coded region.

        Returns:
            The decoded JSON body of the search response.

        Raises:
            requests.RequestException: If the request fails, the server
                answers with an error status, or the body is not JSON.
        """
        headers = self._xhr_headers({
            'content-type': 'application/json',
            'origin': 'https://www.zillow.com',
        })
        payload = {
            'searchQueryState': {
                'pagination': {},
                'isMapVisible': True,
                'mapBounds': {
                    'west': -73.99487916636572,
                    'east': -73.69962159800635,
                    'south': 40.79945099712579,
                    'north': 40.97620399390861,
                },
                'mapZoom': 12,
                'regionSelection': [
                    {
                        'regionId': 6181,
                        'regionType': 6,
                    },
                ],
                'filterState': {
                    'sortSelection': {
                        'value': 'globalrelevanceex',
                    },
                },
                'isListVisible': True,
            },
            'wants': {
                'cat1': [
                    'mapResults',
                ],
            },
            'requestId': 2,
            'isDebugRequest': False,
        }

        response = self._request(
            'put', self.baseurl + "/async-create-search-page-state", headers, json=payload)
        # Fail with the HTTP status instead of an opaque JSON decoding error
        # when the server answers with an error page.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _iter_detail_urls(search_json):
        """Yield the ``detailUrl`` of every "homedetails" map result.

        Missing or ``None`` levels in ``search_json`` yield nothing instead
        of raising, and results without a usable ``detailUrl`` are skipped.
        """
        node = search_json
        for key in ("cat1", "searchResults", "mapResults"):
            node = node.get(key) if isinstance(node, dict) else None
            if not node:
                return
        for item in node:
            detail_url = item.get("detailUrl") if isinstance(item, dict) else None
            if isinstance(detail_url, str) and "homedetails" in detail_url:
                yield detail_url

    def json_parsing(self, search_json) -> None:
        """Scrape every "homedetails" listing found in ``search_json``.

        Args:
            search_json: Search response as returned by
                ``get_search_page_info()``.
        """
        for detail_url in self._iter_detail_urls(search_json):
            self.get_homedetails(detail_url)

    # ------------------------------------------------------------------
    # Listing details
    # ------------------------------------------------------------------
    @classmethod
    def _extract_property(cls, html: str) -> dict:
        """Return the ``property`` object embedded in a detail page.

        The page stores ``gdpClientCache`` as a JSON string that itself
        contains JSON, so it is decoded twice with ``json.loads``. This
        keeps non-ASCII text intact, unlike a ``unicode_escape`` round trip.

        Raises:
            ValueError: If the cache is missing, is not valid JSON, or holds
                no ``property`` object.
        """
        match = cls._GDP_CACHE_RE.search(html)
        if match is None:
            raise ValueError("gdpClientCache not found in page")

        cache = json.loads(match.group(1))
        if isinstance(cache, str):
            cache = json.loads(cache)
        if not isinstance(cache, dict):
            raise ValueError("gdpClientCache has an unexpected shape")

        # Prefer the last entry carrying a property, matching the previous
        # behaviour of using the last "property" occurrence in the text.
        for entry in reversed(list(cache.values())):
            if isinstance(entry, dict) and isinstance(entry.get("property"), dict):
                return entry["property"]
        raise ValueError("gdpClientCache holds no property entry")

    @staticmethod
    def _build_row(prop: dict) -> list:
        """Build one CSV row, in ``CSV_HEADER`` order, from a property object.

        Absent or malformed optional sections (address, photos, insights)
        produce ``None`` or an empty string instead of raising.
        """
        address = prop.get("address")
        street = address.get("streetAddress") if isinstance(address, dict) else None

        photo_urls = []
        for photo in prop.get("responsivePhotos") or []:
            try:
                # Same choice as before: the last JPEG source of each photo.
                url = photo['mixedSources']['jpeg'][-1]['url']
            except (KeyError, IndexError, TypeError):
                continue
            if url:
                photo_urls.append(str(url))

        try:
            phrases = prop["homeInsights"][0]["insights"][0]["phrases"] or []
        except (KeyError, IndexError, TypeError):
            phrases = []

        return [
            prop.get("zpid"),
            prop.get("price"),
            prop.get("yearBuilt"),
            street,
            prop.get("bedrooms"),
            prop.get("bathrooms"),
            "|".join(photo_urls),
            prop.get('timeOnZillow'),
            prop.get('pageViewCount'),
            prop.get('favoriteCount'),
            "|".join(str(phrase) for phrase in phrases),
            prop.get("description"),
        ]

    def _fetch_property(self, url: str):
        """Download one detail page and return its property object.

        Returns:
            The property dict, or ``None`` when the request failed or the
            page did not contain a usable ``gdpClientCache``.
        """
        try:
            response = self._request('get', url, self._DOCUMENT_HEADERS)
        except requests.RequestException as exc:
            # No response object exists here, so report the exception itself.
            print(exc)
            print(url)
            return None

        try:
            prop = self._extract_property(response.text)
        except ValueError:
            print(response.status_code)
            print(url)
            return None

        self._store_zillow_sid(response)
        return prop

    def _refresh_session(self) -> None:
        """Refresh ``JSESSIONID`` between attempts without aborting the crawl."""
        try:
            self.get_JSESSIONID()
        except requests.RequestException as exc:
            print(exc)

    def _record_failure(self, url: str) -> None:
        """Append ``url`` to the error list, one URL per line."""
        with open(self.ERROR_FILE, 'a', encoding='utf-8') as f:
            f.write(url + '\n')

    def get_homedetails(self, detailUrl: str, r: int = 3) -> None:
        """Scrape one listing and append it to ``data.csv``.

        Args:
            detailUrl: Listing path (or absolute URL) taken from the search
                results.
            r: Maximum number of download attempts.

        After each failed attempt except the last, the method waits
        ``RETRY_DELAY`` seconds and refreshes ``JSESSIONID``. When every
        attempt fails, the URL is appended to ``error_list.txt`` and the
        method returns without writing a row.
        """
        # urljoin accepts both relative paths and already-absolute URLs.
        url = urljoin(self.baseurl, detailUrl)
        attempts = max(r, 0)

        prop = None
        for attempt in range(1, attempts + 1):
            prop = self._fetch_property(url)
            if prop is not None:
                break
            if attempt < attempts:
                time.sleep(self.RETRY_DELAY)
                self._refresh_session()

        if prop is None:
            print("重试最大次数!")
            self._record_failure(url)
            return

        new_row = self._build_row(prop)
        with open(self.DATA_FILE, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(new_row)
        print(f"成功保存:{new_row[0]}")


if __name__ == '__main__':
    # Script entry point: run the search once, then scrape every listing
    # found in its results.
    scraper = Zillow()
    scraper.json_parsing(scraper.get_search_page_info())