# Crawler/web/zillow_com/getInfo.py
#
# 284 lines
# 13 KiB
# Python

import csv
import os
import time
import requests
import json
import re
class Zillow:
    """Crawler that pulls listing details from zillow.com into ``data.csv``.

    Workflow: :meth:`get_search_page_info` queries the map-search endpoint,
    :meth:`json_parsing` walks the returned map results, and
    :meth:`get_homedetails` downloads every ``homedetails`` page, extracts the
    embedded property JSON and appends one CSV row per listing.

    All requests go through the local proxy in ``self.proxies`` and carry the
    cookie snapshot stored in ``self.cookies``.
    """

    # Output files (relative to the current working directory).
    DATA_FILE = 'data.csv'
    ERROR_FILE = 'error_list.txt'

    # Seconds before a request is abandoned; without a timeout a single
    # stalled connection would hang the whole crawl.
    REQUEST_TIMEOUT = 30
    # Seconds to wait after a failed detail-page attempt.
    RETRY_DELAY = 2

    USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36')
    SEC_CH_UA = '"Not(A:Brand";v="99", "Google Chrome";v="133", "Chromium";v="133"'

    # Captures the raw value of "gdpClientCache" embedded in a detail page.
    _GDP_CACHE_RE = re.compile(r'\"gdpClientCache\":(.*?),\"composedGraphQLQuery\"')

    def __init__(self):
        """Set up endpoints, proxy, cookies, and create ``data.csv`` if absent."""
        self.baseurl = "https://www.zillow.com"
        self.proxies = {
            "http": "http://127.0.0.1:7890",
            "https": "http://127.0.0.1:7890",
        }
        # NOTE(review): these look like cookies captured from a logged-in
        # browser session; consider loading them from configuration instead of
        # keeping them in source control.
        self.cookies = {
            'zguid': '24|%24e835e7de-1e03-40a2-9e0b-ea6557fce975',
            'zgsession': '1|5a92d272-1f0a-4eab-aad9-0b01bfc4e12e',
            'pxcts': '4efbe577-f671-11ef-944c-1da0cdd7af45',
            '_pxvid': '4efbd04b-f671-11ef-944a-e6989223c886',
            'zjs_anonymous_id': '%22e835e7de-1e03-40a2-9e0b-ea6557fce975%22',
            'zg_anonymous_id': '%224b8a74fb-12d3-4dd5-b453-91d2f059cc5c%22',
            '_ga': 'GA1.2.1116450831.1740815202',
            '_gid': 'GA1.2.1210142068.1740815202',
            '_gcl_au': '1.1.1930799100.1740815207',
            '_scid': '3cxthBM96c4ODxO1v0Al3ufRKQFwF0wr',
            'DoubleClickSession': 'true',
            '_ScCbts': '%5B%5D',
            '_pin_unauth': 'dWlkPU4yRmtNVGhtTmpVdE5tWmtaUzAwWXpnNUxUbGtPRGt0WkRrek1UTm1OREUyTTJJdw',
            '_tt_enable_cookie': '1',
            '_ttp': '01JN8AR8CC889QQ2N91WJNGVRG_.tt.1',
            '_fbp': 'fb.1.1740815213095.801729201273859650',
            '_sctr': '1%7C1740758400000',
            '_clck': '1gi6cgn%7C2%7Cftu%7C0%7C1886',
            '_lr_env_src_ats': 'false',
            'g_state': '{"i_l":0}',
            'loginmemento': '1|38bd958cf99c0efa6fcdf2d8b4ba656e18214f94b1d0b8482177e5821d485cb5',
            'userid': 'X|3|516ce03fb1857a73%7C9%7CtUJB3zsjokPFJzXVcdK9kYKiV6BT9Hom',
            'zjs_user_id': '%22X1-ZUqbce6dl8y0w9_1oeg0%22',
            '_derived_epik': 'dj0yJnU9YzhHUUI3UjgwVTYyektsUXlURWZ5UnNPZGpPTDBwUi0mbj01MkNxOF9nSm1udHEwczFUVGoyM0p3Jm09NCZ0PUFBQUFBR2ZDeDJRJnJtPTQmcnQ9QUFBQUFHZkN4MlEmc3A9Mg',
            'tfpsi': 'cf8acdc2-7676-40ea-810c-a9d9613cd3ac',
            'ZILLOW_SID': '1|AAAAAVVbFRIBVVsVEuEBjRdhLdPgArRt9zTF9A9GUy9n6qhArfRCvbRpWcUyvOX17mDKQCRGzog4qedbe0aFqFnGaQzFa2AVHA',
            'JSESSIONID': '2AB46AF163E850A6A93A55EEAFBFA961',
            '_px3': '2b2ceb1e9473516a5889d6b089882f2df95563f843864b748551a222e3a93bd6:Re6tLyXWJyuSifQZrCVdgU+HEPF8Ih1TmBAyFOt2qBZwjYXKAdScUmpmzzTUHclqZwD1vCAkP6Pow8uPxHGEBw==:1000:8rliApJL6+buJDD0QJN5jVM3zTksVkovJZ9PzmBfy8r178UA1kmnQ3o8GCOS0s/wmN0v5TATeetgtPpn8ZgJH/nA3DeEY9emAKW8bwwCOzP2xxF8rOEN4IYvM48vCdaEhFnLMTOWUR5ZhrMtnuinXvnFPpcsoTdiVsjPiPUunfMbpjRVUSEeRGfrO2BvQBnqh1bPhqq4CXReAdRoEbUdIoO+jZ250vTxbnHyWHDnnsk=',
            '_rdt_uuid': '1740815209620.81e8d67a-5a8f-489d-844a-6eb51bea084d',
            '_scid_r': '7ExthBM96c4ODxO1v0Al3ufRKQFwF0wrejgyEw',
            'search': '6|1743432649336%7Crect%3D40.945233052704474%2C-73.83509060791016%2C40.845317795985295%2C-73.98271939208985%26rid%3D17182%26disp%3Dmap%26mdm%3Dauto%26p%3D1%26listPriceActive%3D1%26fs%3D1%26fr%3D0%26mmm%3D0%26rs%3D0%26singlestory%3D0%26housing-connector%3D0%26parking-spots%3Dnull-%26abo%3D0%26garage%3D0%26pool%3D0%26ac%3D0%26waterfront%3D0%26finished%3D0%26unfinished%3D0%26cityview%3D0%26mountainview%3D0%26parkview%3D0%26waterview%3D0%26hoadata%3D1%26zillow-owned%3D0%263dhome%3D0%26showcase%3D0%26featuredMultiFamilyBuilding%3D0%26onlyRentalStudentHousingType%3D0%26onlyRentalIncomeRestrictedHousingType%3D0%26onlyRentalMilitaryHousingType%3D0%26onlyRentalDisabledHousingType%3D0%26onlyRentalSeniorHousingType%3D0%26commuteMode%3Ddriving%26commuteTimeOfDay%3Dnow%09%0917182%09%7B%22isList%22%3Atrue%2C%22isMap%22%3Atrue%7D%09%09%09%09%09',
            '_clsk': 'uwbs56%7C1740840661345%7C2%7C0%7Cr.clarity.ms%2Fcollect',
            'AWSALB': 'SVlf4eVEwg9mRsnpIwRyN+xS9wd8r/tNawixFSnBEf7jkmDoA1qS5ygg/3YxMalClDa1hkwxgRsZsha0aX46GGoXENyABk7jsy4YM0+YnjOiQ4CnF/ZFkf377PDR',
            'AWSALBCORS': 'SVlf4eVEwg9mRsnpIwRyN+xS9wd8r/tNawixFSnBEf7jkmDoA1qS5ygg/3YxMalClDa1hkwxgRsZsha0aX46GGoXENyABk7jsy4YM0+YnjOiQ4CnF/ZFkf377PDR',
            '_uetsid': '566607f0f67111ef9e8f315a86932718',
            '_uetvid': '5665f370f67111efae377d2348e45d1f',
        }
        # First (and only) entry is the CSV header row; the order must match
        # the row produced by _build_row().
        self.csvdata = [
            [
                'zpid',
                'price',  # listing price
                'yearBuilt',  # year the home was built
                'streetAddress',  # street address
                'bedrooms',  # number of bedrooms
                'bathrooms',  # number of bathrooms
                'responsivePhotos',  # photo URLs, "|"-separated
                'timeOnZillow',  # time the listing has been on Zillow
                'pageViewCount',  # page views
                'favoriteCount',  # number of saves/favorites
                'phrases',  # insight tags, "|"-separated
                'description',  # listing description
            ],
        ]
        if not os.path.exists(self.DATA_FILE):
            with open(self.DATA_FILE, 'w', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(self.csvdata[0])  # header row
        # self.update_cookies_ZILLOW_SID()  # optionally refresh the session cookie

    def _xhr_headers(self, referer):
        """Return the browser-like headers used for same-origin XHR calls."""
        return {
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'referer': referer,
            'sec-ch-ua': self.SEC_CH_UA,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': self.USER_AGENT,
        }

    def _refresh_zillow_sid(self, response):
        """Copy a ``ZILLOW_SID`` cookie set by *response* into ``self.cookies``.

        The cookie is looked up by name so an unrelated first ``Set-Cookie``
        entry is never stored as the session id. Returns the new value, or
        ``None`` when the response did not set that cookie.
        """
        sid = response.cookies.get('ZILLOW_SID')
        if sid:
            self.cookies['ZILLOW_SID'] = sid
        return sid

    def get_JSESSIONID(self):
        """Refresh the ``JSESSIONID`` cookie via the "mark property viewed" call.

        The new id is read from the ``x-requested-session`` response header;
        the stored cookie is left untouched when that header is missing.

        Raises:
            requests.RequestException: if the HTTP request fails.
        """
        response = requests.get(
            self.baseurl + '/ajax/homedetail/MarkPropertyViewed.htm',
            params={'zpid': '244446711'},
            cookies=self.cookies,
            headers=self._xhr_headers(
                'https://www.zillow.com/homedetails/4705-Henry-Hudson-Pkwy-APT-6B-Bronx-NY-10471/244446711_zpid/'),
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        )
        session_id = response.headers.get('x-requested-session')
        print(session_id)
        if session_id:
            self.cookies['JSESSIONID'] = session_id
        # Debug output of whatever cookies the server asked us to set.
        print(response.headers.get('set-cookie'))

    def update_cookies_ZILLOW_SID(self):
        """Refresh the ``ZILLOW_SID`` cookie by calling the feature-flags API.

        Raises:
            requests.RequestException: if the HTTP request fails.
        """
        params = {
            'featureFlags': [
                'SHOPPING_ZOAM_MIGRATION_FOR_SALE_HDP',
                'CIAM_ZOAM_MIGRATION_GLOBAL',
            ],
        }
        response = requests.get(
            self.baseurl + '/api/user/featureFlags',
            params=params,
            cookies=self.cookies,
            headers=self._xhr_headers(
                'https://www.zillow.com/homedetails/3531-Bronxwood-Ave-APT-3F-Bronx-NY-10469/2053005716_zpid/'),
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        )
        print(self._refresh_zillow_sid(response))

    def get_search_page_info(self):
        """Query the map-search endpoint and return its decoded JSON body.

        The search area (map bounds, region id) and sort order are fixed in
        the request payload below.

        Raises:
            requests.RequestException: if the HTTP request fails.
            ValueError: if the response body is not valid JSON.
        """
        # The browser's referer header (the search-results URL) is not sent.
        headers = {
            'accept': '*/*',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'content-type': 'application/json',
            'origin': 'https://www.zillow.com',
            'pragma': 'no-cache',
            'priority': 'u=1, i',
            'sec-ch-ua': self.SEC_CH_UA,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': self.USER_AGENT,
        }
        payload = {
            'searchQueryState': {
                'pagination': {},
                'isMapVisible': True,
                'mapBounds': {
                    'west': -73.99487916636572,
                    'east': -73.69962159800635,
                    'south': 40.79945099712579,
                    'north': 40.97620399390861,
                },
                'mapZoom': 12,
                'regionSelection': [
                    {
                        'regionId': 6181,
                        'regionType': 6,
                    },
                ],
                'filterState': {
                    'sortSelection': {
                        'value': 'globalrelevanceex',
                    },
                },
                'isListVisible': True,
            },
            'wants': {
                'cat1': [
                    'mapResults',
                ],
            },
            'requestId': 2,
            'isDebugRequest': False,
        }
        response = requests.put(
            url=self.baseurl + "/async-create-search-page-state",
            cookies=self.cookies,
            headers=headers,
            json=payload,
            proxies=self.proxies,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json()

    def json_parsing(self, search_json):
        """Fetch details for every ``homedetails`` result in *search_json*.

        Args:
            search_json: decoded response of :meth:`get_search_page_info`.
                Missing or ``None`` levels are treated as "no results".
        """
        cat1 = (search_json or {}).get("cat1") or {}
        map_results = (cat1.get("searchResults") or {}).get("mapResults") or []
        for result in map_results:
            detail_url = result.get("detailUrl")
            # Skip entries without a URL and non-"homedetails" links.
            if detail_url and "homedetails" in detail_url:
                self.get_homedetails(detail_url)

    @staticmethod
    def _parse_property(raw):
        """Decode the captured ``gdpClientCache`` value into the property dict.

        *raw* is the text captured by ``_GDP_CACHE_RE``: a JSON string literal
        whose content is itself a JSON object. Decoding it with ``json``
        (instead of ``unicode_escape``) keeps non-ASCII text intact.

        Raises:
            ValueError: if *raw* is not valid JSON or no cache entry holds a
                ``"property"`` object.
        """
        cache = json.loads(raw)
        if isinstance(cache, str):
            cache = json.loads(cache)
        if not isinstance(cache, dict):
            raise ValueError("gdpClientCache is not a JSON object")
        # Prefer the last matching entry, like the textual split()[-1] that
        # this replaces.
        for entry in reversed(list(cache.values())):
            if isinstance(entry, dict) and isinstance(entry.get("property"), dict):
                return entry["property"]
        raise ValueError("no 'property' entry found in gdpClientCache")

    @staticmethod
    def _build_row(jd):
        """Turn a property dict into a CSV row ordered like the header row.

        Missing nested structures yield ``None`` / empty strings instead of
        raising.
        """
        address = jd.get("address") or {}

        # Last "jpeg" source of each photo (assumed to be the largest
        # rendition -- TODO confirm).
        photo_urls = []
        for photo in jd.get("responsivePhotos") or []:
            jpeg_sources = (photo.get("mixedSources") or {}).get("jpeg") or []
            if jpeg_sources and jpeg_sources[-1].get("url"):
                photo_urls.append(jpeg_sources[-1]["url"])

        phrases = []
        insight_groups = jd.get("homeInsights") or []
        if insight_groups:
            insights = insight_groups[0].get("insights") or []
            if insights:
                phrases = insights[0].get("phrases") or []

        return [
            jd.get("zpid"),
            jd.get("price"),
            jd.get("yearBuilt"),
            address.get("streetAddress"),
            jd.get("bedrooms"),
            jd.get("bathrooms"),
            "|".join(photo_urls),
            jd.get('timeOnZillow'),
            jd.get('pageViewCount'),
            jd.get('favoriteCount'),
            "|".join(phrases),
            jd.get("description"),
        ]

    def _record_failure(self, url):
        """Append *url* to the error list so earlier failures are kept."""
        with open(self.ERROR_FILE, 'a', encoding='utf-8') as f:
            f.write(url + '\n')

    def get_homedetails(self, detailUrl, r=3):
        """Download one listing page and append its data to ``data.csv``.

        Args:
            detailUrl: site-relative URL of the listing (appended to
                ``self.baseurl``).
            r: maximum number of attempts. After each failed attempt the
                method waits ``RETRY_DELAY`` seconds and tries to refresh
                ``JSESSIONID``.

        When every attempt fails, the URL is appended to ``error_list.txt``
        and the method returns without writing a row. Always returns ``None``.
        """
        url = self.baseurl + detailUrl
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'no-cache',
            'pragma': 'no-cache',
            'priority': 'u=0, i',
            'sec-ch-ua': self.SEC_CH_UA,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': self.USER_AGENT,
        }

        for _ in range(max(r, 0)):
            response = None
            try:
                response = requests.get(
                    url=url,
                    cookies=self.cookies,
                    headers=headers,
                    proxies=self.proxies,
                    timeout=self.REQUEST_TIMEOUT,
                )
                match = self._GDP_CACHE_RE.search(response.text)
                if match is None:
                    raise ValueError("gdpClientCache not found in page")
                jd = self._parse_property(match.group(1))
            except (requests.RequestException, ValueError):
                time.sleep(self.RETRY_DELAY)
                # `response` stays None when the request itself failed.
                print(response.status_code if response is not None else None)
                print(url)
                try:
                    self.get_JSESSIONID()
                except requests.RequestException as refresh_error:
                    # Best effort: a failed refresh must not abort the crawl.
                    print(refresh_error)
                continue

            self._refresh_zillow_sid(response)
            new_row = self._build_row(jd)
            with open(self.DATA_FILE, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(new_row)
            print(f"成功保存:{new_row[0]}")
            return

        self._record_failure(url)
        print("重试最大次数!")
        return
if __name__ == '__main__':
    # Script entry point: run one search and store every listing it returns.
    crawler = Zillow()
    crawler.json_parsing(crawler.get_search_page_info())
    # crawler.get_JSESSIONID() can be called here to refresh the session id manually.