├── src
│   └── pyzill
│       ├── __init__.py
│       ├── utils.py
│       ├── parse.py
│       ├── details.py
│       └── search.py
├── pyproject.toml
├── LICENSE
├── test.py
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/src/pyzill/__init__.py:
--------------------------------------------------------------------------------
from pyzill.details import get_from_home_id, get_from_deparment_id, get_from_deparment_url, get_from_home_url
from pyzill.search import for_sale, for_rent, sold
from pyzill.utils import parse_proxy

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "pyzill"
version = "1.0.3"
description = "Zillow scraper in Python"
authors = [
    { name="John Balvin", email="johnchristian@hotmail.es" },
]
readme = "README.md"
license = { text = "MIT" }
keywords = ["zillow", "scraper", "crawler"]
dependencies = ["bs4", "requests", "curl_cffi"]

[project.urls]
Homepage = "https://github.com/johnbalvin/pyzill"

--------------------------------------------------------------------------------
/src/pyzill/utils.py:
--------------------------------------------------------------------------------
from re import compile
from urllib.parse import quote

regex_space = compile(r"\s+")
regex_price = compile(r"\d+")


def remove_space(value: str) -> str:
    """Collapse runs of whitespace in the given string into single spaces.

    Args:
        value (str): input string with unwanted spaces

    Returns:
        str: string with single spaces
    """
    return regex_space.sub(" ", value.strip())


def get_nested_value(dic, key_path, default=None):
    """Look up a dotted key path (e.g. "a.b.c") in a nested dictionary."""
    keys = key_path.split(".")
    current = dic
    for key in keys:
        # stop early if an intermediate value is not a dictionary
        if not isinstance(current, dict):
            return default
        current = current.get(key, {})
        if current == {} or current is None:
            return default
    return current


def parse_proxy(ip_or_domain: str, port: str, username: str, password: str) -> str:
    """Build an HTTP proxy URL, percent-encoding the credentials."""
    # safe="" also encodes "/" and ":" so they cannot break the URL
    encoded_username = quote(username, safe="")
    encoded_password = quote(password, safe="")
    proxy_url = f"http://{encoded_username}:{encoded_password}@{ip_or_domain}:{port}"
    return proxy_url

--------------------------------------------------------------------------------
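A quick illustration of the two helpers above (the values are made up; `parse_proxy` assumes an HTTP proxy):

```Python
from pyzill.utils import get_nested_value, parse_proxy

data = {"props": {"pageProps": {"componentProps": {"zpid": 123}}}}
print(get_nested_value(data, "props.pageProps.componentProps.zpid"))  # 123
print(get_nested_value(data, "props.missing.path", default="n/a"))    # n/a

# credentials are percent-encoded, so special characters can't break the URL
print(parse_proxy("proxy.example.com", "8080", "user", "p@ss:word"))
# http://user:p%40ss%3Aword@proxy.example.com:8080
```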
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import json

import pyzill

# use your own rotating residential proxy credentials here
proxy_url = pyzill.parse_proxy("[proxy_ip or proxy_domain]", "[proxy_port]", "[proxy_username]", "[proxy_password]")
ne_lat = 38.602951833355434
ne_long = -87.22283859375
sw_lat = 23.42674607019482
sw_long = -112.93084640625
# pagination is for the list you see on the right-hand side when searching;
# you don't need to iterate over all the pages because zillow sends the whole
# dataset in mapResults at once on the first page. However, the maximum number
# of results zillow returns is 500, so if mapResults has 500 entries, try
# adjusting the zoom or moving the coordinates; pagination won't help because
# you will always get at most 500 results
pagination = 1

results_rent = pyzill.for_rent(pagination,
                search_value="", is_entire_place=False, is_room=True,
                min_beds=1, max_beds=None,
                min_bathrooms=None, max_bathrooms=None,
                min_price=10000, max_price=None,
                ne_lat=ne_lat, ne_long=ne_long, sw_lat=sw_lat, sw_long=sw_long,
                zoom_value=15,
                proxy_url=proxy_url)
jsondata_rent = json.dumps(results_rent)
with open("./jsondata_rent2.json", "w") as f:
    f.write(jsondata_rent)

--------------------------------------------------------------------------------
/src/pyzill/parse.py:
--------------------------------------------------------------------------------
from html import unescape
from json import loads
from typing import Any

from bs4 import BeautifulSoup  # type: ignore

from pyzill.utils import remove_space, get_nested_value


def parse_body_home(body: bytes) -> dict[str, Any]:
    """Parse the HTML of a home-details page and extract the property JSON.

    Args:
        body (bytes): HTML content of web page

    Returns:
        dict[str, Any]: parsed property information
    """
    component_props = parse_body(body)
    data_raw = get_nested_value(component_props, "gdpClientCache")
    property_json = loads(data_raw)
    parsed_data = {}
    for data in property_json.values():
        if "property" in str(data):
            parsed_data = data.get("property")
    return parsed_data


def parse_body_deparments(body: bytes) -> dict[str, Any]:
    """Parse the HTML of an apartment-building page and extract its JSON data.

    Args:
        body (bytes): HTML content of web page

    Returns:
        dict[str, Any]: parsed building information
    """
    component_props = parse_body(body)
    department_json = get_nested_value(component_props, "initialReduxState.gdp")
    return department_json


def parse_body(body: bytes) -> dict[str, Any]:
    """Extract the JSON payload that Next.js embeds in the page's __NEXT_DATA__ script tag.

    Args:
        body (bytes): HTML content of web page

    Returns:
        dict[str, Any]: the componentProps object of the page props
    """
    soup = BeautifulSoup(body, "html.parser")
    selection = soup.select_one("#__NEXT_DATA__")
    if not selection:
        return {}
    html_data = remove_space(unescape(selection.getText()))
    data = loads(html_data)
    return get_nested_value(data, "props.pageProps.componentProps")

--------------------------------------------------------------------------------
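A minimal illustration of what `parse_body` extracts, using a synthetic page (real Zillow pages embed a much larger payload under the same `__NEXT_DATA__` id):

```Python
from pyzill.parse import parse_body

html = b'''<html><body>
<script id="__NEXT_DATA__" type="application/json">
{"props": {"pageProps": {"componentProps": {"zpid": 123}}}}
</script>
</body></html>'''
print(parse_body(html))  # {'zpid': 123}
```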
/src/pyzill/details.py:
--------------------------------------------------------------------------------
from typing import Any

from curl_cffi import requests

from pyzill.parse import parse_body_home, parse_body_deparments

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "en",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Sec-Ch-Ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
    "Sec-Ch-Ua-Mobile": "?0",
    "Sec-Ch-Ua-Platform": '"Windows"',
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
}

def get_from_home_id(
    property_id: int, proxy_url: str | None = None
) -> dict[str, Any]:
    """Scrape data for a property from zillow based on its property ID (zpid)

    Args:
        property_id (int): ID for any property from zillow
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: parsed property information
    """
    # zillow routes on the trailing zpid, so the title segment can be anything
    home_url = f"https://www.zillow.com/homedetails/any-title/{property_id}_zpid/"
    data = get_from_home_url(home_url, proxy_url)
    return data

def get_from_deparment_id(
    deparment_id: str, proxy_url: str | None = None
) -> dict[str, Any]:
    """Scrape data for an apartment building from zillow based on its department ID

    Args:
        deparment_id (str): department ID of the building from zillow
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: parsed building information
    """
    # zillow resolves the listing from the trailing ID, so the state/name
    # segments of the path are placeholders
    home_url = f"https://www.zillow.com/apartments/texas/the-lennox/{deparment_id}"
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    response = requests.get(url=home_url, headers=headers, proxies=proxies, impersonate="chrome124")
    response.raise_for_status()
    data = parse_body_deparments(response.content)
    return data

def get_from_deparment_url(
    deparment_url: str, proxy_url: str | None = None
) -> dict[str, Any]:
    """Scrape data for an apartment building from zillow based on its URL

    Args:
        deparment_url (str): URL of the building's page on zillow
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: parsed building information
    """
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    response = requests.get(url=deparment_url, headers=headers, proxies=proxies, impersonate="chrome124")
    response.raise_for_status()
    data = parse_body_deparments(response.content)
    return data

def get_from_home_url(home_url: str, proxy_url: str | None = None) -> dict[str, Any]:
    """Scrape the given URL and parse the home details

    Args:
        home_url (str): URL for the property
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: parsed property information
    """
    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    response = requests.get(url=home_url, headers=headers, proxies=proxies, impersonate="chrome124")
    response.raise_for_status()
    data = parse_body_home(response.content)
    return data

--------------------------------------------------------------------------------
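Zillow blocks repeated requests from a single IP (see the README note on rotating residential proxies), so a caller may want retries. A minimal sketch, not part of the library — the helper name, attempt count, and backoff are arbitrary choices:

```Python
import time

import pyzill


def get_home_with_retries(home_url: str, proxy_url: str | None = None, max_attempts: int = 3) -> dict:
    """Retry transient failures with a simple linear backoff."""
    for attempt in range(1, max_attempts + 1):
        try:
            return pyzill.get_from_home_url(home_url, proxy_url)
        except Exception:  # broad on purpose: blocked/failed requests raise in different ways
            if attempt == max_attempts:
                raise
            time.sleep(2 * attempt)  # wait a bit longer after each failure
```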
/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/python

*.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Zillow scraper in Python

## Overview
This project is an open-source tool developed in Python for extracting property information from Zillow. It's designed to be easy to use, making it an ideal solution for developers looking for Zillow listing data.
## Features
- Full search support
- Extracts detailed property information from Zillow
- Implemented in Python just because it's popular
- Easy to integrate with existing Python projects

### Important
- Use rotating residential proxies; Zillow will block you if you make multiple requests from the same IP.

### Install

```bash
$ pip install pyzill
```
## Examples

```Python
import pyzill
import json
ne_lat = 38.602951833355434
ne_long = -87.22283859375
sw_lat = 23.42674607019482
sw_long = -112.93084640625
# pagination is for the list that you see on the right-hand side when searching;
# you don't need to iterate over all the pages because zillow sends the whole
# dataset in mapResults at once on the first page. However, the maximum number
# of results zillow returns is 500, so if mapResults has 500 entries, try
# adjusting the zoom or moving the coordinates; pagination won't help because
# you will always get at most 500 results
pagination = 1
proxy_url = pyzill.parse_proxy("[proxy_ip or proxy_domain]","[proxy_port]","[proxy_username]","[proxy_password]")
results_sold = pyzill.sold(pagination,
                search_value="miami",
                min_beds=1, max_beds=1,
                min_bathrooms=None, max_bathrooms=None,
                min_price=10000, max_price=None,
                ne_lat=ne_lat, ne_long=ne_long, sw_lat=sw_lat, sw_long=sw_long,
                zoom_value=5,
                proxy_url=proxy_url)

results_sale = pyzill.for_sale(pagination,
                search_value="",
                min_beds=None, max_beds=None,
                min_bathrooms=3, max_bathrooms=None,
                min_price=None, max_price=None,
                ne_lat=ne_lat, ne_long=ne_long, sw_lat=sw_lat, sw_long=sw_long,
                zoom_value=10,
                proxy_url=proxy_url)

results_rent = pyzill.for_rent(pagination,
                search_value="", is_entire_place=False, is_room=True,
                min_beds=1, max_beds=None,
                min_bathrooms=None, max_bathrooms=None,
                min_price=10000, max_price=None,
                ne_lat=ne_lat, ne_long=ne_long, sw_lat=sw_lat, sw_long=sw_long,
                zoom_value=15,
                proxy_url=proxy_url)
jsondata_sold = json.dumps(results_sold)
jsondata_sale = json.dumps(results_sale)
jsondata_rent = json.dumps(results_rent)
with open("./jsondata_sold.json", "w") as f:
    f.write(jsondata_sold)
with open("./jsondata_sale.json", "w") as f:
    f.write(jsondata_sale)
with open("./jsondata_rent.json", "w") as f:
    f.write(jsondata_rent)
```
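When `mapResults` comes back with exactly 500 entries, the area likely holds more listings than Zillow will return in one response. One workaround, sketched below (the helper and the 2x2 split are illustrative choices, not part of pyzill), is to recursively split the bounding box into quadrants until each tile comes in under the cap:

```Python
import pyzill

def search_area(ne_lat, ne_long, sw_lat, sw_long, zoom_value, proxy_url=None):
    """Recursively split the bounding box until each tile returns under the 500 cap."""
    results = pyzill.for_sale(1, search_value="",
                              min_beds=None, max_beds=None,
                              min_bathrooms=None, max_bathrooms=None,
                              min_price=None, max_price=None,
                              ne_lat=ne_lat, ne_long=ne_long,
                              sw_lat=sw_lat, sw_long=sw_long,
                              zoom_value=zoom_value, proxy_url=proxy_url)
    map_results = results.get("mapResults", [])
    if len(map_results) < 500:
        return map_results
    mid_lat = (ne_lat + sw_lat) / 2
    mid_long = (ne_long + sw_long) / 2
    tiles = [
        (ne_lat, ne_long, mid_lat, mid_long),   # north-east quadrant
        (ne_lat, mid_long, mid_lat, sw_long),   # north-west quadrant
        (mid_lat, ne_long, sw_lat, mid_long),   # south-east quadrant
        (mid_lat, mid_long, sw_lat, sw_long),   # south-west quadrant
    ]
    listings = []
    for n_lat, n_long, s_lat, s_long in tiles:
        listings += search_area(n_lat, n_long, s_lat, s_long, zoom_value + 1, proxy_url)
    # listings on tile borders may appear twice; dedupe by zpid if needed
    return listings
```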
## For homes

```Python
import pyzill
import json
property_url="https://www.zillow.com/homedetails/858-Shady-Grove-Ln-Harrah-OK-73045/339897685_zpid/"
proxy_url = pyzill.parse_proxy("[proxy_ip or proxy_domain]","[proxy_port]","[proxy_username]","[proxy_password]")
data = pyzill.get_from_home_url(property_url, proxy_url)
jsondata = json.dumps(data)
with open("details.json", "w") as f:
    f.write(jsondata)
```

```Python
import pyzill
import json
property_id=2056016566
proxy_url = pyzill.parse_proxy("[proxy_ip or proxy_domain]","[proxy_port]","[proxy_username]","[proxy_password]")
data = pyzill.get_from_home_id(property_id, proxy_url)
jsondata = json.dumps(data)
with open("details.json", "w") as f:
    f.write(jsondata)
```

## For departments (apartment buildings)

```Python
import pyzill
import json
property_url="https://www.zillow.com/apartments/kissimmee-fl/the-nexus-at-overbrook/9DSWrh/"
proxy_url = pyzill.parse_proxy("[proxy_ip or proxy_domain]","[proxy_port]","[proxy_username]","[proxy_password]")
data = pyzill.get_from_deparment_url(property_url, proxy_url)
jsondata = json.dumps(data)
with open("details.json", "w") as f:
    f.write(jsondata)
```

```Python
import pyzill
import json
property_id="CgKZT4"
proxy_url = pyzill.parse_proxy("[proxy_ip or proxy_domain]","[proxy_port]","[proxy_username]","[proxy_password]")
data = pyzill.get_from_deparment_id(property_id, proxy_url)
jsondata = json.dumps(data)
with open("details.json", "w") as f:
    f.write(jsondata)
```

--------------------------------------------------------------------------------
/src/pyzill/search.py:
--------------------------------------------------------------------------------
from typing import Any

from curl_cffi import requests


def for_sale(
    pagination: int,
    search_value: str,
    min_beds: int,
    max_beds: int,
    min_bathrooms: int,
    max_bathrooms: int,
    min_price: int,
    max_price: int,
    ne_lat: float,
    ne_long: float,
    sw_lat: float,
    sw_long: float,
    zoom_value: int,
    proxy_url: str | None = None,
) -> dict[str, Any]:
    """Get the listings that are for sale. The result is a dictionary with the keys
    mapResults and listResults; use mapResults, which contains all the listings across
    all pages, while listResults only feeds the sidebar you see when searching on zillow.
    Be aware that mapResults holds at most 500 entries, so if you get exactly 500
    results and want the complete result set for an area, adjust the zoom or the
    coordinates. Paginating over all pages won't recover the missing listings with
    either mapResults or listResults, so avoid pagination: mapResults already has
    everything (up to the 500 cap).

    Args:
        pagination (int): page number in the pagination
        search_value (str): search term, e.g. a city name
        min_beds/max_beds, min_bathrooms/max_bathrooms, min_price/max_price: optional range filters; pass None to leave a bound open
        ne_lat (float): ne latitude value
        ne_long (float): ne longitude value
        sw_lat (float): sw latitude value
        sw_long (float): sw longitude value
        zoom_value (int): map zoom level
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: listing of properties in JSON format
    """
    filter_state = {
        "sortSelection": {"value": "globalrelevanceex"},
        "isAllHomes": {"value": True},
    }
    return search(pagination, search_value, min_beds, max_beds, min_bathrooms, max_bathrooms,
                  min_price, max_price, ne_lat, ne_long, sw_lat, sw_long, zoom_value,
                  filter_state, proxy_url)
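# Illustration only (not used by the library): combined with the optional
# bed/bath/price arguments, the filter_state above is what search() sends to
# zillow as "filterState". For example,
#     for_sale(1, "miami", 1, 1, None, None, 10000, None, ...)
# ends up sending:
#     {
#         "sortSelection": {"value": "globalrelevanceex"},
#         "isAllHomes": {"value": True},
#         "beds": {"min": 1, "max": 1},
#         "price": {"min": 10000},
#     }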
def for_rent(
    pagination: int,
    search_value: str,
    is_entire_place: bool,
    is_room: bool,
    min_beds: int,
    max_beds: int,
    min_bathrooms: int,
    max_bathrooms: int,
    min_price: int,
    max_price: int,
    ne_lat: float,
    ne_long: float,
    sw_lat: float,
    sw_long: float,
    zoom_value: int,
    proxy_url: str | None = None,
) -> dict[str, Any]:
    """Get the listings that are for rent. The result is a dictionary with the keys
    mapResults and listResults; use mapResults, which contains all the listings across
    all pages, while listResults only feeds the sidebar you see when searching on zillow.
    Be aware that mapResults holds at most 500 entries, so if you get exactly 500
    results and want the complete result set for an area, adjust the zoom or the
    coordinates. Paginating over all pages won't recover the missing listings with
    either mapResults or listResults, so avoid pagination: mapResults already has
    everything (up to the 500 cap).

    Args:
        pagination (int): page number in the pagination
        search_value (str): search term, e.g. a city name
        is_entire_place (bool): whether to search for entire units
        is_room (bool): whether to search for single rooms for rent
        min_beds/max_beds, min_bathrooms/max_bathrooms, min_price/max_price: optional range filters; pass None to leave a bound open
        ne_lat (float): ne latitude value
        ne_long (float): ne longitude value
        sw_lat (float): sw latitude value
        sw_long (float): sw longitude value
        zoom_value (int): map zoom level
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: listing of properties in JSON format
    """
    filter_state = {
        "sortSelection": {"value": "priorityscore"},
        "isNewConstruction": {"value": False},
        "isForSaleForeclosure": {"value": False},
        "isForSaleByOwner": {"value": False},
        "isForSaleByAgent": {"value": False},
        "isForRent": {"value": True},
        "isComingSoon": {"value": False},
        "isAuction": {"value": False},
        "isAllHomes": {"value": True},
    }
    if is_room:
        filter_state["isRoomForRent"] = {"value": True}
    if not is_entire_place:
        filter_state["isEntirePlaceForRent"] = {"value": False}
    return search(pagination, search_value, min_beds, max_beds, min_bathrooms, max_bathrooms,
                  min_price, max_price, ne_lat, ne_long, sw_lat, sw_long, zoom_value,
                  filter_state, proxy_url)
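# Illustration only: for_rent(..., is_entire_place=False, is_room=True, ...)
# adds
#     "isRoomForRent": {"value": True}
#     "isEntirePlaceForRent": {"value": False}
# to the rental filter_state above, narrowing the search from whole units to
# single rooms for rent.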
def sold(
    pagination: int,
    search_value: str,
    min_beds: int,
    max_beds: int,
    min_bathrooms: int,
    max_bathrooms: int,
    min_price: int,
    max_price: int,
    ne_lat: float,
    ne_long: float,
    sw_lat: float,
    sw_long: float,
    zoom_value: int,
    proxy_url: str | None = None,
) -> dict[str, Any]:
    """Get the listings that were recently sold. The result is a dictionary with the keys
    mapResults and listResults; use mapResults, which contains all the listings across
    all pages, while listResults only feeds the sidebar you see when searching on zillow.
    Be aware that mapResults holds at most 500 entries, so if you get exactly 500
    results and want the complete result set for an area, adjust the zoom or the
    coordinates. Paginating over all pages won't recover the missing listings with
    either mapResults or listResults, so avoid pagination: mapResults already has
    everything (up to the 500 cap).

    Args:
        pagination (int): page number in the pagination
        search_value (str): search term, e.g. a city name
        min_beds/max_beds, min_bathrooms/max_bathrooms, min_price/max_price: optional range filters; pass None to leave a bound open
        ne_lat (float): ne latitude value
        ne_long (float): ne longitude value
        sw_lat (float): sw latitude value
        sw_long (float): sw longitude value
        zoom_value (int): map zoom level
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: listing of properties in JSON format
    """
    filter_state = {
        "sortSelection": {"value": "globalrelevanceex"},
        "isNewConstruction": {"value": False},
        "isForSaleForeclosure": {"value": False},
        "isForSaleByOwner": {"value": False},
        "isForSaleByAgent": {"value": False},
        "isForRent": {"value": False},
        "isComingSoon": {"value": False},
        "isAuction": {"value": False},
        "isAllHomes": {"value": True},
        "isRecentlySold": {"value": True},
    }
    return search(pagination, search_value, min_beds, max_beds, min_bathrooms, max_bathrooms,
                  min_price, max_price, ne_lat, ne_long, sw_lat, sw_long, zoom_value,
                  filter_state, proxy_url)


def search(
    pagination: int,
    search_value: str,
    min_beds: int,
    max_beds: int,
    min_bathrooms: int,
    max_bathrooms: int,
    min_price: int,
    max_price: int,
    ne_lat: float,
    ne_long: float,
    sw_lat: float,
    sw_long: float,
    zoom_value: int,
    filter_state: dict[str, Any],
    proxy_url: str | None = None,
) -> dict[str, Any]:
    """Query zillow's search endpoint and return the results for the given page number.

    Args:
        pagination (int): page number in the pagination
        search_value (str): search term, e.g. a city name
        min_beds/max_beds, min_bathrooms/max_bathrooms, min_price/max_price: optional range filters; pass None to leave a bound open
        ne_lat (float): ne latitude value
        ne_long (float): ne longitude value
        sw_lat (float): sw latitude value
        sw_long (float): sw longitude value
        zoom_value (int): map zoom level
        filter_state (dict[str, Any]): filter flags for making the search
        proxy_url (str | None, optional): proxy URL for masking the request. Defaults to None.

    Returns:
        dict[str, Any]: listing of properties in JSON format
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en",
        "Content-Type": "application/json",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "origin": "https://www.zillow.com",
        "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    }
    input_data = {
        "searchQueryState": {
            "isMapVisible": True,
            "isListVisible": True,
            "mapBounds": {
                "north": ne_lat,
                "east": ne_long,
                "south": sw_lat,
                "west": sw_long,
            },
            "filterState": filter_state,
            "mapZoom": zoom_value,
            "pagination": {
                "currentPage": pagination,
            },
        },
        "wants": {
            "cat1": ["listResults", "mapResults"],
            "cat2": ["total"],
        },
        "requestId": 10,
        "isDebugRequest": False,
    }
    if search_value is not None:
        input_data["searchQueryState"]["usersSearchTerm"] = search_value

    if min_beds is not None or max_beds is not None:
        beds = {}
        if min_beds is not None:
            beds["min"] = min_beds
        if max_beds is not None:
            beds["max"] = max_beds
        input_data["searchQueryState"]["filterState"]["beds"] = beds

    if min_bathrooms is not None or max_bathrooms is not None:
        baths = {}
        if min_bathrooms is not None:
            baths["min"] = min_bathrooms
        if max_bathrooms is not None:
            baths["max"] = max_bathrooms
        input_data["searchQueryState"]["filterState"]["baths"] = baths
    if min_price is not None or max_price is not None:
        price = {}
        if min_price is not None:
            price["min"] = min_price
        if max_price is not None:
            price["max"] = max_price
        input_data["searchQueryState"]["filterState"]["price"] = price

    proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
    # impersonate makes curl_cffi mimic a real Chrome TLS fingerprint
    response = requests.put(
        url="https://www.zillow.com/async-create-search-page-state",
        json=input_data,
        headers=headers,
        proxies=proxies,
        impersonate="chrome124",
    )
    response.raise_for_status()
    data = response.json()
    return data.get("cat1", {}).get("searchResults", {})

--------------------------------------------------------------------------------