├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── web scraping.iml
├── __pycache__
│   └── Connexion.cpython-39.pyc
├── Connexion.py
├── ScrapingToCsvFile.py
├── ScrapingToDB_TextFile.py
└── README.md
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/__pycache__/Connexion.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frozen-dev71/Web-Scraping-RealEstate-Beautifulsoup/main/__pycache__/Connexion.cpython-39.pyc
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/web scraping.iml" filepath="$PROJECT_DIR$/.idea/web scraping.iml" />
    </modules>
  </component>
</project>
--------------------------------------------------------------------------------
/.idea/web scraping.iml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
--------------------------------------------------------------------------------
/Connexion.py:
--------------------------------------------------------------------------------
import pymysql


class Dbconnect(object):
    """Thin wrapper around a pymysql connection to the realestate database."""

    def __init__(self):
        # Adjust host, port, and credentials to match your local MySQL server.
        self.dbconnection = pymysql.connect(host='localhost', port=3308, user='root',
                                            passwd='', db='realestate')
        self.dbcursor = self.dbconnection.cursor()

    def commit_db(self):
        self.dbconnection.commit()

    def close_db(self):
        self.dbcursor.close()
        self.dbconnection.close()
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <!-- inspection_tool entries were stripped in this export (original lines 4-31) -->
  </profile>
</component>
--------------------------------------------------------------------------------
/ScrapingToCsvFile.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
from csv import writer

url = "https://www.realtor.com/realestateandhomes-search/Stockton_CA/show-newest-listings"
# Browser-like headers so the request is less likely to be rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
listings = soup.find_all('div', class_="jsx-2775064451 fallBackImgWrap")

with open('housing.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Location', 'Status', 'Price', 'Owner', 'Bed', 'Bath', 'SQFT', 'SQFT_LOT']
    thewriter.writerow(header)
    for listing in listings:
        location = listing.find('div', class_="jsx-1982357781 address ellipsis srp-page-address srp-address-redesign")
        price = listing.find('span', class_="Price__Component-rui__x3geed-0 gipzbd")
        status = listing.find('span', class_="jsx-3853574337 statusText")
        ow = listing.find_all('span', class_="jsx-287440024")
        owner = ow[1] if len(ow) > 1 else None  # second span holds the broker/owner name
        infos = [span.text for span in listing.find_all('span', class_="jsx-946479843 meta-value")]
        location = location.text if location is not None else 'Not specified'
        price = price.text if price is not None else 'Not specified'
        owner = owner.text if owner is not None else 'Not specified'
        status = status.text if status is not None else 'Not specified'
        info = [location, status, price, owner] + infos
        # Pad missing bed/bath/sqft/lot values so every row has 8 columns.
        while len(info) < 8:
            info.append("NoV")
        thewriter.writerow(info)
--------------------------------------------------------------------------------
/ScrapingToDB_TextFile.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import Connexion

url = "https://www.realtor.com/realestateandhomes-search/Stockton_CA/show-newest-listings"
# Browser-like headers so the request is less likely to be rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
listings = soup.find_all('div', class_="jsx-2775064451 fallBackImgWrap")

db = Connexion.Dbconnect()
with open('data.txt', 'w') as f:
    for listing in listings:
        location = listing.find('div', class_="jsx-1982357781 address ellipsis srp-page-address srp-address-redesign")
        price = listing.find('span', class_="Price__Component-rui__x3geed-0 gipzbd")
        status = listing.find('span', class_="jsx-3853574337 statusText")
        ow = listing.find_all('span', class_="jsx-287440024")
        owner = ow[1] if len(ow) > 1 else None  # second span holds the broker/owner name
        infos = [span.text for span in listing.find_all('span', class_="jsx-946479843 meta-value")]
        location = location.text if location is not None else 'Not specified'
        price = price.text if price is not None else 'Not specified'
        owner = owner.text if owner is not None else 'Not specified'
        status = status.text if status is not None else 'Not specified'
        info = [location, status, price, owner] + infos
        # Pad missing bed/bath/sqft/lot values so every row has 8 columns.
        while len(info) < 8:
            info.append("NoV")
        # Parameterized query instead of string concatenation: avoids SQL injection
        # and broken statements when a scraped value contains a quote.
        sql = ("INSERT INTO house(location, status, price, owner, bed, bath, sqft, sqft_lot) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        db.dbcursor.execute(sql, info)
        db.commit_db()
        f.write("; ".join(info) + "\n")

db.close_db()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Real Estate Web Scraper

This project is a Python web scraper that extracts real estate listings from Realtor.com for Stockton, California. Depending on which script you run, the data is saved either to a MySQL database (plus a text file) or to a CSV file. The two scripts, ScrapingToDB_TextFile.py and ScrapingToCsvFile.py, scrape the same data; they differ only in how they store it.

## Getting Started

To run the web scraper, you will need Python 3 installed on your machine. You will also need the following Python libraries:

- beautifulsoup4
- requests
- pymysql (only needed for ScrapingToDB_TextFile.py, which connects to MySQL through Connexion.py)

You can install these libraries by running the following command in your terminal:
`pip install beautifulsoup4 requests pymysql`
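
If you prefer, the same dependencies can be listed in a requirements.txt (a minimal sketch, with no versions pinned):

```
beautifulsoup4
requests
pymysql
```

They can then be installed in one step with `pip install -r requirements.txt`.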

## How to Use

To use the web scraper, run whichever of the two scripts matches how you want the data stored.

### ScrapingToDB_TextFile.py

This script saves the scraped data to a MySQL database. Before running it, set up a MySQL database and update Connexion.py with your database credentials. Then navigate to the directory containing the file and run:
`python ScrapingToDB_TextFile.py`

The program will scrape the listings from Realtor.com and save them to the MySQL database and to a local text file named `data.txt`.
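
The script assumes a `house` table already exists in the `realestate` database. Below is a minimal setup sketch using pymysql; the column names match the INSERT in ScrapingToDB_TextFile.py, while the VARCHAR types are an assumption (the scraper stores every field as text):

```python
import pymysql

# One-off setup: create the table the scraper inserts into.
# Host, port, and credentials mirror Connexion.py; adjust to your server.
conn = pymysql.connect(host='localhost', port=3308, user='root', passwd='', db='realestate')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS house (
            location VARCHAR(255),
            status   VARCHAR(64),
            price    VARCHAR(64),
            owner    VARCHAR(255),
            bed      VARCHAR(32),
            bath     VARCHAR(32),
            sqft     VARCHAR(32),
            sqft_lot VARCHAR(32)
        )
    """)
conn.commit()
conn.close()
```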

### ScrapingToCsvFile.py

This script saves the scraped data to a CSV file. To run it, navigate to the directory containing the file and run:
`python ScrapingToCsvFile.py`

The program will scrape the listings from Realtor.com and save them to a CSV file named `housing.csv` in the same directory.
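
To spot-check the output, the file can be read back with Python's standard csv module (a quick sketch):

```python
import csv

# Print the header row and the first few scraped listings from housing.csv.
with open('housing.csv', newline='', encoding='utf8') as f:
    for row in list(csv.reader(f))[:5]:
        print(row)
```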

## Data Collected

The web scraper collects the following data for each real estate listing. Values a listing does not provide are recorded as 'Not specified' (the text fields) or 'NoV' (the four numeric fields):

- Location
- Status
- Price
- Owner
- Number of Bedrooms
- Number of Bathrooms
- Total Square Footage
- Square Footage of Lot

## Contributing

If you would like to contribute to this project, feel free to submit a pull request with your changes.

## Credits

This project was created by Oussama Fikri. The code is based on examples from the Beautiful Soup and requests documentation.
--------------------------------------------------------------------------------