├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── modules.xml
│   └── web scraping.iml
├── __pycache__
│   └── Connexion.cpython-39.pyc
├── Connexion.py
├── ScrapingToCsvFile.py
├── ScrapingToDB_TextFile.py
└── README.md
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/__pycache__/Connexion.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/frozen-dev71/Web-Scraping-RealEstate-Beautifulsoup/main/__pycache__/Connexion.cpython-39.pyc
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/web scraping.iml" filepath="$PROJECT_DIR$/.idea/web scraping.iml" />
    </modules>
  </component>
</project>
--------------------------------------------------------------------------------
/.idea/web scraping.iml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
--------------------------------------------------------------------------------
/Connexion.py:
--------------------------------------------------------------------------------
import pymysql


class Dbconnect(object):
    """Thin wrapper around a pymysql connection to the realestate database."""

    def __init__(self):
        # Adjust host, port, and credentials to match your local MySQL server.
        self.dbconnection = pymysql.connect(host='localhost', port=3308, user='root',
                                            passwd='', db='realestate')
        self.dbcursor = self.dbconnection.cursor()

    def commit_db(self):
        self.dbconnection.commit()

    def close_db(self):
        self.dbcursor.close()
        self.dbconnection.close()
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <!-- inspection_tool entries were stripped in this export (original lines 4-31) -->
  </profile>
</component>
--------------------------------------------------------------------------------
/ScrapingToCsvFile.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
from csv import writer

url = "https://www.realtor.com/realestateandhomes-search/Stockton_CA/show-newest-listings"
# Browser-like headers so the request is less likely to be rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
listings = soup.find_all('div', class_="jsx-2775064451 fallBackImgWrap")

with open('housing.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Location', 'Status', 'Price', 'Owner', 'Bed', 'Bath', 'SQFT', 'SQFT_LOT']
    thewriter.writerow(header)
    for listing in listings:
        location = listing.find('div', class_="jsx-1982357781 address ellipsis srp-page-address srp-address-redesign")
        price = listing.find('span', class_="Price__Component-rui__x3geed-0 gipzbd")
        status = listing.find('span', class_="jsx-3853574337 statusText")
        ow = listing.find_all('span', class_="jsx-287440024")
        owner = ow[1] if len(ow) > 1 else None  # second span holds the broker/owner name
        infos = [span.text for span in listing.find_all('span', class_="jsx-946479843 meta-value")]
        location = location.text if location is not None else 'Not specified'
        price = price.text if price is not None else 'Not specified'
        owner = owner.text if owner is not None else 'Not specified'
        status = status.text if status is not None else 'Not specified'
        info = [location, status, price, owner] + infos
        # Pad missing bed/bath/sqft/lot values so every row has 8 columns.
        while len(info) < 8:
            info.append("NoV")
        thewriter.writerow(info)
--------------------------------------------------------------------------------
/ScrapingToDB_TextFile.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
import Connexion

url = "https://www.realtor.com/realestateandhomes-search/Stockton_CA/show-newest-listings"
# Browser-like headers so the request is less likely to be rejected as a bot.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
listings = soup.find_all('div', class_="jsx-2775064451 fallBackImgWrap")

db = Connexion.Dbconnect()
with open('data.txt', 'w') as f:
    for listing in listings:
        location = listing.find('div', class_="jsx-1982357781 address ellipsis srp-page-address srp-address-redesign")
        price = listing.find('span', class_="Price__Component-rui__x3geed-0 gipzbd")
        status = listing.find('span', class_="jsx-3853574337 statusText")
        ow = listing.find_all('span', class_="jsx-287440024")
        owner = ow[1] if len(ow) > 1 else None  # second span holds the broker/owner name
        infos = [span.text for span in listing.find_all('span', class_="jsx-946479843 meta-value")]
        location = location.text if location is not None else 'Not specified'
        price = price.text if price is not None else 'Not specified'
        owner = owner.text if owner is not None else 'Not specified'
        status = status.text if status is not None else 'Not specified'
        info = [location, status, price, owner] + infos
        # Pad missing bed/bath/sqft/lot values so every row has 8 columns.
        while len(info) < 8:
            info.append("NoV")
        # Parameterized query instead of string concatenation: avoids SQL injection
        # and broken statements when a scraped value contains a quote.
        sql = ("INSERT INTO house(location, status, price, owner, bed, bath, sqft, sqft_lot) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        db.dbcursor.execute(sql, info)
        db.commit_db()
        f.write("; ".join(info) + "\n")

db.close_db()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Real Estate Web Scraper

This project is a Python web scraper that extracts real estate listings from Realtor.com for Stockton, California. Depending on which script you run, the data is saved either to a MySQL database (plus a text file) or to a CSV file. The two scripts, ScrapingToDB_TextFile.py and ScrapingToCsvFile.py, scrape the same data; they differ only in how they store it.

## Getting Started

To run the web scraper, you will need Python 3 installed on your machine. You will also need the following Python libraries:

- beautifulsoup4
- requests
- pymysql (only needed for ScrapingToDB_TextFile.py, which connects to MySQL through Connexion.py)

You can install these libraries by running the following command in your terminal:
`pip install beautifulsoup4 requests pymysql`
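
If you prefer, the same dependencies can be listed in a requirements.txt (a minimal sketch, with no versions pinned):

```
beautifulsoup4
requests
pymysql
```

They can then be installed in one step with `pip install -r requirements.txt`.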

## How to Use

To use the web scraper, run whichever of the two scripts matches how you want the data stored.

### ScrapingToDB_TextFile.py

This script saves the scraped data to a MySQL database. Before running it, set up a MySQL database and update Connexion.py with your database credentials. Then navigate to the directory containing the file and run:
`python ScrapingToDB_TextFile.py`

The program will scrape the listings from Realtor.com and save them to the MySQL database and to a local text file named `data.txt`.
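
The script assumes a `house` table already exists in the `realestate` database. Below is a minimal setup sketch using pymysql; the column names match the INSERT in ScrapingToDB_TextFile.py, while the VARCHAR types are an assumption (the scraper stores every field as text):

```python
import pymysql

# One-off setup: create the table the scraper inserts into.
# Host, port, and credentials mirror Connexion.py; adjust to your server.
conn = pymysql.connect(host='localhost', port=3308, user='root', passwd='', db='realestate')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS house (
            location VARCHAR(255),
            status   VARCHAR(64),
            price    VARCHAR(64),
            owner    VARCHAR(255),
            bed      VARCHAR(32),
            bath     VARCHAR(32),
            sqft     VARCHAR(32),
            sqft_lot VARCHAR(32)
        )
    """)
conn.commit()
conn.close()
```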

### ScrapingToCsvFile.py

This script saves the scraped data to a CSV file. To run it, navigate to the directory containing the file and run:
`python ScrapingToCsvFile.py`

The program will scrape the listings from Realtor.com and save them to a CSV file named `housing.csv` in the same directory.
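
To spot-check the output, the file can be read back with Python's standard csv module (a quick sketch):

```python
import csv

# Print the header row and the first few scraped listings from housing.csv.
with open('housing.csv', newline='', encoding='utf8') as f:
    for row in list(csv.reader(f))[:5]:
        print(row)
```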

## Data Collected

The web scraper collects the following data for each real estate listing. Values a listing does not provide are recorded as 'Not specified' (the text fields) or 'NoV' (the four numeric fields):

- Location
- Status
- Price
- Owner
- Number of Bedrooms
- Number of Bathrooms
- Total Square Footage
- Square Footage of Lot

## Contributing

If you would like to contribute to this project, feel free to submit a pull request with your changes.

## Credits

This project was created by Oussama Fikri. The code is based on examples from the Beautiful Soup and requests documentation.
--------------------------------------------------------------------------------