├── README.md
├── airflow
│   ├── airflowdag.py
│   ├── airflowec2.png
│   └── airflowsetup.md
├── ec2
│   └── seleniumboot.sh
├── presentation
│   ├── mypresentation.pdf
│   └── zillowpresentation.pptx
├── scripts
│   ├── market.py
│   ├── market_test.py
│   ├── sold.py
│   ├── zillow_functions.py
│   ├── zillow_recently_sold.py
│   ├── zillow_sold_functions.py
│   └── zillowxgboost.py
└── spark
    ├── sparkbootstrap.sh
    └── sparktopostgres.ipynb

/README.md:
--------------------------------------------------------------------------------
1 | # Zillow Data Engineering End-to-End Pipeline
2 |
3 | ### Contents
4 | I. EC2 - launch a VM with a bootstrap script that installs the scraping dependencies
5 |
6 | II. Scripts - scrape and parse the data; XGBoost regression
7 |
8 | III. Airflow - DAG setup
9 |
10 | IV. Spark - pushing data to a Postgres DB
11 |
12 | V. Presentation - all project objectives can be found here!
13 |
14 | [Click here for the interactive dashboard](https://jerrydatascience.tumblr.com/zillow-interactive)
15 |
16 | [Click here for the machine learning interactive map](https://jerrydatascience.tumblr.com/zillow-machine-learning)
17 |
18 |
19 | ### Brief Introduction
20 | Predictive analytics with real estate data. My goal is to assess which areas will be good real estate investments based on historical and current market data. I started by picking county regions with strong employment rates and large populations - the SF Bay Area (CA), Seattle (WA), and Boston (MA).
21 |
22 | ### Obtaining Data
23 | Initially, I was going to use Zillow's API; however, it returns limited data and caps calls at 1,000 per day. With a data scraper, I can input a list of zip codes and get the details of every listing. Additionally, I can make 3x as many calls to the Zillow site.
24 |
25 | i) Current for-sale properties
26 | ii) Recently sold properties
27 |
28 | Historical real estate sales by zip code since 1996
29 |
30 | Unemployment by zip code -> mapped from county data
31 |
32 | ### Big Data
33 | 1. **Stream**: Scrape current for-sale and recently sold properties from Zillow with a daily Airflow DAG on EC2.
34 | 2. **Store**: Store the unstructured HTML for each listing in S3.
35 | 3. **Structure**: Parse the HTML into CSVs stored in S3, then use Apache Spark to load the data into Postgres tables.
36 | 4. **Synthesize**: 1) Aggregate average house prices by zip code; 2) run an XGBoost regression to predict house prices, plotted as a heat map.
37 | 5. **Show**: Interactive Tableau website
38 |
39 | ### Zillow Automated Scraper
40 | ![](https://i.imgur.com/E6RI8Hm.gif)
41 |
42 | ### Zillow Data Architecture
43 | ![](https://i.imgur.com/bLuGWMj.png)
44 |
45 | ### 8 Properties of Big Data
46 |
47 | #### Robustness and Fault Tolerance
48 | All systems run on AWS, which is highly robust, and they integrate with one another. If the scraper goes down, the data itself remains safe in S3.
49 |
50 | #### Low-latency reads and updates
51 | Since the real estate sales cycle is relatively slow, there is no need for real-time updates.
52 |
53 | #### Scalability
54 | Most of these technologies are highly scalable - S3, Spark, and Tableau in particular. MySQL or Spark SQL might be a better fit than Postgres if scalability becomes a concern.
55 |
56 | #### Generalization
57 | This data pipeline architecture is easily extended to other scraping targets and can be reused.
58 |
59 | #### Extensibility
60 | All of the systems in place support the addition of new data sources, features, and models.
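As a minimal sketch of that extensibility, a new scrape target could be appended to the existing DAG by reusing the `BashOperator` pattern from `airflow/airflowdag.py`. The task id and script path below are hypothetical, and `dag` / `t2` are assumed to be the objects defined in that DAG file:

```python
from airflow.operators.bash_operator import BashOperator

# Hypothetical extra task: scrape one more region with its own script.
t3 = BashOperator(
    task_id='scrape_new_region',                                        # made-up task id
    bash_command='python ~/Programming/zillow/scripts/new_region.py',   # made-up script path
    dag=dag)                                                            # `dag` from airflowdag.py

# Run it after the existing sold-listings scrape (t2).
t3.set_upstream(t2)
```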
61 |
62 | #### Ad hoc queries
63 | Postgres and Tableau are used to run ad hoc queries.
64 |
65 | #### Minimal maintenance
66 | This system is complex and requires monitoring to make sure everything is running. I attempted to set up the Airflow DAG to send an e-mail alert in case of errors.
67 | #### Debuggability
68 | Data is always stored in complete form in S3, so bugs can be easily traced. The data can be re-parsed and the models recomputed if something goes wrong.
69 |
70 |
--------------------------------------------------------------------------------
/airflow/airflowdag.py:
--------------------------------------------------------------------------------
1 | from airflow import DAG
2 | from airflow.operators.bash_operator import BashOperator
3 | from datetime import datetime, timedelta
4 | default_args = {
5 |     'owner': 'airflow',
6 |     'depends_on_past': False,
7 |     'start_date': datetime(2017, 5, 3),
8 |     'email': ['jerrykhong@gmail.com'],
9 |     'email_on_failure': True,
10 |     'email_on_retry': False,
11 |     'retries': 1,
12 |     'retry_delay': timedelta(minutes=5),
13 | }
14 |
15 | dag = DAG(
16 |     'dag_1', default_args=default_args, schedule_interval='30 7 * * *')
17 |
18 | t1 = BashOperator(
19 |     task_id='scrape_market',
20 |     bash_command='python ~/Programming/zillow/scripts/market_test.py',
21 |     dag=dag)
22 |
23 | t2 = BashOperator(
24 |     task_id='scrape_sold',
25 |     bash_command='python ~/Programming/zillow/scripts/zillow_recently_sold.py',
26 |     dag=dag)
27 |
28 | t2.set_upstream(t1)
--------------------------------------------------------------------------------
/airflow/airflowec2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdkhong/Zillow-Data-Engineering/36ee971c6ec0ec3d7df3a33ecf477b56f3824cd8/airflow/airflowec2.png
--------------------------------------------------------------------------------
/airflow/airflowsetup.md:
--------------------------------------------------------------------------------
1 | sudo apt-get install build-essential libsasl2-dev binutils
2 | sudo easy_install -U setuptools
3 | export AIRFLOW_HOME=~/airflow
4 | sudo pip install airflow[s3,python]
5 |
6 | airflow initdb
7 | cd airflow/
8 | mkdir dags
9 | mkdir logs
10 |
11 | vim airflow.cfg
12 | airflow webserver
13 | airflow scheduler
14 |
15 | #test
16 | python airflowdag.py
17 | airflow test dag_1 scrape_market 2017-0
--------------------------------------------------------------------------------
/ec2/seleniumboot.sh:
--------------------------------------------------------------------------------
1 | sudo apt-get update
2 | sudo apt-get install libxss1 libappindicator1 libindicator7
3 | wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
4 | sudo dpkg -i google-chrome*.deb
5 | sudo apt-get install -f
6 | sudo apt-get install xvfb unzip
7 | wget -N http://chromedriver.storage.googleapis.com/2.26/chromedriver_linux64.zip
8 | unzip chromedriver_linux64.zip
9 | chmod +x chromedriver
10 | sudo mv -f chromedriver /usr/local/share/chromedriver
11 | sudo ln -s /usr/local/share/chromedriver /usr/local/bin/chromedriver
12 | sudo ln -s /usr/local/share/chromedriver /usr/bin/chromedriver
13 | sudo apt-get install python-pip
14 | sudo pip install pyvirtualdisplay selenium pandas boto
15 | ## if needed: sudo rm /var/cache/apt/archives/lock; sudo rm /var/lib/dpkg/lock; sudo rm /var/lib/apt/lists/lock
--------------------------------------------------------------------------------
/presentation/mypresentation.pdf:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdkhong/Zillow-Data-Engineering/36ee971c6ec0ec3d7df3a33ecf477b56f3824cd8/presentation/mypresentation.pdf -------------------------------------------------------------------------------- /presentation/zillowpresentation.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdkhong/Zillow-Data-Engineering/36ee971c6ec0ec3d7df3a33ecf477b56f3824cd8/presentation/zillowpresentation.pptx -------------------------------------------------------------------------------- /scripts/market.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import ast 3 | import time 4 | import pandas as pd 5 | from io import StringIO 6 | import boto3 7 | import boto 8 | import zillow_functions as zl 9 | 10 | df = pd.DataFrame({'address': [], 11 | 'bathrooms': [], 12 | 'bedrooms': [], 13 | 'city': [], 14 | 'days_on_zillow': [], 15 | 'price': [], 16 | 'sale_type': [], 17 | 'state': [], 18 | 'sqft': [], 19 | 'url': [], 20 | 'zip': [], 21 | 'zpid': []}) 22 | 23 | 24 | conn = boto.connect_s3() 25 | bucket = conn.get_bucket('zillowstreamjk') 26 | 27 | # CHANGE BUCKET NAME 28 | for key in bucket.list(prefix='rawdata/market/AZ'): 29 | sonnets = bucket.get_key(key.key) 30 | text = sonnets.get_contents_as_string(encoding='utf-8') 31 | x = ast.literal_eval(text) 32 | 33 | for n in range(len(x)): 34 | soup = BeautifulSoup(x[n], "lxml") 35 | new_obs = [] 36 | 37 | # List that contains number of beds, baths, and total sqft (and 38 | # sometimes price as well). 39 | card_info = zl.get_card_info(soup) 40 | 41 | # Street Address 42 | new_obs.append(zl.get_street_address(soup)) 43 | 44 | # Bathrooms 45 | new_obs.append(zl.get_bathrooms(card_info)) 46 | 47 | # Bedrooms 48 | new_obs.append(zl.get_bedrooms(card_info)) 49 | 50 | # City 51 | new_obs.append(zl.get_city(soup)) 52 | 53 | # Days on the Market/Zillow 54 | new_obs.append(zl.get_days_on_market(soup)) 55 | 56 | # Price 57 | new_obs.append(zl.get_price(soup, card_info)) 58 | 59 | # Sale Type (House for Sale, New Construction, Foreclosure, etc.) 60 | new_obs.append(zl.get_sale_type(soup)) 61 | 62 | # Sqft 63 | new_obs.append(zl.get_sqft(card_info)) 64 | 65 | # State 66 | new_obs.append(zl.get_state(soup)) 67 | 68 | # URL for each house listing 69 | new_obs.append(zl.get_url(soup)) 70 | 71 | # Zipcode 72 | new_obs.append(zl.get_zipcode(soup)) 73 | 74 | # Zipco 75 | new_obs.append(zl.get_id(soup)) 76 | 77 | # Append new_obs to df as a new observation 78 | if len(new_obs) == len(df.columns): 79 | df.loc[len(df.index)] = new_obs 80 | 81 | 82 | # Write df to CSV. 
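# The remainder of this script reorders the DataFrame into a fixed column order and
# uploads it to S3 as a timestamped CSV (parsed/market/AZ/<timestamp>.csv) through an
# in-memory StringIO buffer and boto3, so nothing is written to local disk.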
83 | columns = ['address', 'city', 'state', 'zip', 'price', 'sqft', 'bedrooms', 84 | 'bathrooms', 'days_on_zillow', 'sale_type', 'url', 'zpid'] 85 | 86 | df = df[columns] 87 | localtime = time.localtime() 88 | timeString = time.strftime("%Y%m%d%H%M%S", localtime) 89 | csv_buffer = StringIO() 90 | df.to_csv(csv_buffer, index=False) 91 | # Upload CSV to S3, REMEMBER TO CHANGE KEY NAME 92 | s3_key = 'parsed/market/AZ/' + ''.join(timeString) + ".csv" 93 | s3_resource = boto3.resource('s3') 94 | s3_resource.Object('zillowstreamjk', s3_key).put(Body=csv_buffer.getvalue()) -------------------------------------------------------------------------------- /scripts/market_test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import zillow_functions as zl 3 | from bs4 import BeautifulSoup 4 | import boto 5 | import pandas as pd 6 | 7 | 8 | # List of zip codes 9 | df = pd.read_csv('~/Programming/zillow/work_zip_codes1.csv') 10 | 11 | # CHANGE THE S3 FOLDER NAME!!! 12 | # state = df[df['zip'] == 95138] 13 | temp = df.ix[:, 'zip'].tolist() 14 | # zipcodes = [temp[(i + 1) * 20: (i + 2) * 20] for i in range(int(len(temp) / 20))] 15 | 16 | 17 | # Initialize the webdriver. 18 | driver = zl.init_driver("/anaconda/bin/chromedriver") 19 | 20 | # Go to www.zillow.com/homes 21 | zl.navigate_to_website(driver, "http://www.zillow.com/homes") 22 | 23 | # Click the "buy" button. 24 | zl.click_buy_button(driver) 25 | 26 | 27 | def scrape_data(zc): 28 | st = zc 29 | 30 | conn = boto.connect_s3() 31 | bucket = conn.get_bucket('zillowstreamjk') 32 | 33 | # Create 11 variables from the scrapped HTML data. 34 | # These variables will make up the final output dataframe. 35 | # df = pd.DataFrame({'address': [], 36 | # 'bathrooms': [], 37 | # 'bedrooms': [], 38 | # 'city': [], 39 | # 'days_on_zillow': [], 40 | # 'price': [], 41 | # 'sale_type': [], 42 | # 'state': [], 43 | # 'sqft': [], 44 | # 'url': [], 45 | # 'zip': []}) 46 | 47 | # Get total number of search terms. 48 | numSearchTerms = len(st) 49 | 50 | # Start the scraping. 51 | 52 | for k in range(numSearchTerms): 53 | # Define search term (must be str object). 54 | search_term = st[k] 55 | 56 | # Enter search term and execute search. 57 | if zl.enter_search_term(driver, search_term): 58 | print("Entering search term number " + str(k + 1) + 59 | " out of " + str(numSearchTerms)) 60 | else: 61 | print("Search term " + str(k + 1) + 62 | " failed, moving onto next search term\n***") 63 | continue 64 | 65 | # Check to see if any results were returned from the search. 66 | # If there were none, move onto the next search. 67 | if zl.results_test(driver): 68 | print("Search " + str(search_term) + 69 | " returned zero results. Moving onto the next search\n***") 70 | continue 71 | 72 | # Pull the html for each page of search results. Zillow caps results at 73 | # 20 pages, each page can contain 26 home listings, thus the cap on home 74 | # listings per search is 520. 75 | rawdata = zl.get_html(driver) 76 | print(str(len(rawdata)) + " pages of listings found") 77 | listings = zl.get_listings(rawdata) 78 | # Take the extracted HTML and split it up by individual home listings. 
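# No parsing happens in this script: the raw listing HTML is dumped to S3 as a
# timestamped text object (a boto2 Key holding the str() of the listings list), and
# market.py does the parsing in a separate pass by reading the raw objects back from S3
# and literal-evaling the stored list.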
79 | 80 | k = boto.s3.key.Key(bucket) 81 | localtime = time.localtime() 82 | timeString = time.strftime("%Y%m%d%H%M%S", localtime) 83 | k.key = 'market/general/LA' + ''.join(timeString) 84 | k.content_type = 'text/html' 85 | k.set_contents_from_string(str(listings), policy='public-read') 86 | 87 | 88 | # for key, value in enumerate(zipcodes): 89 | # scrape_data(value) 90 | scrape_data(temp) 91 | # scrape_data(['94105']) 92 | 93 | # Close the webdriver connection. 94 | zl.close_connection(driver) 95 | -------------------------------------------------------------------------------- /scripts/sold.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import ast 3 | import time 4 | import pandas as pd 5 | from io import StringIO 6 | import boto3 7 | import boto 8 | import zillow_sold_functions as zsl 9 | 10 | df = pd.DataFrame({'address': [], 11 | 'bathrooms': [], 12 | 'bedrooms': [], 13 | 'city': [], 14 | 'sale_type': [], 15 | 'state': [], 16 | 'sqft': [], 17 | 'url': [], 18 | 'zip': [], 19 | 'zpid':[]}) 20 | 21 | 22 | conn = boto.connect_s3() 23 | bucket = conn.get_bucket('zillowstreamjk') 24 | 25 | # CHANGE BUCKET NAME 26 | for key in bucket.list(prefix='rawdata/sold/AZ'): 27 | sonnets = bucket.get_key(key.key) 28 | text = sonnets.get_contents_as_string(encoding='utf-8') 29 | x = ast.literal_eval(text) 30 | 31 | for n in range(len(x)): 32 | soup = BeautifulSoup(x[n], "lxml") 33 | new_obs = [] 34 | 35 | # List that contains number of beds, baths, and total sqft (and 36 | # sometimes price as well). 37 | card_info = zsl.get_card_info(soup) 38 | 39 | # Street Address 40 | new_obs.append(zsl.get_street_address(soup)) 41 | 42 | # Bathrooms 43 | new_obs.append(zsl.get_bathrooms(card_info)) 44 | 45 | # Bedrooms 46 | new_obs.append(zsl.get_bedrooms(card_info)) 47 | 48 | # City 49 | new_obs.append(zsl.get_city(soup)) 50 | 51 | 52 | # Sale Type (House for Sale, New Construction, Foreclosure, etc.) 53 | new_obs.append(zsl.get_sale_type(soup)) 54 | 55 | # Sqft 56 | new_obs.append(zsl.get_sqft(card_info)) 57 | 58 | # State 59 | new_obs.append(zsl.get_state(soup)) 60 | 61 | # URL for each house listing 62 | new_obs.append(zsl.get_url(soup)) 63 | 64 | # Zipcode 65 | new_obs.append(zsl.get_zipcode(soup)) 66 | 67 | # Zipcode 68 | new_obs.append(zsl.get_id(soup)) 69 | 70 | # Append new_obs to df as a new observation 71 | if len(new_obs) == len(df.columns): 72 | df.loc[len(df.index)] = new_obs 73 | 74 | 75 | # Write df to CSV. 
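# Note: the sold-listing cards do not expose price or days_on_zillow directly; the sale
# price is embedded in the sale_type text (e.g. something like 'SOLD: $1.2M') and is
# recovered later in spark/sparktopostgres.ipynb before loading into Postgres.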
76 | columns = ['address', 'city', 'state', 'zip', 'sqft', 'bedrooms', 77 | 'bathrooms', 'sale_type', 'url', 'zpid'] 78 | 79 | df = df[columns] 80 | localtime = time.localtime() 81 | timeString = time.strftime("%Y%m%d%H%M%S", localtime) 82 | csv_buffer = StringIO() 83 | df.to_csv(csv_buffer, index=False) 84 | # Upload CSV to S3, REMEMBER TO CHANGE KEY NAME 85 | s3_key = 'parsed/sold/AZ/' + ''.join(timeString) + ".csv" 86 | s3_resource = boto3.resource('s3') 87 | s3_resource.Object('zillowstreamjk', s3_key).put(Body=csv_buffer.getvalue()) -------------------------------------------------------------------------------- /scripts/zillow_functions.py: -------------------------------------------------------------------------------- 1 | import re as re 2 | import time 3 | import zipcode 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from selenium.common.exceptions import TimeoutException 9 | from selenium.common.exceptions import NoSuchElementException 10 | 11 | 12 | def zipcodes_list(st_items): 13 | # If st_items is a single zipcode string. 14 | if type(st_items) == str: 15 | zcObjects = zipcode.islike(st_items) 16 | output = [str(i).split(" ", 1)[1].split(">")[0] 17 | for i in zcObjects] 18 | # If st_items is a list of zipcode strings. 19 | elif type(st_items) == list: 20 | zcObjects = [n for i in st_items for n in zipcode.islike(str(i))] 21 | output = [str(i).split(" ", 1)[1].split(">")[0] 22 | for i in zcObjects] 23 | else: 24 | raise ValueError("input 'st_items' must be of type str or list") 25 | return (output) 26 | 27 | 28 | def init_driver(filepath): 29 | driver = webdriver.Chrome(executable_path=filepath) 30 | driver.wait = WebDriverWait(driver, 10) 31 | return (driver) 32 | 33 | 34 | def navigate_to_website(driver, site): 35 | driver.get(site) 36 | 37 | 38 | def click_buy_button(driver): 39 | try: 40 | button = driver.wait.until(EC.element_to_be_clickable( 41 | (By.CLASS_NAME, "nav-header"))) 42 | button.click() 43 | time.sleep(8) 44 | except (TimeoutException, NoSuchElementException): 45 | raise ValueError("Clicking the 'Buy' button failed") 46 | 47 | 48 | def enter_search_term(driver, search_term): 49 | try: 50 | searchBar = driver.wait.until(EC.presence_of_element_located( 51 | (By.ID, "citystatezip"))) 52 | button = driver.wait.until(EC.element_to_be_clickable( 53 | (By.CLASS_NAME, "zsg-icon-searchglass"))) 54 | searchBar.clear() 55 | time.sleep(2) 56 | searchBar.send_keys(search_term) 57 | time.sleep(2) 58 | button.click() 59 | time.sleep(2) 60 | return (True) 61 | except (TimeoutException, NoSuchElementException): 62 | return (False) 63 | 64 | 65 | def results_test(driver): 66 | # Check to see if there are any returned results 67 | try: 68 | no_results = driver.find_element_by_css_selector( 69 | '.zoom-out-message').is_displayed() 70 | except (NoSuchElementException, TimeoutException): 71 | # Check to see if the zipcode is invalid or not 72 | try: 73 | no_results = driver.find_element_by_class_name( 74 | 'zsg-icon-x-thick').is_displayed() 75 | except (NoSuchElementException, TimeoutException): 76 | no_results = False 77 | return (no_results) 78 | 79 | 80 | def get_html(driver): 81 | output = [] 82 | keep_going = True 83 | while keep_going: 84 | # Pull page HTML 85 | try: 86 | output.append(driver.page_source) 87 | except TimeoutException: 88 | pass 89 | try: 90 | # Check to see if a "next page" link exists 91 | 
keep_going = driver.find_element_by_class_name( 92 | 'zsg-pagination-next').is_displayed() 93 | except NoSuchElementException: 94 | keep_going = False 95 | if keep_going: 96 | # Test to ensure the "updating results" image isnt displayed. 97 | # Will try up to 5 times before giving up, with a 5 second wait 98 | # between each try. 99 | tries = 5 100 | try: 101 | cover = driver.find_element_by_class_name( 102 | 'list-loading-message-cover').is_displayed() 103 | except (TimeoutException, NoSuchElementException): 104 | cover = False 105 | while cover and tries > 0: 106 | time.sleep(4) 107 | tries -= 1 108 | try: 109 | cover = driver.find_element_by_class_name( 110 | 'list-loading-message-cover').is_displayed() 111 | except (TimeoutException, NoSuchElementException): 112 | cover = False 113 | # If the "updating results" image is confirmed to be gone 114 | # (cover == False), click next page. Otherwise, give up on trying 115 | # to click thru to the next page of house results, and return the 116 | # results that have been scraped up to the current page. 117 | if cover == False: 118 | try: 119 | driver.wait.until(EC.element_to_be_clickable( 120 | (By.CLASS_NAME, 'zsg-pagination-next'))).click() 121 | time.sleep(2) 122 | except TimeoutException: 123 | keep_going = False 124 | else: 125 | keep_going = False 126 | return (output) 127 | 128 | 129 | def get_listings(list_obj): 130 | # Split the raw HTML into segments, one for each listing. 131 | output = [] 132 | for i in list_obj: 133 | htmlSplit = i.split('" id="zpid_')[1:] 134 | output += htmlSplit 135 | print(str(len(output)) + " home listings scraped\n***") 136 | return (output) 137 | 138 | 139 | def get_street_address(soup_obj): 140 | try: 141 | street = soup_obj.find( 142 | "span", {"itemprop": "streetAddress"}).get_text().strip() 143 | except (ValueError, AttributeError): 144 | street = "NA" 145 | if len(street) == 0 or street == "null": 146 | street = "NA" 147 | return (street) 148 | 149 | 150 | def get_city(soup_obj): 151 | try: 152 | city = soup_obj.find( 153 | "span", {"itemprop": "addressLocality"}).get_text().strip() 154 | except (ValueError, AttributeError): 155 | city = "NA" 156 | if len(city) == 0 or city == "null": 157 | city = "NA" 158 | return (city) 159 | 160 | 161 | def get_state(soup_obj): 162 | try: 163 | state = soup_obj.find( 164 | "span", {"itemprop": "addressRegion"}).get_text().strip() 165 | except (ValueError, AttributeError): 166 | state = "NA" 167 | if len(state) == 0 or state == 'null': 168 | state = "NA" 169 | return (state) 170 | 171 | 172 | def get_zipcode(soup_obj): 173 | try: 174 | zipcode = soup_obj.find( 175 | "span", {"itemprop": "postalCode"}).get_text().strip() 176 | except (ValueError, AttributeError): 177 | zipcode = "NA" 178 | if len(zipcode) == 0 or zipcode == 'null': 179 | zipcode = "NA" 180 | return (zipcode) 181 | 182 | 183 | def get_price(soup_obj, list_obj): 184 | # Look for price within the BeautifulSoup object. 185 | try: 186 | price = soup_obj.find( 187 | "span", {"class": "zsg-photo-card-price"}).get_text().strip() 188 | except (ValueError, AttributeError): 189 | # If that fails, look for price within list_obj (object "card_info"). 
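# card_info is the photo-card text split on ' · ', e.g. something like
# ['3 bds', '2 ba', '1,540 sqft', '$425K'] (illustrative values only). The fallback below
# keeps the first element containing '$', 'K', or 'k' and extracts its numeric token.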
190 | try: 191 | price = [n for n in list_obj 192 | if any(["$" in n, "K" in n, "k" in n])] 193 | if len(price) > 0: 194 | price = price[0].split(" ") 195 | price = [n for n in price if re.search("[0-9]", n) is not None] 196 | if len(price[0]) > 0: 197 | price = price[0] 198 | else: 199 | price = "NA" 200 | else: 201 | price = "NA" 202 | except (ValueError, AttributeError): 203 | price = "NA" 204 | if len(price) == 0 or price == "null": 205 | price = "NA" 206 | if price is not "NA": 207 | # Transformations to the price string. 208 | price = price.replace(",", "").replace("+", "").replace("$", "") 209 | if any(["K" in price, "k" in price]): 210 | price = price.lower().split("k")[0].strip() 211 | price = price + "000" 212 | if any(["M" in price, "m" in price]): 213 | price = price.lower().split("m")[0].strip() 214 | if "." not in price: 215 | price = price + "000000" 216 | else: 217 | pricelen = len(price.split('.')[0]) + 6 218 | price = price.replace('.', '') 219 | diff = pricelen - len(price) 220 | price = price + (diff * "0") 221 | if len(price) == 0: 222 | price = 'NA' 223 | return (price) 224 | 225 | 226 | def get_card_info(soup_obj): 227 | # For most listings, card_info will contain info on number of bedrooms, 228 | # number of bathrooms, square footage, and sometimes price. 229 | try: 230 | card = soup_obj.find( 231 | "span", {"class": "zsg-photo-card-info"}).get_text().split(" · ") 232 | except (ValueError, AttributeError): 233 | card = "NA" 234 | if len(card) == 0 or card == 'null': 235 | card = "NA" 236 | return (card) 237 | 238 | 239 | def get_sqft(list_obj): 240 | sqft = [n for n in list_obj if "sqft" in n] 241 | if len(sqft) > 0: 242 | try: 243 | sqft = float(sqft[0].split("sqft")[0].strip().replace(",", "").replace("+", "")) 244 | except (ValueError, IndexError): 245 | sqft = "NA" 246 | if sqft == 0: 247 | sqft = "NA" 248 | else: 249 | sqft = "NA" 250 | return (sqft) 251 | 252 | 253 | def get_bedrooms(list_obj): 254 | beds = [n for n in list_obj if any(["bd" in n, "tudio" in n])] 255 | if len(beds) > 0: 256 | if any([beds[0] == "Studio", beds[0] == "studio"]): 257 | beds = 0 258 | return (beds) 259 | try: 260 | beds = float(beds[0].split("bd")[0].strip()) 261 | except (ValueError, IndexError): 262 | if any([beds[0] == "Studio", beds[0] == "studio"]): 263 | beds = 0 264 | else: 265 | beds = "NA" 266 | else: 267 | beds = "NA" 268 | return (beds) 269 | 270 | 271 | def get_bathrooms(list_obj): 272 | baths = [n for n in list_obj if "ba" in n] 273 | if len(baths) > 0: 274 | try: 275 | baths = float(baths[0].split("ba")[0].strip()) 276 | except (ValueError, IndexError): 277 | baths = "NA" 278 | if baths == 0: 279 | baths = "NA" 280 | else: 281 | baths = "NA" 282 | return (baths) 283 | 284 | 285 | def get_days_on_market(soup_obj): 286 | try: 287 | dom = soup_obj.find_all( 288 | "span", {"class": "zsg-photo-card-notification"}) 289 | dom = [n for n in dom if "illow" in n.get_text()] 290 | if len(dom) > 0: 291 | dom = dom[0].get_text().strip() 292 | dom = int(dom.split(" ")[0]) 293 | else: 294 | dom = "NA" 295 | except (ValueError, AttributeError): 296 | dom = "NA" 297 | return (dom) 298 | 299 | 300 | def get_sale_type(soup_obj): 301 | try: 302 | saletype = soup_obj.find( 303 | "span", {"class": "zsg-photo-card-status"}).get_text().strip() 304 | except (ValueError, AttributeError): 305 | saletype = "NA" 306 | if len(saletype) == 0 or saletype == 'null': 307 | saletype = "NA" 308 | return (saletype) 309 | 310 | 311 | def get_url(soup_obj): 312 | # Try to find url in the BeautifulSoup 
object. 313 | href = [n["href"] for n in soup_obj.find_all("a", href=True)] 314 | url = [i for i in href if "homedetails" in i] 315 | if len(url) > 0: 316 | url = "http://www.zillow.com/homes/for_sale/" + url[0] 317 | else: 318 | # If that fails, contruct the url from the zpid of the listing. 319 | url = [i for i in href if "zpid" in i and "avorite" not in i] 320 | if len(url) > 0: 321 | zpid = re.findall(r"\d{8,10}", href[0]) 322 | if zpid is not None and len(zpid) > 0: 323 | url = 'http://www.zillow.com/homes/for_sale/' \ 324 | + str(zpid[0]) \ 325 | + '_zpid/any_days/globalrelevanceex_sort/29.759534,' \ 326 | + '-95.335321,29.675003,-95.502863_rect/12_zm/' 327 | else: 328 | url = "NA" 329 | else: 330 | url = "NA" 331 | return (url) 332 | 333 | 334 | def get_id(soup_obj): 335 | # Try to find url in the BeautifulSoup object. 336 | href = [n["href"] for n in soup_obj.find_all("a", href=True)] 337 | url = [i for i in href if "homedetails" in i] 338 | zpid = re.findall(r"\d{7,11}", href[0]) 339 | if zpid is not None and len(zpid) > 0: 340 | zid = zpid 341 | else: 342 | zid = "NA" 343 | return ''.join(zid) 344 | 345 | def close_connection(driver): 346 | driver.quit() 347 | -------------------------------------------------------------------------------- /scripts/zillow_recently_sold.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import time 3 | import zillow_sold_functions as zsl 4 | from bs4 import BeautifulSoup 5 | import boto 6 | import pandas as pd 7 | import json 8 | 9 | # List of zip codes 10 | df = pd.read_csv('~/Programming/zillow/work_zip_codes1.csv') 11 | 12 | # CHANGE THE S3 FOLDER NAME!!! ended on 45 of 57 13 | # state = df[df['zip'] == 95138] 14 | temp = df.ix[:, 'zip'].tolist() 15 | #temp = df.ix[:, 'zip'].tolist() 16 | 17 | # Initialize the webdriver. 18 | driver = zsl.init_driver("/anaconda/bin/chromedriver") 19 | 20 | # Go to www.zillow.com/homes 21 | zsl.navigate_to_website(driver, "https://www.zillow.com/homes/recently_sold") 22 | 23 | 24 | def scrape_data(zc): 25 | st = zc 26 | print(st) 27 | # Get total number of search terms. 28 | numSearchTerms = len(st) 29 | 30 | # Start the scraping. 31 | conn = boto.connect_s3() 32 | bucket = conn.get_bucket('zillowstreamjk') 33 | 34 | for k in range(numSearchTerms): 35 | # Define search term (must be str object). 36 | search_term = st[k] 37 | 38 | # Enter search term and execute search. 39 | if zsl.enter_search_term(driver, search_term): 40 | print("Entering search term number " + str(k + 1) + 41 | " out of " + str(numSearchTerms)) 42 | else: 43 | print("Search term " + str(k + 1) + 44 | " failed, moving onto next search term\n***") 45 | continue 46 | 47 | # Check to see if any results were returned from the search. 48 | # If there were none, move onto the next search. 49 | if zsl.results_test(driver): 50 | print("Search " + str(search_term) + 51 | " returned zero results. Moving onto the next search\n***") 52 | continue 53 | 54 | # Pull the html for each page of search results. Zillow caps results at 55 | # 20 pages, each page can contain 26 home listings, thus the cap on home 56 | # listings per search is 520. 57 | rawdata = zsl.get_html(driver) 58 | print(str(len(rawdata)) + " pages of listings found") 59 | # listings = zsl.get_listings(rawdata) 60 | # Take the extracted HTML and split it up by individual home listings. 
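# get_listings() splits each page's HTML on the '" id="zpid_' marker, so each element of
# `listings` is the HTML fragment for a single home card.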
61 | listings = zsl.get_listings(rawdata) 62 | 63 | k = boto.s3.key.Key(bucket) 64 | localtime = time.localtime() 65 | timeString = time.strftime("%Y%m%d%H%M%S", localtime) 66 | k.key = 'rawdata/sold/general/' + ''.join(timeString) 67 | k.content_type = 'text/html' 68 | k.set_contents_from_string(str(listings), policy='public-read') 69 | 70 | # 71 | # for key, value in enumerate(zipcodes): 72 | # scrape_data(value) 73 | scrape_data(temp) 74 | 75 | # Close the webdriver connection. 76 | zsl.close_connection(driver) 77 | -------------------------------------------------------------------------------- /scripts/zillow_sold_functions.py: -------------------------------------------------------------------------------- 1 | import re as re 2 | import time 3 | import zipcode 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from selenium.common.exceptions import TimeoutException 9 | from selenium.common.exceptions import NoSuchElementException 10 | 11 | 12 | def zipcodes_list(st_items): 13 | # If st_items is a single zipcode string. 14 | if type(st_items) == str: 15 | zcObjects = zipcode.islike(st_items) 16 | output = [str(i).split(" ", 1)[1].split(">")[0] 17 | for i in zcObjects] 18 | # If st_items is a list of zipcode strings. 19 | elif type(st_items) == list: 20 | zcObjects = [n for i in st_items for n in zipcode.islike(str(i))] 21 | output = [str(i).split(" ", 1)[1].split(">")[0] 22 | for i in zcObjects] 23 | else: 24 | raise ValueError("input 'st_items' must be of type str or list") 25 | return (output) 26 | 27 | 28 | def init_driver(filepath): 29 | driver = webdriver.Chrome(executable_path=filepath) 30 | driver.wait = WebDriverWait(driver, 10) 31 | return (driver) 32 | 33 | 34 | def navigate_to_website(driver, site): 35 | driver.get(site) 36 | 37 | 38 | def enter_search_term(driver, search_term): 39 | try: 40 | searchBar = driver.wait.until(EC.presence_of_element_located( 41 | (By.ID, "citystatezip"))) 42 | button = driver.wait.until(EC.element_to_be_clickable( 43 | (By.CLASS_NAME, "zsg-icon-searchglass"))) 44 | searchBar.clear() 45 | time.sleep(2) 46 | searchBar.send_keys(search_term) 47 | time.sleep(2) 48 | button.click() 49 | time.sleep(2) 50 | return (True) 51 | except (TimeoutException, NoSuchElementException): 52 | return (False) 53 | 54 | 55 | def results_test(driver): 56 | # Check to see if there are any returned results 57 | try: 58 | no_results = driver.find_element_by_css_selector( 59 | '.zoom-out-message').is_displayed() 60 | except (NoSuchElementException, TimeoutException): 61 | # Check to see if the zipcode is invalid or not 62 | try: 63 | no_results = driver.find_element_by_class_name( 64 | 'zsg-icon-x-thick').is_displayed() 65 | except (NoSuchElementException, TimeoutException): 66 | no_results = False 67 | return (no_results) 68 | 69 | 70 | def get_html(driver): 71 | output = [] 72 | keep_going = True 73 | while keep_going: 74 | # Pull page HTML 75 | try: 76 | output.append(driver.page_source) 77 | except TimeoutException: 78 | pass 79 | try: 80 | # Check to see if a "next page" link exists 81 | keep_going = driver.find_element_by_class_name( 82 | 'zsg-pagination-next').is_displayed() 83 | except NoSuchElementException: 84 | keep_going = False 85 | if keep_going: 86 | # Test to ensure the "updating results" image isnt displayed. 
87 | # Will try up to 5 times before giving up, with a 5 second wait 88 | # between each try. 89 | tries = 5 90 | try: 91 | cover = driver.find_element_by_class_name( 92 | 'list-loading-message-cover').is_displayed() 93 | except (TimeoutException, NoSuchElementException): 94 | cover = False 95 | while cover and tries > 0: 96 | time.sleep(4) 97 | tries -= 1 98 | try: 99 | cover = driver.find_element_by_class_name( 100 | 'list-loading-message-cover').is_displayed() 101 | except (TimeoutException, NoSuchElementException): 102 | cover = False 103 | # If the "updating results" image is confirmed to be gone 104 | # (cover == False), click next page. Otherwise, give up on trying 105 | # to click thru to the next page of house results, and return the 106 | # results that have been scraped up to the current page. 107 | if cover == False: 108 | try: 109 | driver.wait.until(EC.element_to_be_clickable( 110 | (By.CLASS_NAME, 'zsg-pagination-next'))).click() 111 | time.sleep(2) 112 | except TimeoutException: 113 | keep_going = False 114 | else: 115 | keep_going = False 116 | return (output) 117 | 118 | 119 | def get_listings(list_obj): 120 | # Split the raw HTML into segments, one for each listing. 121 | output = [] 122 | for i in list_obj: 123 | htmlSplit = i.split('" id="zpid_')[1:] 124 | output += htmlSplit 125 | print(str(len(output)) + " home listings scraped\n***") 126 | return (output) 127 | 128 | 129 | def get_street_address(soup_obj): 130 | try: 131 | street = soup_obj.find( 132 | "span", {"itemprop": "streetAddress"}).get_text().strip() 133 | except (ValueError, AttributeError): 134 | street = "NA" 135 | if len(street) == 0 or street == "null": 136 | street = "NA" 137 | return (street) 138 | 139 | 140 | def get_city(soup_obj): 141 | try: 142 | city = soup_obj.find( 143 | "span", {"itemprop": "addressLocality"}).get_text().strip() 144 | except (ValueError, AttributeError): 145 | city = "NA" 146 | if len(city) == 0 or city == "null": 147 | city = "NA" 148 | return (city) 149 | 150 | 151 | def get_state(soup_obj): 152 | try: 153 | state = soup_obj.find( 154 | "span", {"itemprop": "addressRegion"}).get_text().strip() 155 | except (ValueError, AttributeError): 156 | state = "NA" 157 | if len(state) == 0 or state == 'null': 158 | state = "NA" 159 | return (state) 160 | 161 | 162 | def get_zipcode(soup_obj): 163 | try: 164 | zipcode = soup_obj.find( 165 | "span", {"itemprop": "postalCode"}).get_text().strip() 166 | except (ValueError, AttributeError): 167 | zipcode = "NA" 168 | if len(zipcode) == 0 or zipcode == 'null': 169 | zipcode = "NA" 170 | return (zipcode) 171 | 172 | 173 | def get_price(soup_obj, list_obj): 174 | # Look for price within the BeautifulSoup object. 175 | try: 176 | price = soup_obj.find( 177 | "span", {"class": "zsg-photo-card-price"}).get_text().strip() 178 | except (ValueError, AttributeError): 179 | # If that fails, look for price within list_obj (object "card_info"). 180 | try: 181 | price = [n for n in list_obj 182 | if any(["$" in n, "K" in n, "k" in n])] 183 | if len(price) > 0: 184 | price = price[0].split(" ") 185 | price = [n for n in price if re.search("[0-9]", n) is not None] 186 | if len(price[0]) > 0: 187 | price = price[0] 188 | else: 189 | price = "NA" 190 | else: 191 | price = "NA" 192 | except (ValueError, AttributeError): 193 | price = "NA" 194 | if len(price) == 0 or price == "null": 195 | price = "NA" 196 | if price is not "NA": 197 | # Transformations to the price string. 
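# Worked examples of the normalization below (illustrative inputs):
#   '$425K'  -> strip '$', split on 'k' -> '425', append '000'        -> '425000'
#   '$1.25M' -> strip '$', split on 'm' -> '1.25', pricelen = 1 + 6 = 7,
#               digits '125', pad with 7 - 3 = 4 zeros                -> '1250000'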
198 | price = price.replace(",", "").replace("+", "").replace("$", "") 199 | if any(["K" in price, "k" in price]): 200 | price = price.lower().split("k")[0].strip() 201 | price = price + "000" 202 | if any(["M" in price, "m" in price]): 203 | price = price.lower().split("m")[0].strip() 204 | if "." not in price: 205 | price = price + "000000" 206 | else: 207 | pricelen = len(price.split('.')[0]) + 6 208 | price = price.replace('.', '') 209 | diff = pricelen - len(price) 210 | price = price + (diff * "0") 211 | if len(price) == 0: 212 | price = 'NA' 213 | return (price) 214 | 215 | 216 | def get_card_info(soup_obj): 217 | # For most listings, card_info will contain info on number of bedrooms, 218 | # number of bathrooms, square footage, and sometimes price. 219 | try: 220 | card = soup_obj.find( 221 | "span", {"class": "zsg-photo-card-info"}).get_text().split(" · ") 222 | except (ValueError, AttributeError): 223 | card = "NA" 224 | if len(card) == 0 or card == 'null': 225 | card = "NA" 226 | return (card) 227 | 228 | 229 | def get_sqft(list_obj): 230 | sqft = [n for n in list_obj if " sqft" in n] 231 | if len(sqft) > 0: 232 | try: 233 | sqft = float(sqft[0].split("sqft")[0].strip().replace(",", "").replace("+", "")) 234 | except (ValueError, IndexError): 235 | sqft = "NA" 236 | if sqft == 0: 237 | sqft = "NA" 238 | else: 239 | sqft = "NA" 240 | return (sqft) 241 | 242 | 243 | def get_bedrooms(list_obj): 244 | beds = [n for n in list_obj if any(["bd" in n, "tudio" in n])] 245 | if len(beds) > 0: 246 | if any([beds[0] == "Studio", beds[0] == "studio"]): 247 | beds = 0 248 | return (beds) 249 | try: 250 | beds = float(beds[0].split("bd")[0].strip()) 251 | except (ValueError, IndexError): 252 | if any([beds[0] == "Studio", beds[0] == "studio"]): 253 | beds = 0 254 | else: 255 | beds = "NA" 256 | else: 257 | beds = "NA" 258 | return (beds) 259 | 260 | 261 | def get_bathrooms(list_obj): 262 | baths = [n for n in list_obj if "ba" in n] 263 | if len(baths) > 0: 264 | try: 265 | baths = float(baths[0].split("ba")[0].strip()) 266 | except (ValueError, IndexError): 267 | baths = "NA" 268 | if baths == 0: 269 | baths = "NA" 270 | else: 271 | baths = "NA" 272 | return (baths) 273 | 274 | 275 | def get_days_on_market(soup_obj): 276 | try: 277 | dom = soup_obj.find_all( 278 | "span", {"class": "zsg-photo-card-notification"}) 279 | dom = [n for n in dom if "illow" in n.get_text()] 280 | if len(dom) > 0: 281 | dom = dom[0].get_text().strip() 282 | dom = int(dom.split(" ")[0]) 283 | else: 284 | dom = "NA" 285 | except (ValueError, AttributeError): 286 | dom = "NA" 287 | return (dom) 288 | 289 | 290 | def get_sale_type(soup_obj): 291 | try: 292 | saletype = soup_obj.find( 293 | "span", {"class": "zsg-photo-card-status"}).get_text().strip() 294 | except (ValueError, AttributeError): 295 | saletype = "NA" 296 | if len(saletype) == 0 or saletype == 'null': 297 | saletype = "NA" 298 | return (saletype) 299 | 300 | 301 | def get_url(soup_obj): 302 | # Try to find url in the BeautifulSoup object. 303 | href = [n["href"] for n in soup_obj.find_all("a", href=True)] 304 | url = [i for i in href if "homedetails" in i] 305 | if len(url) > 0: 306 | url = "http://www.zillow.com" + url[0] 307 | else: 308 | # If that fails, contruct the url from the zpid of the listing. 
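# The fallback below pulls a 7-10 digit zpid out of the first href and drops it into a
# search-style URL; the lat/long rectangle appended to that URL is a fixed, hard-coded
# bounding box rather than anything derived from the listing.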
309 | url = [i for i in href if "zpid" in i and "avorite" not in i] 310 | if len(url) > 0: 311 | zpid = re.findall(r"\d{7,10}", href[0]) 312 | if zpid is not None and len(zpid) > 0: 313 | url = 'http://www.zillow.com/' \ 314 | + str(zpid[0]) \ 315 | + '_zpid/any_days/globalrelevanceex_sort/29.759534,' \ 316 | + '-95.335321,29.675003,-95.502863_rect/12_zm/' 317 | else: 318 | url = "NA" 319 | else: 320 | url = "NA" 321 | return (url) 322 | 323 | 324 | def get_id(soup_obj): 325 | # Try to find url in the BeautifulSoup object. 326 | href = [n["href"] for n in soup_obj.find_all("a", href=True)] 327 | url = [i for i in href if "homedetails" in i] 328 | zpid = re.findall(r"\d{8,11}", href[0]) 329 | if zpid is not None and len(zpid) > 0: 330 | zid = zpid 331 | else: 332 | zid = "NA" 333 | return ''.join(zid) 334 | 335 | 336 | def close_connection(driver): 337 | driver.quit() 338 | -------------------------------------------------------------------------------- /scripts/zillowxgboost.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import numpy as np 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import itertools 6 | import datetime 7 | pd.set_option('display.max_columns', None) 8 | from sklearn.cross_validation import KFold 9 | from sklearn.cross_validation import train_test_split 10 | import xgboost as xgb 11 | from operator import itemgetter 12 | import time 13 | from sklearn import preprocessing 14 | 15 | 16 | def write_to_csv(output,score): 17 | now = datetime.datetime.now() 18 | sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv' 19 | print('Writing submission: ', sub_file) 20 | f = open(sub_file, 'w') 21 | prediction_file_object = csv.writer(f) 22 | prediction_file_object.writerow(["zpid","price"]) # don't forget the headers 23 | 24 | for i in range(len(test)): 25 | prediction_file_object.writerow([test["zpid"][test.index[i]], (output[i])]) 26 | 27 | def get_features(train, test): 28 | trainval = list(train.columns.values) # list train features 29 | testval = list(test.columns.values) # list test features 30 | output = list(set(trainval) & set(testval)) # check wich features are in common (remove the outcome column) 31 | output.remove('zpid') # remove non-usefull id column 32 | return output 33 | 34 | 35 | def process_features(train, test): 36 | tables = [test, train] 37 | print("Handling missing values...") 38 | total_missing = train.isnull().sum() 39 | to_delete = total_missing[total_missing > (1460 / 3.)] # select features with more than 1/3 missing values 40 | for table in tables: 41 | table.drop(to_delete.index.tolist(), axis=1, inplace=True) 42 | 43 | print("Filling Nan...") 44 | numerical_features = test.select_dtypes(include=["float", "int", "bool"]).columns.values 45 | categorical_features = train.select_dtypes(include=["object"]).columns.values 46 | for table in tables: 47 | for feature in numerical_features: 48 | table[feature].fillna(train[feature].median(), inplace=True) # replace by median value 49 | for feature in categorical_features: 50 | table[feature].fillna(train[feature].value_counts().idxmax(), 51 | inplace=True) # replace by most frequent value 52 | 53 | print("Handling categorical features...") 54 | for feature in categorical_features: # Encode categorical features 55 | le = preprocessing.LabelEncoder() 56 | le.fit(train[feature]) 57 | for table in tables: 58 | table[feature] = le.transform(table[feature]) 59 | 60 | print("Getting features...") 61 | features 
= get_features(train, test) 62 | 63 | return train, test, features 64 | 65 | 66 | def train_and_test_linear(train, test, features, target='price'): # simple xgboost 67 | subsample = 0.8 68 | colsample_bytree = 0.8 69 | num_boost_round = 1200 # 115 originally 70 | early_stopping_rounds = 50 71 | test_size = 0.2 # 0.1 originally 72 | 73 | start_time = time.time() 74 | 75 | # start the training 76 | 77 | params = { 78 | "objective": "reg:linear", 79 | "booster": "gblinear", # "gbtree",# default 80 | "eval_metric": "rmse", 81 | "subsample": subsample, # collect 80% of the data only to prevent overfitting 82 | "colsample_bytree": colsample_bytree, 83 | "silent": 1, 84 | "seed": 0, 85 | } 86 | 87 | X_train, X_valid = train_test_split(train, test_size=test_size, 88 | random_state=0) # randomly split into 90% test and 10% CV -> still has the outcome at this point 89 | y_train = np.log(X_train[target]) # define y as the outcome column, apply log to have same error as the leaderboard 90 | y_valid = np.log(X_valid[target]) 91 | dtrain = xgb.DMatrix(X_train[features], y_train) # DMatrix are matrix for xgboost 92 | dvalid = xgb.DMatrix(X_valid[features], y_valid) 93 | 94 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] # list of things to evaluate and print 95 | gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, 96 | verbose_eval=True) # find the best score 97 | score = gbm.best_score # roc_auc_score(X_valid[target].values, check) 98 | print('Last error value: {:.6f}'.format(score)) 99 | 100 | print("Predict test set...") 101 | test_prediction = gbm.predict(xgb.DMatrix(test[features])) 102 | 103 | print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2))) 104 | 105 | return test_prediction, score 106 | 107 | 108 | def train_and_test_tree(train, test, features, target='price'): # simple xgboost 109 | eta_list = [0.1, 0.2] # list of parameters to try 110 | max_depth_list = [4, 6, 8] # list of parameters to try 111 | subsample = 0.8 112 | colsample_bytree = 0.8 113 | 114 | num_boost_round = 400 115 | early_stopping_rounds = 10 116 | test_size = 0.2 117 | 118 | start_time = time.time() 119 | 120 | # start the training 121 | array_score = np.ndarray((len(eta_list) * len(max_depth_list), 3)) # store score values 122 | i = 0 123 | for eta, max_depth in list( 124 | itertools.product(eta_list, max_depth_list)): # Loop over parameters to find the better set 125 | print('XGBoost params. 
ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, 126 | subsample, 127 | colsample_bytree)) 128 | params = { 129 | "objective": "reg:linear", 130 | "booster": "gbtree", 131 | "eval_metric": "rmse", # this is the metric for the leardboard 132 | "eta": eta, # shrinking parameters to prevent overfitting 133 | "tree_method": 'exact', 134 | "max_depth": max_depth, 135 | "subsample": subsample, # collect 80% of the data only to prevent overfitting 136 | "colsample_bytree": colsample_bytree, 137 | "silent": 1, 138 | "seed": 0, 139 | } 140 | 141 | X_train, X_valid = train_test_split(train, test_size=test_size, 142 | random_state=0) # randomly split into 90% test and 10% CV -> still has the outcome at this point 143 | y_train = np.log(X_train[target]) # define y as the outcome column 144 | y_valid = np.log(X_valid[target]) 145 | dtrain = xgb.DMatrix(X_train[features], y_train) # DMatrix are matrix for xgboost 146 | dvalid = xgb.DMatrix(X_valid[features], y_valid) 147 | 148 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] # list of things to evaluate and print 149 | gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, 150 | verbose_eval=True) # find the best score 151 | 152 | print("Validating...") 153 | score = gbm.best_score 154 | print('Last error value: {:.6f}'.format(score)) 155 | array_score[i][0] = eta 156 | array_score[i][1] = max_depth 157 | array_score[i][2] = score 158 | i += 1 159 | df_score = pd.DataFrame(array_score, columns=['eta', 'max_depth', 'price']) 160 | print("df_score : \n", df_score) 161 | # create_feature_map(features) 162 | importance = gbm.get_fscore() 163 | importance = sorted(importance.items(), key=itemgetter(1), reverse=True) 164 | print('Importance array: ', importance) 165 | np.save("features_importance", importance) # save feature importance for latter use 166 | print("Predict test set...") 167 | test_prediction = gbm.predict(xgb.DMatrix(test[features]), 168 | ntree_limit=gbm.best_ntree_limit) # only predict with the last set of parameters 169 | 170 | print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2))) 171 | 172 | return test_prediction, score 173 | 174 | 175 | def train_and_test_Kfold(train, test, features, target='price'): # add Kfold 176 | eta_list = [0.01] # list of parameters to try 177 | max_depth_list = [6] 178 | subsample = 1 # No subsampling, as we already use Kfold latter and we don't have that much data 179 | colsample_bytree = 1 180 | 181 | num_boost_round = 5500 # for small eta, increase this one 182 | early_stopping_rounds = 500 183 | n_folds = 12 184 | start_time = time.time() 185 | 186 | # start the training 187 | array_score = np.ndarray((len(eta_list) * len(max_depth_list), 4)) # store score values 188 | i = 0 189 | for eta, max_depth in list( 190 | itertools.product(eta_list, max_depth_list)): # Loop over parameters to find the better set 191 | print('XGBoost params. 
ETA: {}, MAX_DEPTH: {}'.format(eta, max_depth)) 192 | params = { 193 | "objective": "reg:linear", 194 | "booster": "gbtree", 195 | "eval_metric": "rmse", 196 | "eta": eta, # shrinking parameters to prevent overfitting 197 | "tree_method": 'exact', 198 | "max_depth": max_depth, 199 | "subsample": subsample, # collect 80% of the data only to prevent overfitting 200 | "colsample_bytree": colsample_bytree, 201 | "silent": 1, 202 | "seed": 0, 203 | } 204 | kf = KFold(len(train), n_folds=n_folds) 205 | test_prediction = np.ndarray((n_folds, len(test))) 206 | fold = 0 207 | fold_score = [] 208 | for train_index, cv_index in kf: 209 | X_train, X_valid = train[features].as_matrix()[train_index], train[features].as_matrix()[cv_index] 210 | y_train, y_valid = np.log(train[target].as_matrix()[train_index]), np.log( 211 | train[target].as_matrix()[cv_index]) 212 | 213 | dtrain = xgb.DMatrix(X_train, y_train) # DMatrix are matrix for xgboost 214 | dvalid = xgb.DMatrix(X_valid, y_valid) 215 | 216 | watchlist = [(dtrain, 'train'), (dvalid, 'eval')] # list of things to evaluate and print 217 | gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, 218 | early_stopping_rounds=early_stopping_rounds, verbose_eval=True) # find the best score 219 | 220 | print("Validating...") 221 | check = gbm.predict(xgb.DMatrix(X_valid)) # get the best score 222 | score = gbm.best_score 223 | print('Check last score value: {:.6f}'.format(score)) 224 | fold_score.append(score) 225 | importance = gbm.get_fscore() 226 | importance = sorted(importance.items(), key=itemgetter(1), reverse=True) 227 | print('Importance array for fold {} :\n {}'.format(fold, importance)) 228 | # np.save("features_importance",importance) 229 | print("Predict test set...") 230 | prediction = gbm.predict(xgb.DMatrix(test[features].as_matrix())) 231 | # np.save("prediction_eta%s_depth%s_fold%s" %(eta,max_depth,fold),prediction) # You can save all the folds prediction to check for errors in code 232 | test_prediction[fold] = prediction 233 | fold = fold + 1 234 | mean_score = np.mean(fold_score) 235 | print("Mean Score : {}, eta : {}, depth : {}\n".format(mean_score, eta, max_depth)) 236 | array_score[i][0] = eta 237 | array_score[i][1] = max_depth 238 | array_score[i][2] = mean_score 239 | array_score[i][3] = np.std(fold_score) 240 | i += 1 241 | final_prediction = test_prediction.mean(axis=0) 242 | df_score = pd.DataFrame(array_score, columns=['eta', 'max_depth', 'mean_score', 'std_score']) 243 | print("df_score : \n", df_score) # get the complete array of scores to choose the right parameters 244 | 245 | print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2))) 246 | 247 | return final_prediction, mean_score 248 | 249 | 250 | ############################################################################ 251 | # Main code 252 | ########################################################################### 253 | 254 | num_features = None # Choose how many features you want to use. 
None = all 255 | 256 | train = pd.read_csv('soldsold.csv') 257 | test = pd.read_csv('azmarket.csv') 258 | 259 | train = train.dropna(subset = ["price", "sqft", "bedrooms", "bathrooms"]) 260 | train = train.drop("county", 1) 261 | 262 | 263 | test = test.dropna(subset = ["price", "sqft", "bedrooms", "bathrooms"]) 264 | test = test.drop("price", 1) 265 | test = test.drop("county", 1) 266 | 267 | train, test, features = process_features(train, test) 268 | 269 | # test_prediction,score = train_and_test_linear(train,test,features) 270 | # test_prediction,score = train_and_test_tree(train,test,features) # run at least once this one to get the features importance 271 | # features=np.load("features_importance.npy") 272 | test_prediction, score = train_and_test_Kfold(train, test, features[:num_features]) 273 | 274 | write_to_csv(np.exp(test_prediction), score) 275 | -------------------------------------------------------------------------------- /spark/sparkbootstrap.sh: -------------------------------------------------------------------------------- 1 | sudo yum install -y python35-pip python35-devel libxml2-devel libxslt-devel 2 | sudo yum install gcc 3 | sudo python35 -m pip install pip pyyaml ipython jupyter pandas boto beautifulsoup4 -U 4 | 5 | export PYSPARK_DRIVER_PYTHON='which jupyter' 6 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8888" -------------------------------------------------------------------------------- /spark/sparktopostgres.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark.sql import SQLContext\n", 12 | "from pyspark.sql.types import *\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "sqlContext = SQLContext(sc)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "labor = sqlContext.read.load('s3a://zillowstreamjk/historical/laborzip.csv', \n", 36 | " format='com.databricks.spark.csv', \n", 37 | " header='true', \n", 38 | " inferSchema='true')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "laborzip = labor.toPandas()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "laborzip['zip_population'] = laborzip['irs_estimated_population_2014']" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "laborzip.drop(['type','area_codes','country','latitude','longitude','employed','unemployed','irs_estimated_population_2014'], axis=1, inplace=True)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | 
"outputs": [], 92 | "source": [ 93 | "laborzip['labor_force'] = laborzip['labor_force'].astype(float)\n", 94 | "laborzip['u_rate'] = laborzip['u_rate'].astype(float)\n", 95 | "laborzip['zip_population'] = laborzip['zip_population'].astype(float)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "import sqlalchemy\n", 107 | "import psycopg2" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "def connect(user, password, db, host='zillowdb.cyazdghc3lqr.us-east-1.rds.amazonaws.com', port=5432):\n", 119 | " '''Returns a connection and a metadata object'''\n", 120 | " # We connect with the help of the PostgreSQL URL\n", 121 | " # postgresql://federer:grandestslam@localhost:5432/tennis\n", 122 | " url = 'postgresql://{}:{}@{}:{}/{}'\n", 123 | " url = url.format(user, password, host, port, db)\n", 124 | "\n", 125 | " # The return value of create_engine() is our connection object\n", 126 | " con = sqlalchemy.create_engine(url, client_encoding='utf8')\n", 127 | "\n", 128 | " # We then bind the connection to MetaData()\n", 129 | " meta = sqlalchemy.MetaData(bind=con, reflect=True)\n", 130 | "\n", 131 | " return con, meta" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "con, meta = connect('user', 'password', 'zillow')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "from sqlalchemy import create_engine" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "engine = create_engine('postgresql://user:password@zillowdb.cyazdghc3lqr.us-east-1.rds.amazonaws.com:5432/zillow')" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "laborzip.to_sql('county',engine,index=False,if_exists='append')" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "-" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "df = sqlContext.read.load('s3a://zillowstreamjk/parsed/sold/*/*', \n", 194 | " format='com.databricks.spark.csv', \n", 195 | " header='true', \n", 196 | " inferSchema='true')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "df = (df.withColumn('sqft', df.sqft.cast('int')).withColumn('zip', df.zip.cast('string')).withColumn('bedrooms', df.bedrooms.cast('int')).withColumn('bathrooms', df.bathrooms.cast('int')))" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "sold = df.toPandas()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": true 
226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "sold.drop_duplicates(subset=['zpid'],inplace=True)\n", 230 | "sold['sale_type'] = sold['sale_type'].map(lambda x: x.strip('SOLD:').replace('$','').replace('M',''))\n", 231 | "sold['sale_type'] = sold['sale_type'].str.replace(' ', '').str.replace(',', '')\n", 232 | "sold['price'] = sold['sale_type'].astype(float)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": true 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "sold.drop(['city','state','sale_type','url'], axis=1, inplace=True)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "def times500(x):\n", 255 | " if x > 500:\n", 256 | " return x\n", 257 | " elif x:\n", 258 | " return 1000000 * x\n", 259 | " else:\n", 260 | " return" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "sold['price'] = sold['price'].apply(times500)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "sold = sold[sold.zpid != 'NA']" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": true 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "sold.drop_duplicates(subset=['zpid'], keep='last',inplace=True)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "sold.to_sql('property',engine,index=False,if_exists='append')" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": true 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "nextdf = sqlContext.read.load('s3a://zillowstreamjk/parsed/market/*/*', \n", 316 | " format='com.databricks.spark.csv', \n", 317 | " header='true', \n", 318 | " inferSchema='true')" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "nextdf = (nextdf.withColumn('sqft', nextdf.sqft.cast('int')).withColumn('zip', nextdf.zip.cast('string')).withColumn('bedrooms', nextdf.bedrooms.cast('int')).withColumn('bathrooms', nextdf.bathrooms.cast('int')))" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "market = nextdf.toPandas()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "market.drop(['city','state','sale_type','url','days_on_zillow'], axis=1, inplace=True)\n", 352 | "market = market[market.price != 'NA']\n", 353 | "market['price'] = market['price'].astype(float)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "market = market[market.zpid != 'NA']" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | 
"metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "market.drop_duplicates(subset=['zpid'], keep='last',inplace=True)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "market.to_sql('market',engine,index=False,if_exists='append')" 387 | ] 388 | } 389 | ], 390 | "metadata": { 391 | "anaconda-cloud": {}, 392 | "kernelspec": { 393 | "display_name": "Python [default]", 394 | "language": "python", 395 | "name": "python3" 396 | }, 397 | "language_info": { 398 | "codemirror_mode": { 399 | "name": "ipython", 400 | "version": 3 401 | }, 402 | "file_extension": ".py", 403 | "mimetype": "text/x-python", 404 | "name": "python", 405 | "nbconvert_exporter": "python", 406 | "pygments_lexer": "ipython3", 407 | "version": "3.5.2" 408 | } 409 | }, 410 | "nbformat": 4, 411 | "nbformat_minor": 2 412 | } 413 | --------------------------------------------------------------------------------