├── README.md ├── data ├── kc_house_data.gl │ ├── dir_archive.ini │ ├── m_fb913aaf43c120c4.0000 │ ├── m_fb913aaf43c120c4.frame_idx │ ├── m_fb913aaf43c120c4.sidx │ └── objects.bin └── kc_house_data_small.gl │ ├── dir_archive.ini │ ├── m_ef92e6258b8f7992.0000 │ ├── m_ef92e6258b8f7992.frame_idx │ ├── m_ef92e6258b8f7992.sidx │ └── objects.bin ├── lasso-regression └── lasso-regression.ipynb ├── multiple-linear-regression ├── multiple-regression-gradient-descent.ipynb └── multiple-regression.ipynb ├── nearest-neighbor-regression └── nearest-neighbor-regression.ipynb ├── polynomial-regression └── polynomial-regression.ipynb ├── ridge-regression ├── ridge-regression-gradient-descent.ipynb └── ridge-regression.ipynb └── simple-linear-regression └── simple-linear-regression.ipynb /README.md: -------------------------------------------------------------------------------- 1 | ## Machine Learning Regression: House Sales Price Prediction Models 2 | 3 | ### Description 4 | * Implemented linear regression and k-nearest-neighbors regression with gradient descent optimization to build models that predict house prices on the King County (Seattle, WA) house sales dataset. 5 | * Performed feature engineering and selection using lasso and ridge penalties to eliminate features that had little or no impact on the residual sum of squares (RSS) error. 6 | 7 | ### Code 8 | 1. [Simple Linear Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/simple-linear-regression/simple-linear-regression.ipynb) 9 | 2. [Multiple Linear Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/multiple-linear-regression/multiple-regression.ipynb) 10 | 3. [Multiple Linear Regression with Gradient Descent Optimization](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/multiple-linear-regression/multiple-regression-gradient-descent.ipynb) 11 | 4. [Polynomial Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/polynomial-regression/polynomial-regression.ipynb) 12 | 5. [Ridge Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/ridge-regression/ridge-regression.ipynb) 13 | 6. [Ridge Regression with Gradient Descent Optimization](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/ridge-regression/ridge-regression-gradient-descent.ipynb) 14 | 7. [Lasso Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/lasso-regression/lasso-regression.ipynb) 15 | 8. [Nearest Neighbor Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/nearest-neighbor-regression/nearest-neighbor-regression.ipynb) 16 | 17 | ### [Data](https://github.com/agrawal-priyank/machine-learning-regression/tree/master/data) 18 | 19 | ### Programming Language 20 | Python 21 | 22 | ### Packages 23 | Anaconda, GraphLab Create ([installation guide](https://turi.com/learn/coursera/)) 24 | 25 | ### Tools/IDE 26 | Jupyter Notebook (IPython) 27 | 28 | ### How to use it 29 | 1. Fork this repository to get your own copy 30 | 2. Clone your fork to your local system 31 | 3. Install the necessary packages 32 | 33 | ### Note 34 | This repository does not contain optimal machine learning models! It only compares various models that can be built using different machine learning algorithms (either implemented from scratch or used directly from the GraphLab Create package) for different regression tasks. 
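### Quick example

A minimal sketch (not taken from the notebooks) of loading the bundled data with GraphLab Create; run it from the repository root, and note that the feature list below is only illustrative:

```python
import graphlab

# Load the King County house sales SFrame shipped in the data folder
sales = graphlab.SFrame('data/kc_house_data.gl/')

# Fit a basic least-squares model on a few columns (Python 2 syntax, as in the notebooks)
model = graphlab.linear_regression.create(
    sales, target='price',
    features=['sqft_living', 'bedrooms', 'bathrooms'],
    validation_set=None)

print model['coefficients']
```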
35 | -------------------------------------------------------------------------------- /data/kc_house_data.gl/dir_archive.ini: -------------------------------------------------------------------------------- 1 | [archive] 2 | version=1 3 | num_prefixes=3 4 | [metadata] 5 | contents=sframe 6 | [prefixes] 7 | 0000=dir_archive.ini 8 | 0001=objects.bin 9 | 0002=m_fb913aaf43c120c4 10 | -------------------------------------------------------------------------------- /data/kc_house_data.gl/m_fb913aaf43c120c4.0000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data.gl/m_fb913aaf43c120c4.0000 -------------------------------------------------------------------------------- /data/kc_house_data.gl/m_fb913aaf43c120c4.frame_idx: -------------------------------------------------------------------------------- 1 | [sframe] 2 | version=0 3 | num_segments=0 4 | num_columns=21 5 | nrows=21613 6 | [column_names] 7 | 0000=id 8 | 0001=date 9 | 0002=price 10 | 0003=bedrooms 11 | 0004=bathrooms 12 | 0005=sqft_living 13 | 0006=sqft_lot 14 | 0007=floors 15 | 0008=waterfront 16 | 0009=view 17 | 0010=condition 18 | 0011=grade 19 | 0012=sqft_above 20 | 0013=sqft_basement 21 | 0014=yr_built 22 | 0015=yr_renovated 23 | 0016=zipcode 24 | 0017=lat 25 | 0018=long 26 | 0019=sqft_living15 27 | 0020=sqft_lot15 28 | [column_files] 29 | 0000=m_fb913aaf43c120c4.sidx:0 30 | 0001=m_fb913aaf43c120c4.sidx:1 31 | 0002=m_fb913aaf43c120c4.sidx:2 32 | 0003=m_fb913aaf43c120c4.sidx:3 33 | 0004=m_fb913aaf43c120c4.sidx:4 34 | 0005=m_fb913aaf43c120c4.sidx:5 35 | 0006=m_fb913aaf43c120c4.sidx:6 36 | 0007=m_fb913aaf43c120c4.sidx:7 37 | 0008=m_fb913aaf43c120c4.sidx:8 38 | 0009=m_fb913aaf43c120c4.sidx:9 39 | 0010=m_fb913aaf43c120c4.sidx:10 40 | 0011=m_fb913aaf43c120c4.sidx:11 41 | 0012=m_fb913aaf43c120c4.sidx:12 42 | 0013=m_fb913aaf43c120c4.sidx:13 43 | 0014=m_fb913aaf43c120c4.sidx:14 44 | 0015=m_fb913aaf43c120c4.sidx:15 45 | 0016=m_fb913aaf43c120c4.sidx:16 46 | 0017=m_fb913aaf43c120c4.sidx:17 47 | 0018=m_fb913aaf43c120c4.sidx:18 48 | 0019=m_fb913aaf43c120c4.sidx:19 49 | 0020=m_fb913aaf43c120c4.sidx:20 50 | -------------------------------------------------------------------------------- /data/kc_house_data.gl/m_fb913aaf43c120c4.sidx: -------------------------------------------------------------------------------- 1 | { 2 | "sarray" : { 3 | "version" : 2, 4 | "num_segments" : 1 5 | }, 6 | "segment_files" : { 7 | "0000" : "m_fb913aaf43c120c4.0000" 8 | }, 9 | "columns" : [ 10 | { 11 | "content_type" : "", 12 | "metadata" : { 13 | "__type__" : "2" 14 | }, 15 | "segment_sizes" : { 16 | "0000" : "21613" 17 | } 18 | }, 19 | { 20 | "content_type" : "", 21 | "metadata" : { 22 | "__type__" : "6" 23 | }, 24 | "segment_sizes" : { 25 | "0000" : "21613" 26 | } 27 | }, 28 | { 29 | "content_type" : "", 30 | "metadata" : { 31 | "__type__" : "1" 32 | }, 33 | "segment_sizes" : { 34 | "0000" : "21613" 35 | } 36 | }, 37 | { 38 | "content_type" : "", 39 | "metadata" : { 40 | "__type__" : "1" 41 | }, 42 | "segment_sizes" : { 43 | "0000" : "21613" 44 | } 45 | }, 46 | { 47 | "content_type" : "", 48 | "metadata" : { 49 | "__type__" : "1" 50 | }, 51 | "segment_sizes" : { 52 | "0000" : "21613" 53 | } 54 | }, 55 | { 56 | "content_type" : "", 57 | "metadata" : { 58 | "__type__" : "1" 59 | }, 60 | "segment_sizes" : { 61 | "0000" : "21613" 62 | } 63 | }, 64 | { 65 | "content_type" : "", 66 | "metadata" : { 67 | 
"__type__" : "0" 68 | }, 69 | "segment_sizes" : { 70 | "0000" : "21613" 71 | } 72 | }, 73 | { 74 | "content_type" : "", 75 | "metadata" : { 76 | "__type__" : "2" 77 | }, 78 | "segment_sizes" : { 79 | "0000" : "21613" 80 | } 81 | }, 82 | { 83 | "content_type" : "", 84 | "metadata" : { 85 | "__type__" : "0" 86 | }, 87 | "segment_sizes" : { 88 | "0000" : "21613" 89 | } 90 | }, 91 | { 92 | "content_type" : "", 93 | "metadata" : { 94 | "__type__" : "0" 95 | }, 96 | "segment_sizes" : { 97 | "0000" : "21613" 98 | } 99 | }, 100 | { 101 | "content_type" : "", 102 | "metadata" : { 103 | "__type__" : "0" 104 | }, 105 | "segment_sizes" : { 106 | "0000" : "21613" 107 | } 108 | }, 109 | { 110 | "content_type" : "", 111 | "metadata" : { 112 | "__type__" : "0" 113 | }, 114 | "segment_sizes" : { 115 | "0000" : "21613" 116 | } 117 | }, 118 | { 119 | "content_type" : "", 120 | "metadata" : { 121 | "__type__" : "0" 122 | }, 123 | "segment_sizes" : { 124 | "0000" : "21613" 125 | } 126 | }, 127 | { 128 | "content_type" : "", 129 | "metadata" : { 130 | "__type__" : "0" 131 | }, 132 | "segment_sizes" : { 133 | "0000" : "21613" 134 | } 135 | }, 136 | { 137 | "content_type" : "", 138 | "metadata" : { 139 | "__type__" : "0" 140 | }, 141 | "segment_sizes" : { 142 | "0000" : "21613" 143 | } 144 | }, 145 | { 146 | "content_type" : "", 147 | "metadata" : { 148 | "__type__" : "0" 149 | }, 150 | "segment_sizes" : { 151 | "0000" : "21613" 152 | } 153 | }, 154 | { 155 | "content_type" : "", 156 | "metadata" : { 157 | "__type__" : "2" 158 | }, 159 | "segment_sizes" : { 160 | "0000" : "21613" 161 | } 162 | }, 163 | { 164 | "content_type" : "", 165 | "metadata" : { 166 | "__type__" : "1" 167 | }, 168 | "segment_sizes" : { 169 | "0000" : "21613" 170 | } 171 | }, 172 | { 173 | "content_type" : "", 174 | "metadata" : { 175 | "__type__" : "1" 176 | }, 177 | "segment_sizes" : { 178 | "0000" : "21613" 179 | } 180 | }, 181 | { 182 | "content_type" : "", 183 | "metadata" : { 184 | "__type__" : "1" 185 | }, 186 | "segment_sizes" : { 187 | "0000" : "21613" 188 | } 189 | }, 190 | { 191 | "content_type" : "", 192 | "metadata" : { 193 | "__type__" : "1" 194 | }, 195 | "segment_sizes" : { 196 | "0000" : "21613" 197 | } 198 | } 199 | ] 200 | } -------------------------------------------------------------------------------- /data/kc_house_data.gl/objects.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data.gl/objects.bin -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/dir_archive.ini: -------------------------------------------------------------------------------- 1 | [archive] 2 | version=1 3 | num_prefixes=3 4 | [metadata] 5 | contents=sframe 6 | [prefixes] 7 | 0000=dir_archive.ini 8 | 0001=objects.bin 9 | 0002=m_ef92e6258b8f7992 10 | -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/m_ef92e6258b8f7992.0000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data_small.gl/m_ef92e6258b8f7992.0000 -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/m_ef92e6258b8f7992.frame_idx: 
-------------------------------------------------------------------------------- 1 | [sframe] 2 | version=0 3 | num_segments=0 4 | num_columns=21 5 | nrows=8703 6 | [column_names] 7 | 0000=id 8 | 0001=date 9 | 0002=price 10 | 0003=bedrooms 11 | 0004=bathrooms 12 | 0005=sqft_living 13 | 0006=sqft_lot 14 | 0007=floors 15 | 0008=waterfront 16 | 0009=view 17 | 0010=condition 18 | 0011=grade 19 | 0012=sqft_above 20 | 0013=sqft_basement 21 | 0014=yr_built 22 | 0015=yr_renovated 23 | 0016=zipcode 24 | 0017=lat 25 | 0018=long 26 | 0019=sqft_living15 27 | 0020=sqft_lot15 28 | [column_files] 29 | 0000=m_ef92e6258b8f7992.sidx:0 30 | 0001=m_ef92e6258b8f7992.sidx:1 31 | 0002=m_ef92e6258b8f7992.sidx:2 32 | 0003=m_ef92e6258b8f7992.sidx:3 33 | 0004=m_ef92e6258b8f7992.sidx:4 34 | 0005=m_ef92e6258b8f7992.sidx:5 35 | 0006=m_ef92e6258b8f7992.sidx:6 36 | 0007=m_ef92e6258b8f7992.sidx:7 37 | 0008=m_ef92e6258b8f7992.sidx:8 38 | 0009=m_ef92e6258b8f7992.sidx:9 39 | 0010=m_ef92e6258b8f7992.sidx:10 40 | 0011=m_ef92e6258b8f7992.sidx:11 41 | 0012=m_ef92e6258b8f7992.sidx:12 42 | 0013=m_ef92e6258b8f7992.sidx:13 43 | 0014=m_ef92e6258b8f7992.sidx:14 44 | 0015=m_ef92e6258b8f7992.sidx:15 45 | 0016=m_ef92e6258b8f7992.sidx:16 46 | 0017=m_ef92e6258b8f7992.sidx:17 47 | 0018=m_ef92e6258b8f7992.sidx:18 48 | 0019=m_ef92e6258b8f7992.sidx:19 49 | 0020=m_ef92e6258b8f7992.sidx:20 50 | -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/m_ef92e6258b8f7992.sidx: -------------------------------------------------------------------------------- 1 | { 2 | "sarray" : { 3 | "version" : 2, 4 | "num_segments" : 1 5 | }, 6 | "segment_files" : { 7 | "0000" : "m_ef92e6258b8f7992.0000" 8 | }, 9 | "columns" : [ 10 | { 11 | "content_type" : "", 12 | "metadata" : { 13 | "__type__" : "2" 14 | }, 15 | "segment_sizes" : { 16 | "0000" : "8703" 17 | } 18 | }, 19 | { 20 | "content_type" : "", 21 | "metadata" : { 22 | "__type__" : "6" 23 | }, 24 | "segment_sizes" : { 25 | "0000" : "8703" 26 | } 27 | }, 28 | { 29 | "content_type" : "", 30 | "metadata" : { 31 | "__type__" : "0" 32 | }, 33 | "segment_sizes" : { 34 | "0000" : "8703" 35 | } 36 | }, 37 | { 38 | "content_type" : "", 39 | "metadata" : { 40 | "__type__" : "1" 41 | }, 42 | "segment_sizes" : { 43 | "0000" : "8703" 44 | } 45 | }, 46 | { 47 | "content_type" : "", 48 | "metadata" : { 49 | "__type__" : "1" 50 | }, 51 | "segment_sizes" : { 52 | "0000" : "8703" 53 | } 54 | }, 55 | { 56 | "content_type" : "", 57 | "metadata" : { 58 | "__type__" : "1" 59 | }, 60 | "segment_sizes" : { 61 | "0000" : "8703" 62 | } 63 | }, 64 | { 65 | "content_type" : "", 66 | "metadata" : { 67 | "__type__" : "0" 68 | }, 69 | "segment_sizes" : { 70 | "0000" : "8703" 71 | } 72 | }, 73 | { 74 | "content_type" : "", 75 | "metadata" : { 76 | "__type__" : "1" 77 | }, 78 | "segment_sizes" : { 79 | "0000" : "8703" 80 | } 81 | }, 82 | { 83 | "content_type" : "", 84 | "metadata" : { 85 | "__type__" : "0" 86 | }, 87 | "segment_sizes" : { 88 | "0000" : "8703" 89 | } 90 | }, 91 | { 92 | "content_type" : "", 93 | "metadata" : { 94 | "__type__" : "0" 95 | }, 96 | "segment_sizes" : { 97 | "0000" : "8703" 98 | } 99 | }, 100 | { 101 | "content_type" : "", 102 | "metadata" : { 103 | "__type__" : "0" 104 | }, 105 | "segment_sizes" : { 106 | "0000" : "8703" 107 | } 108 | }, 109 | { 110 | "content_type" : "", 111 | "metadata" : { 112 | "__type__" : "0" 113 | }, 114 | "segment_sizes" : { 115 | "0000" : "8703" 116 | } 117 | }, 118 | { 119 | "content_type" : "", 120 | "metadata" : { 121 | 
"__type__" : "0" 122 | }, 123 | "segment_sizes" : { 124 | "0000" : "8703" 125 | } 126 | }, 127 | { 128 | "content_type" : "", 129 | "metadata" : { 130 | "__type__" : "0" 131 | }, 132 | "segment_sizes" : { 133 | "0000" : "8703" 134 | } 135 | }, 136 | { 137 | "content_type" : "", 138 | "metadata" : { 139 | "__type__" : "0" 140 | }, 141 | "segment_sizes" : { 142 | "0000" : "8703" 143 | } 144 | }, 145 | { 146 | "content_type" : "", 147 | "metadata" : { 148 | "__type__" : "0" 149 | }, 150 | "segment_sizes" : { 151 | "0000" : "8703" 152 | } 153 | }, 154 | { 155 | "content_type" : "", 156 | "metadata" : { 157 | "__type__" : "2" 158 | }, 159 | "segment_sizes" : { 160 | "0000" : "8703" 161 | } 162 | }, 163 | { 164 | "content_type" : "", 165 | "metadata" : { 166 | "__type__" : "1" 167 | }, 168 | "segment_sizes" : { 169 | "0000" : "8703" 170 | } 171 | }, 172 | { 173 | "content_type" : "", 174 | "metadata" : { 175 | "__type__" : "1" 176 | }, 177 | "segment_sizes" : { 178 | "0000" : "8703" 179 | } 180 | }, 181 | { 182 | "content_type" : "", 183 | "metadata" : { 184 | "__type__" : "1" 185 | }, 186 | "segment_sizes" : { 187 | "0000" : "8703" 188 | } 189 | }, 190 | { 191 | "content_type" : "", 192 | "metadata" : { 193 | "__type__" : "1" 194 | }, 195 | "segment_sizes" : { 196 | "0000" : "8703" 197 | } 198 | } 199 | ] 200 | } -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/objects.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data_small.gl/objects.bin -------------------------------------------------------------------------------- /lasso-regression/lasso-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Lasso Regression on House Sales Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 98, 40 | "metadata": { 41 | "collapsed": false, 42 | "scrolled": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "sales = graphlab.SFrame('kc_house_data.gl/')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Explore house sales data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 99, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront
71293005202014-10-13 00:00:00+00:00221900.03.01.01180.0565010
\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | "
viewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelat
03711800195509817847.51123398
\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
longsqft_living15sqft_lot15
-122.256775361340.05650.0
\n", 126 | "[1 rows x 21 columns]
\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | "Columns:\n", 131 | "\tid\tstr\n", 132 | "\tdate\tdatetime\n", 133 | "\tprice\tfloat\n", 134 | "\tbedrooms\tfloat\n", 135 | "\tbathrooms\tfloat\n", 136 | "\tsqft_living\tfloat\n", 137 | "\tsqft_lot\tint\n", 138 | "\tfloors\tstr\n", 139 | "\twaterfront\tint\n", 140 | "\tview\tint\n", 141 | "\tcondition\tint\n", 142 | "\tgrade\tint\n", 143 | "\tsqft_above\tint\n", 144 | "\tsqft_basement\tint\n", 145 | "\tyr_built\tint\n", 146 | "\tyr_renovated\tint\n", 147 | "\tzipcode\tstr\n", 148 | "\tlat\tfloat\n", 149 | "\tlong\tfloat\n", 150 | "\tsqft_living15\tfloat\n", 151 | "\tsqft_lot15\tfloat\n", 152 | "\n", 153 | "Rows: 1\n", 154 | "\n", 155 | "Data:\n", 156 | "+------------+---------------------------+----------+----------+-----------+\n", 157 | "| id | date | price | bedrooms | bathrooms |\n", 158 | "+------------+---------------------------+----------+----------+-----------+\n", 159 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900.0 | 3.0 | 1.0 |\n", 160 | "+------------+---------------------------+----------+----------+-----------+\n", 161 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 162 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 163 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 164 | "| 1180.0 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 |\n", 165 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 166 | "+---------------+----------+--------------+---------+-------------+\n", 167 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 168 | "+---------------+----------+--------------+---------+-------------+\n", 169 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 170 | "+---------------+----------+--------------+---------+-------------+\n", 171 | "+---------------+---------------+-----+\n", 172 | "| long | sqft_living15 | ... |\n", 173 | "+---------------+---------------+-----+\n", 174 | "| -122.25677536 | 1340.0 | ... 
|\n", 175 | "+---------------+---------------+-----+\n", 176 | "[1 rows x 21 columns]" 177 | ] 178 | }, 179 | "execution_count": 99, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "sales[0:1]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### Import Numpy" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 100, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "import numpy as np" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Create new features" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 101, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "from math import log, sqrt\n", 222 | "sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)\n", 223 | "sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)\n", 224 | "sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']\n", 225 | "\n", 226 | "# In the dataset, 'floors' was defined with type string, \n", 227 | "# so we'll convert them to float, before creating a new feature.\n", 228 | "sales['floors'] = sales['floors'].astype(float)\n", 229 | "sales['floors_square'] = sales['floors']*sales['floors']" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "* Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly affect houses with many bedrooms.\n", 237 | "* On the other hand, taking square root of sqft_living will decrease the separation between big house and small house. The owner may not be exactly twice as happy for getting a house that is twice as big." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Selected features" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 102, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "all_features = ['bedrooms', 'bedrooms_square',\n", 256 | " 'bathrooms',\n", 257 | " 'sqft_living', 'sqft_living_sqrt',\n", 258 | " 'sqft_lot', 'sqft_lot_sqrt',\n", 259 | " 'floors', 'floors_square',\n", 260 | " 'waterfront', 'view', 'condition', 'grade',\n", 261 | " 'sqft_above',\n", 262 | " 'sqft_basement',\n", 263 | " 'yr_built', 'yr_renovated']" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Model with a choosen l1 penalty" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Linear regression model with a single l1 penalty (lasso)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 103, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "model_all = graphlab.linear_regression.create(sales, target='price', features=all_features,\n", 289 | " validation_set=None, l1_penalty=1e10, l2_penalty=0., verbose=None)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Explore coefficients in the model" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 105, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "+------------------+-------+---------------+--------+\n", 311 | "| name | index | value | stderr |\n", 312 | "+------------------+-------+---------------+--------+\n", 313 | "| (intercept) | None | 274873.05595 | None |\n", 314 | "| bedrooms | None | 0.0 | None |\n", 315 | "| bedrooms_square | None | 0.0 | None |\n", 316 | "| bathrooms | None | 8468.53108691 | None |\n", 317 | "| sqft_living | None | 24.4207209824 | None |\n", 318 | "| sqft_living_sqrt | None | 350.060553386 | None |\n", 319 | "| sqft_lot | None | 0.0 | None |\n", 320 | "| sqft_lot_sqrt | None | 0.0 | None |\n", 321 | "| floors | None | 0.0 | None |\n", 322 | "| floors_square | None | 0.0 | None |\n", 323 | "| waterfront | None | 0.0 | None |\n", 324 | "| view | None | 0.0 | None |\n", 325 | "| condition | None | 0.0 | None |\n", 326 | "| grade | None | 842.068034898 | None |\n", 327 | "| sqft_above | None | 20.0247224171 | None |\n", 328 | "| sqft_basement | None | 0.0 | None |\n", 329 | "| yr_built | None | 0.0 | None |\n", 330 | "| yr_renovated | None | 0.0 | None |\n", 331 | "+------------------+-------+---------------+--------+\n", 332 | "[18 rows x 4 columns]\n", 333 | "\n", 334 | "None\n", 335 | "Number of non zero coefficients: 6\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "print model_all['coefficients'].print_rows(num_rows=18)\n", 341 | "print \"Number of non zero coefficients: \" ,model_all['coefficients']['value'].nnz()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Note that a majority of the weights have been set to zero. So by setting an L1 penalty that's large enough, we are performing a subset selection." 
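, "\n", "\n", "(A rough picture of why: the fit minimizes RSS(w) + l1_penalty\\*(|w[1]| + ... + |w[k]|), so once the penalty is large enough, the cheapest way to lower the objective is to push the weights of weakly useful features exactly to zero, leaving only a small subset of features in the model.)"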
349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Splitting the data" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 106, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "(training_and_validation, testing) = sales.random_split(.9,seed=1)\n", 367 | "(training, validation) = training_and_validation.random_split(0.5, seed=1)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## Model with best selected L1 penalty from a range of l1 penalties" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 141, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "max_nonzeros = 7 # maximum non zero weights allowed" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "### Exploring the larger range of values for l1 penalty to find a narrow range with the desired sparsity" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 112, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "l1_penalty_values = np.logspace(8, 10, num=20)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 113, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "non_zeros = []\n", 415 | "l1_penalties = []\n", 416 | "for l1_penalty in l1_penalty_values:\n", 417 | " model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None, \n", 418 | " l1_penalty=l1_penalty, l2_penalty=0., verbose=False)\n", 419 | " non_zeros.append(model['coefficients']['value'].nnz())\n", 420 | " l1_penalties.append(l1_penalty)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "L1 penalties applied to the models and the corresponding coefficients which are non zero in the model" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 114, 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "[100000000.0,\n", 441 | " 127427498.57031322,\n", 442 | " 162377673.91887242,\n", 443 | " 206913808.11147901,\n", 444 | " 263665089.87303555,\n", 445 | " 335981828.62837881,\n", 446 | " 428133239.8719396,\n", 447 | " 545559478.11685145,\n", 448 | " 695192796.17755914,\n", 449 | " 885866790.41008317,\n", 450 | " 1128837891.6846883,\n", 451 | " 1438449888.2876658,\n", 452 | " 1832980710.8324375,\n", 453 | " 2335721469.0901213,\n", 454 | " 2976351441.6313128,\n", 455 | " 3792690190.7322536,\n", 456 | " 4832930238.5717525,\n", 457 | " 6158482110.6602545,\n", 458 | " 7847599703.5146227,\n", 459 | " 10000000000.0]" 460 | ] 461 | }, 462 | "execution_count": 114, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "l1_penalties" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 115, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "[18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 15, 15, 13, 12, 10, 6, 5, 3, 1, 1]" 482 | ] 483 | }, 484 | "execution_count": 115, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "non_zeros" 491 | ] 492 | }, 493 | { 494 | 
"cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "Out of this large range, we want to find the two ends of our desired narrow range of `l1_penalty`. At one end, we will have `l1_penalty` values that have too few non-zeros, and at the other end, we will have an `l1_penalty` that has too many non-zeros. \n", 498 | "\n", 499 | "* The largest `l1_penalty` that has more non-zeros than `max_nonzeros`\n", 500 | "* The smallest `l1_penalty` that has fewer non-zeros than `max_nonzeros'" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 128, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Min l1 penalty: 2976351441.63\n", 515 | "Max l1 penalty: 3792690190.73\n" 516 | ] 517 | } 518 | ], 519 | "source": [ 520 | "l1_penalty_min = l1_penalties[14]\n", 521 | "l1_penalty_max = l1_penalties[15]\n", 522 | "print \"Min l1 penalty: \" ,l1_penalty_min\n", 523 | "print \"Max l1 penalty: \" ,l1_penalty_max" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "### Explore narrow range of values for l1 penalty to find a solution with the right number of non-zeros that has lowest RSS on the validation set " 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 129, 536 | "metadata": { 537 | "collapsed": true 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "l1_penalty_values = np.linspace(l1_penalty_min, l1_penalty_max, 20)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 133, 547 | "metadata": { 548 | "collapsed": true 549 | }, 550 | "outputs": [], 551 | "source": [ 552 | "all_rss = []\n", 553 | "all_models = []\n", 554 | "all_penalties = []\n", 555 | "for l1_penalty in l1_penalty_values:\n", 556 | " model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None, \n", 557 | " l1_penalty=l1_penalty, l2_penalty=0., verbose=False)\n", 558 | " predicted_price = model.predict(validation)\n", 559 | " residuals = predicted_price - validation['price']\n", 560 | " rss = (residuals*residuals).sum()\n", 561 | " all_rss.append(rss)\n", 562 | " all_models.append(model)\n", 563 | " all_penalties.append(l1_penalty)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "### Explore all models with number of non zeros equal to max non zeroes and it's corresponding RSS and l1 penalty " 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 135, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "# Loop to select those models from all models whose number of non zero coefficients are equal to max non zeros allowed that is 7\n", 582 | "selected_models = []\n", 583 | "selected_rss = []\n", 584 | "selected_penalties = []\n", 585 | "index = 0\n", 586 | "for model in all_models:\n", 587 | " if model['coefficients']['value'].nnz() == 7:\n", 588 | " selected_models.append(model)\n", 589 | " selected_rss.append(all_rss[index])\n", 590 | " selected_penalties.append(all_penalties[index])\n", 591 | " index += 1" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 138, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "# Select a model from selected models and a l1 penalty from selected penalties that has the lowest RSS\n", 603 | "lowest_rss, 
index = min((val, idx) for (idx, val) in enumerate(selected_rss))\n", 604 | "best_model = selected_models[index]\n", 605 | "best_l1_penalty = selected_penalties[index]" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "### Best l1 penalty" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 139, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "3448968612.1634364" 626 | ] 627 | }, 628 | "execution_count": 139, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "best_l1_penalty" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "### Explore coefficients in the best model" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 140, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [ 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "+------------------+-------+---------------+--------+\n", 656 | "| name | index | value | stderr |\n", 657 | "+------------------+-------+---------------+--------+\n", 658 | "| (intercept) | None | 222253.192544 | None |\n", 659 | "| bedrooms | None | 661.722717782 | None |\n", 660 | "| bedrooms_square | None | 0.0 | None |\n", 661 | "| bathrooms | None | 15873.9572593 | None |\n", 662 | "| sqft_living | None | 32.4102214513 | None |\n", 663 | "| sqft_living_sqrt | None | 690.114773313 | None |\n", 664 | "| sqft_lot | None | 0.0 | None |\n", 665 | "| sqft_lot_sqrt | None | 0.0 | None |\n", 666 | "| floors | None | 0.0 | None |\n", 667 | "| floors_square | None | 0.0 | None |\n", 668 | "| waterfront | None | 0.0 | None |\n", 669 | "| view | None | 0.0 | None |\n", 670 | "| condition | None | 0.0 | None |\n", 671 | "| grade | None | 2899.42026975 | None |\n", 672 | "| sqft_above | None | 30.0115753022 | None |\n", 673 | "| sqft_basement | None | 0.0 | None |\n", 674 | "| yr_built | None | 0.0 | None |\n", 675 | "| yr_renovated | None | 0.0 | None |\n", 676 | "+------------------+-------+---------------+--------+\n", 677 | "[18 rows x 4 columns]\n", 678 | "\n", 679 | "None\n", 680 | "Number of non zero coefficients: 7\n" 681 | ] 682 | } 683 | ], 684 | "source": [ 685 | "print best_model['coefficients'].print_rows(num_rows=18)\n", 686 | "print \"Number of non zero coefficients: \" ,best_model['coefficients']['value'].nnz()" 687 | ] 688 | } 689 | ], 690 | "metadata": { 691 | "kernelspec": { 692 | "display_name": "Python 2", 693 | "language": "python", 694 | "name": "python2" 695 | }, 696 | "language_info": { 697 | "codemirror_mode": { 698 | "name": "ipython", 699 | "version": 2 700 | }, 701 | "file_extension": ".py", 702 | "mimetype": "text/x-python", 703 | "name": "python", 704 | "nbconvert_exporter": "python", 705 | "pygments_lexer": "ipython2", 706 | "version": "2.7.13" 707 | } 708 | }, 709 | "nbformat": 4, 710 | "nbformat_minor": 0 711 | } 712 | -------------------------------------------------------------------------------- /multiple-linear-regression/multiple-regression-gradient-descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multiple Regression using gradient descent on house sales data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire 
up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\\Users\\agraw\\AppData\\Local\\Temp\\graphlab_server_1504899765.log.0\n" 49 | ] 50 | }, 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "This non-commercial license of GraphLab Create for academic use is assigned to agrawal.pr@husky.neu.edu and will expire on March 12, 2018.\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "sales = graphlab.SFrame('kc_house_data.gl/')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Explore house sales data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 6, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/html": [ 80 | "
\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront
71293005202014-10-13 00:00:00+00:00221900.03.01.01180.0565010
\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
viewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelat
03711800195509817847.51123398
\n", 128 | "\n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
longsqft_living15sqft_lot15
-122.256775361340.05650.0
\n", 140 | "[1 rows x 21 columns]
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | "Columns:\n", 145 | "\tid\tstr\n", 146 | "\tdate\tdatetime\n", 147 | "\tprice\tfloat\n", 148 | "\tbedrooms\tfloat\n", 149 | "\tbathrooms\tfloat\n", 150 | "\tsqft_living\tfloat\n", 151 | "\tsqft_lot\tint\n", 152 | "\tfloors\tstr\n", 153 | "\twaterfront\tint\n", 154 | "\tview\tint\n", 155 | "\tcondition\tint\n", 156 | "\tgrade\tint\n", 157 | "\tsqft_above\tint\n", 158 | "\tsqft_basement\tint\n", 159 | "\tyr_built\tint\n", 160 | "\tyr_renovated\tint\n", 161 | "\tzipcode\tstr\n", 162 | "\tlat\tfloat\n", 163 | "\tlong\tfloat\n", 164 | "\tsqft_living15\tfloat\n", 165 | "\tsqft_lot15\tfloat\n", 166 | "\n", 167 | "Rows: 1\n", 168 | "\n", 169 | "Data:\n", 170 | "+------------+---------------------------+----------+----------+-----------+\n", 171 | "| id | date | price | bedrooms | bathrooms |\n", 172 | "+------------+---------------------------+----------+----------+-----------+\n", 173 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900.0 | 3.0 | 1.0 |\n", 174 | "+------------+---------------------------+----------+----------+-----------+\n", 175 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 176 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 177 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 178 | "| 1180.0 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 |\n", 179 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 180 | "+---------------+----------+--------------+---------+-------------+\n", 181 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 182 | "+---------------+----------+--------------+---------+-------------+\n", 183 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 184 | "+---------------+----------+--------------+---------+-------------+\n", 185 | "+---------------+---------------+-----+\n", 186 | "| long | sqft_living15 | ... |\n", 187 | "+---------------+---------------+-----+\n", 188 | "| -122.25677536 | 1340.0 | ... 
|\n", 189 | "+---------------+---------------+-----+\n", 190 | "[1 rows x 21 columns]" 191 | ] 192 | }, 193 | "execution_count": 6, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "sales[0:1]" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Convert SFrame to Numpy array" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "import numpy as np" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 1, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# function to convert sframe to numpy array (matrix)\n", 229 | "def get_numpy_data(data_sframe, features, output):\n", 230 | " \n", 231 | " data_sframe['constant'] = 1 # new constant column in the sframe signifying intercept\n", 232 | " \n", 233 | " features = ['constant'] + features # prepend constant to features list\n", 234 | " \n", 235 | " features_sframe = data_sframe[features] # new sframe selecting columns from data_sframe mentioned in features list\n", 236 | "\n", 237 | " feature_matrix = features_sframe.to_numpy() # convert sframe to numpy matrix\n", 238 | "\n", 239 | " output_sarray = data_sframe['price'] # an sarray consisting of the output column\n", 240 | "\n", 241 | " output_array = output_sarray.to_numpy() # converts sarray to a numpy array\n", 242 | "\n", 243 | " return(feature_matrix, output_array)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Test the function " 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 20, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "[[ 1.00000000e+00 1.18000000e+03]]\n", 265 | "[ 221900.]\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price')\n", 271 | "print example_features[0:1] # the first row of the data\n", 272 | "print example_output[0:1] # and the corresponding output" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "### Predicting output given regression weights" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Suppose we had the weights [1.0, 1.0] and the features [1.0, 1180.0] and we wanted to compute the predicted output 1.0\\*1.0 + 1.0\\*1180.0 = 1181.0 this is the dot product between these two arrays. 
If they're numpy arrayws we can use np.dot() to compute this:" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 21, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "1181.0\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "my_weights = np.array([1., 1.]) # example weights\n", 306 | "my_features = example_features[0,] # first data point\n", 307 | "predicted_value = np.dot(my_features, my_weights)\n", 308 | "print predicted_value" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### Function to predict output given feature matrix and weight vector" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 22, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "def predict_output(feature_matrix, weights):\n", 327 | " predictions = np.dot(feature_matrix, weights)\n", 328 | " return(predictions)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Test the function" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 23, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "1181.0\n", 350 | "2571.0\n" 351 | ] 352 | } 353 | ], 354 | "source": [ 355 | "test_predictions = predict_output(example_features, my_weights)\n", 356 | "print test_predictions[0] # should be 1181.0\n", 357 | "print test_predictions[1] # should be 2571.0" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "### Computing the Derivative" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "RSS (error) for 1 data point is:\n", 372 | "\n", 373 | "(w[0]\\*[CONSTANT] + w[1]\\*[feature_1] + ... + w[i] \\*[feature_i] + ... + w[k]\\*[feature_k] - output)^2\n", 374 | "\n", 375 | "So the derivative with respect to weight w[i] by the chain rule is:\n", 376 | "\n", 377 | "2\\*(w[0]\\*[CONSTANT] + w[1]\\*[feature_1] + ... + w[i] \\*[feature_i] + ... + w[k]\\*[feature_k] - output)\\* [feature_i]\n", 378 | "\n", 379 | "In short:\n", 380 | "\n", 381 | "2\\*error\\*[feature_i]\n", 382 | "\n", 383 | "That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!" 
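, "\n", "\n", "(Tiny worked check with made-up numbers: if the errors are [1, 2] and the feature values are [3, 4], the derivative is 2\\*(1\\*3 + 2\\*4) = 22, which is exactly what the feature_derivative function defined below computes via np.dot.)"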
384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 27, 389 | "metadata": { 390 | "collapsed": true 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "def feature_derivative(errors, feature):\n", 395 | " \n", 396 | " # Assume that errors and feature are both numpy arrays of the same length (number of data points)\n", 397 | " dot_product = np.dot(errors, feature)\n", 398 | " \n", 399 | " # compute twice the dot product of these vectors as 'derivative' and return the value\n", 400 | " derivative = 2 * dot_product\n", 401 | "\n", 402 | " return(derivative)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Test function" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 29, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "-23345850022.0\n", 424 | "-23345850022.0\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') \n", 430 | "my_weights = np.array([0., 0.]) # this makes all the predictions 0\n", 431 | "test_predictions = predict_output(example_features, my_weights) \n", 432 | "errors = test_predictions - example_output # prediction errors in this case is just the -example_output\n", 433 | "feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the \":\" indicates \"all rows\"\n", 434 | "derivative = feature_derivative(errors, feature)\n", 435 | "print derivative\n", 436 | "print -np.sum(example_output)*2 # should be the same as derivative" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "### Gradient Descent" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Here is a function that performs a gradient descent. Given a starting point we update the current weights by moving in the negative gradient direction. The gradient is the direction of *increase* and therefore the negative gradient is the direction of *decrease* and we're trying to *minimize* a cost function. \n", 451 | "\n", 452 | "The amount by which we move in the negative gradient *direction* is called the 'step size'. We stop when we are 'sufficiently close' to the optimum. We define this by requiring that the magnitude (length) of the gradient vector to be smaller than a fixed 'tolerance'." 
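, "\n", "\n", "(As an update rule, each iteration sets w[i] := w[i] - step_size\\*partial_i for every weight i, where partial_i = 2\\*sum over data points of error\\*[feature_i], and the stopping test compares sqrt(partial_0^2 + ... + partial_k^2) against the tolerance. This is what the regression_gradient_descent function below implements.)"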
453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 30, 458 | "metadata": { 459 | "collapsed": true 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "from math import sqrt # the magnitude/length of a vector [g[0], g[1], g[2]] is sqrt(g[0]^2 + g[1]^2 + g[2]^2)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 33, 469 | "metadata": { 470 | "collapsed": false 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):\n", 475 | " converged = False \n", 476 | " weights = np.array(initial_weights) # converting to a numpy array\n", 477 | " \n", 478 | " while not converged:\n", 479 | " # compute the predictions based on feature_matrix and weights using your predict_output() function\n", 480 | " predictions = predict_output(feature_matrix, weights)\n", 481 | " \n", 482 | " # compute the errors as predictions - output\n", 483 | " errors = predictions - output\n", 484 | "\n", 485 | " gradient_sum_squares = 0 # initialize the gradient sum of squares\n", 486 | " \n", 487 | " # while we haven't reached the tolerance yet, update each feature's weight\n", 488 | " for i in range(len(weights)): # loop over each weight\n", 489 | " \n", 490 | " # compute the derivative for weight[i]:\n", 491 | " derivative_weight_i = feature_derivative(errors, feature_matrix[:, i])\n", 492 | "\n", 493 | " # add the squared value of the derivative to the gradient sum of squares (for assessing convergence)\n", 494 | " gradient_sum_squares = gradient_sum_squares + derivative_weight_i**2\n", 495 | "\n", 496 | " # subtract the step size times the derivative from the current weight\n", 497 | " weights[i] = weights[i] - (step_size * derivative_weight_i)\n", 498 | " \n", 499 | " # compute the square-root of the gradient sum of squares to get the gradient magnitude:\n", 500 | " gradient_magnitude = sqrt(gradient_sum_squares)\n", 501 | " if gradient_magnitude < tolerance:\n", 502 | " converged = True\n", 503 | " return(weights)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "Since the gradient is a sum over all the data points and involves a product of an error and a feature the gradient itself will be very large since the features are large (squarefeet) and the output is large (prices). So while you might expect \"tolerance\" to be small, small is only relative to the size of the features. \n", 511 | "\n", 512 | "For similar reasons the step size will be much smaller than you might expect but this is because the gradient has such large values." 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "# Running the Gradient Descent as Simple Regression (Simple model)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "First let's split the data into training and test data." 
527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 34, 532 | "metadata": { 533 | "collapsed": true 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "train_data,test_data = sales.random_split(.8,seed=0)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 36, 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "simple_features = ['sqft_living']\n", 549 | "my_output= 'price'\n", 550 | "(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)\n", 551 | "initial_weights = np.array([-47000., 1.])\n", 552 | "step_size = 7e-12\n", 553 | "tolerance = 2.5e7" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 38, 559 | "metadata": { 560 | "collapsed": false 561 | }, 562 | "outputs": [ 563 | { 564 | "name": "stdout", 565 | "output_type": "stream", 566 | "text": [ 567 | "[-46999.88716555 281.91211912]\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)\n", 573 | "print simple_weights" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "### Get predictions for test data using new weights (Simple model)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 39, 586 | "metadata": { 587 | "collapsed": false 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 41, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "[ 356134.44317093 784640.86422788 435069.83652353 ..., 663418.65300782\n", 606 | " 604217.10799338 240550.4743332 ]\n" 607 | ] 608 | } 609 | ], 610 | "source": [ 611 | "simple_predictions = predict_output(test_simple_feature_matrix, simple_weights)\n", 612 | "print simple_predictions" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "**What is the predicted price for the 1st house in the TEST data set for model 1 (round to nearest dollar)?**" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 74, 625 | "metadata": { 626 | "collapsed": false 627 | }, 628 | "outputs": [ 629 | { 630 | "data": { 631 | "text/plain": [ 632 | "356134.44317092974" 633 | ] 634 | }, 635 | "execution_count": 74, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "simple_predictions[0]" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "### RSS function" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 44, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "def RSS (predicted_output, true_output):\n", 660 | " difference = true_output - predicted_output\n", 661 | " squared_difference = difference * difference\n", 662 | " sum_of_squared_difference = squared_difference.sum()\n", 663 | " return (sum_of_squared_difference)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 56, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "277000.0" 677 | ] 678 | }, 
679 | "execution_count": 56, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "output[5000]" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "### RSS for Simple model" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 61, 698 | "metadata": { 699 | "collapsed": false 700 | }, 701 | "outputs": [ 702 | { 703 | "name": "stdout", 704 | "output_type": "stream", 705 | "text": [ 706 | "Residual sum of squares error for Simple model: 2.75400047593e+14\n" 707 | ] 708 | } 709 | ], 710 | "source": [ 711 | "rss = RSS(simple_predictions, test_output)\n", 712 | "print \"Residual sum of squares error for Simple model: \" +str(rss)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "# Running a multiple regression" 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "Now we will use more than one actual feature. Use the following code to produce the weights for a second model with the following parameters:" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 62, 732 | "metadata": { 733 | "collapsed": false 734 | }, 735 | "outputs": [], 736 | "source": [ 737 | "model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. \n", 738 | "my_output = 'price'\n", 739 | "(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)\n", 740 | "initial_weights = np.array([-100000., 1., 1.])\n", 741 | "step_size = 4e-12\n", 742 | "tolerance = 1e9" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "Use the above parameters to estimate the model weights. Record these values for your quiz." 
750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 64, 755 | "metadata": { 756 | "collapsed": false 757 | }, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "[ -9.99999688e+04 2.45072603e+02 6.52795277e+01]\n" 764 | ] 765 | } 766 | ], 767 | "source": [ 768 | "multiple_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size, tolerance)\n", 769 | "print multiple_weights" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": {}, 775 | "source": [ 776 | "### Get predictions for test data using new weights (Multiple regression model)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 66, 782 | "metadata": { 783 | "collapsed": true 784 | }, 785 | "outputs": [], 786 | "source": [ 787 | "(test_multiple_feature_matrix, test_multiple_output) = get_numpy_data(test_data, model_features, my_output)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 68, 793 | "metadata": { 794 | "collapsed": false 795 | }, 796 | "outputs": [ 797 | { 798 | "name": "stdout", 799 | "output_type": "stream", 800 | "text": [ 801 | "[ 366651.41203656 762662.39786164 386312.09499712 ..., 682087.39928241\n", 802 | " 585579.27865729 216559.20396617]\n" 803 | ] 804 | } 805 | ], 806 | "source": [ 807 | "multiple_predictions = predict_output(test_multiple_feature_matrix, multiple_weights)\n", 808 | "print multiple_predictions" 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "**What is the predicted price for the 1st house in the TEST data set for model 2?**" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 73, 821 | "metadata": { 822 | "collapsed": false 823 | }, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "text/plain": [ 828 | "366651.41203655908" 829 | ] 830 | }, 831 | "execution_count": 73, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "multiple_predictions[0]" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "**What is the actual price for the 1st house in the test data set?**" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 72, 850 | "metadata": { 851 | "collapsed": false 852 | }, 853 | "outputs": [ 854 | { 855 | "data": { 856 | "text/plain": [ 857 | "310000.0" 858 | ] 859 | }, 860 | "execution_count": 72, 861 | "metadata": {}, 862 | "output_type": "execute_result" 863 | } 864 | ], 865 | "source": [ 866 | "test_multiple_output[0]" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": {}, 872 | "source": [ 873 | "# So the simple model is more closer to the actual price of the house 1" 874 | ] 875 | }, 876 | { 877 | "cell_type": "markdown", 878 | "metadata": {}, 879 | "source": [ 880 | "RSS for Multiple regression model" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": 79, 886 | "metadata": { 887 | "collapsed": false 888 | }, 889 | "outputs": [ 890 | { 891 | "name": "stdout", 892 | "output_type": "stream", 893 | "text": [ 894 | "Residual sum of squares error for Multiple regression model: 2.70263446465e+14\n" 895 | ] 896 | } 897 | ], 898 | "source": [ 899 | "rss_multiple = RSS(multiple_predictions, test_multiple_output)\n", 900 | "print \"Residual sum of squares error for Multiple regression model: \" +str(rss_multiple)" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 
905 | "metadata": {}, 906 | "source": [ 907 | "# The multiple regression model has lower RSS than Simple model" 908 | ] 909 | } 910 | ], 911 | "metadata": { 912 | "kernelspec": { 913 | "display_name": "Python 2", 914 | "language": "python", 915 | "name": "python2" 916 | }, 917 | "language_info": { 918 | "codemirror_mode": { 919 | "name": "ipython", 920 | "version": 2 921 | }, 922 | "file_extension": ".py", 923 | "mimetype": "text/x-python", 924 | "name": "python", 925 | "nbconvert_exporter": "python", 926 | "pygments_lexer": "ipython2", 927 | "version": "2.7.13" 928 | } 929 | }, 930 | "nbformat": 4, 931 | "nbformat_minor": 0 932 | } 933 | -------------------------------------------------------------------------------- /nearest-neighbor-regression/nearest-neighbor-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Predicting house prices using k-nearest neighbors regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire up GraphLab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 187, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 188, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "sales = graphlab.SFrame('kc_house_data_small.gl/')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Explore the house sales data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 189, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront
71293005202014-10-13 00:00:00+00:002219003.01.01180.056501.00
\n", 87 | "\n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | "
viewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelat
03711800195509817847.51123398
\n", 111 | "\n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | "
longsqft_living15sqft_lot15
-122.256775361340.05650.0
\n", 123 | "[1 rows x 21 columns]
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | "Columns:\n", 128 | "\tid\tstr\n", 129 | "\tdate\tdatetime\n", 130 | "\tprice\tint\n", 131 | "\tbedrooms\tfloat\n", 132 | "\tbathrooms\tfloat\n", 133 | "\tsqft_living\tfloat\n", 134 | "\tsqft_lot\tint\n", 135 | "\tfloors\tfloat\n", 136 | "\twaterfront\tint\n", 137 | "\tview\tint\n", 138 | "\tcondition\tint\n", 139 | "\tgrade\tint\n", 140 | "\tsqft_above\tint\n", 141 | "\tsqft_basement\tint\n", 142 | "\tyr_built\tint\n", 143 | "\tyr_renovated\tint\n", 144 | "\tzipcode\tstr\n", 145 | "\tlat\tfloat\n", 146 | "\tlong\tfloat\n", 147 | "\tsqft_living15\tfloat\n", 148 | "\tsqft_lot15\tfloat\n", 149 | "\n", 150 | "Rows: 1\n", 151 | "\n", 152 | "Data:\n", 153 | "+------------+---------------------------+--------+----------+-----------+\n", 154 | "| id | date | price | bedrooms | bathrooms |\n", 155 | "+------------+---------------------------+--------+----------+-----------+\n", 156 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900 | 3.0 | 1.0 |\n", 157 | "+------------+---------------------------+--------+----------+-----------+\n", 158 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 159 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 160 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 161 | "| 1180.0 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 |\n", 162 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 163 | "+---------------+----------+--------------+---------+-------------+\n", 164 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 165 | "+---------------+----------+--------------+---------+-------------+\n", 166 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 167 | "+---------------+----------+--------------+---------+-------------+\n", 168 | "+---------------+---------------+-----+\n", 169 | "| long | sqft_living15 | ... |\n", 170 | "+---------------+---------------+-----+\n", 171 | "| -122.25677536 | 1340.0 | ... 
|\n", 172 | "+---------------+---------------+-----+\n", 173 | "[1 rows x 21 columns]" 174 | ] 175 | }, 176 | "execution_count": 189, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "sales[0:1]" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### Import Numpy" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 190, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "import numpy as np" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Function to convert sframe to numpy matrix and array" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 191, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "def get_numpy_data(data_sframe, features, output): \n", 219 | " data_sframe['constant'] = 1 # new constant column in the sframe signifying intercept\n", 220 | " \n", 221 | " features = ['constant'] + features # prepend constant to features list\n", 222 | " \n", 223 | " features_sframe = data_sframe[features] # new sframe selecting columns from data_sframe mentioned in features list\n", 224 | "\n", 225 | " feature_matrix = features_sframe.to_numpy() # convert sframe to numpy matrix\n", 226 | "\n", 227 | " output_sarray = data_sframe['price'] # an sarray consisting of the output column\n", 228 | "\n", 229 | " output_array = output_sarray.to_numpy() # converts sarray to a numpy array\n", 230 | "\n", 231 | " return(feature_matrix, output_array)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Function to normalize features of the matrix" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 192, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "def normalize_features(features_matrix):\n", 250 | " norms = np.linalg.norm(features_matrix, axis=0)\n", 251 | " normalized_features = features_matrix / norms\n", 252 | " return(normalized_features, norms)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "### Split data into training, test, and validation sets" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 193, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "(train_and_validation, test) = sales.random_split(.8, seed=1) # initial train/test split\n", 271 | "(train, validation) = train_and_validation.random_split(.8, seed=1) # split training set into training and validation sets" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Feature list" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 194, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "feature_list = ['bedrooms', \n", 290 | " 'bathrooms', \n", 291 | " 'sqft_living', \n", 292 | " 'sqft_lot', \n", 293 | " 'floors',\n", 294 | " 'waterfront', \n", 295 | " 'view', \n", 296 | " 'condition', \n", 297 | " 'grade', \n", 298 | " 'sqft_above', \n", 299 | " 'sqft_basement',\n", 300 | " 'yr_built', \n", 301 | " 'yr_renovated', \n", 302 | " 'lat', \n", 303 | " 'long', \n", 304 | " 'sqft_living15', \n", 305 | " 'sqft_lot15']" 306 | ] 307 | }, 308 | { 309 | "cell_type": 
"markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Convert sframe datasets into numpy matrix and output numpy array" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 195, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "features_train, output_train = get_numpy_data(train, feature_list, 'price')\n", 324 | "features_test, output_test = get_numpy_data(test, feature_list, 'price')\n", 325 | "features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "In computing distances, it is crucial to normalize features. Otherwise, for example, the `sqft_living` feature (typically on the order of thousands) would exert a much larger influence on distance than the `bedrooms` feature (typically on the order of ones). We divide each column of the training feature matrix by its 2-norm, so that the transformed column has unit norm.\n", 333 | "\n", 334 | "The features in the test and validation sets must be divided by the same norms used to divide features of train set, so that the training, test, and validation sets are normalized consistently." 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### Normalize features" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 196, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "features_train, norms = normalize_features(features_train) # normalize training set features (columns)\n", 353 | "features_test = features_test / norms # normalize test set by training set norms\n", 354 | "features_valid = features_valid / norms # normalize validation set by training set norms" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "## Compute a single distance" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "* Lets compute distance between house no.1 from test set and house no.10 from train set\n", 369 | "* The features associated with both these houses seen in the 18-dimensional vector have values between 0 to 1" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 197, 375 | "metadata": { 376 | "collapsed": false 377 | }, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "[ 0.01345102 0.01551285 0.01807473 0.01759212 0.00160518 0.017059 0.\n", 384 | " 0.05102365 0.0116321 0.01564352 0.01362084 0.02481682 0.01350306\n", 385 | " 0. 0.01345386 -0.01346927 0.01375926 0.0016225 ]\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "house_1 = features_test[0]\n", 391 | "print house_1" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 198, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "[ 0.01345102 0.01163464 0.00602491 0.0083488 0.00050756 0.01279425\n", 406 | " 0. 0. 0.01938684 0.01390535 0.0096309 0.\n", 407 | " 0.01302544 0. 
0.01346821 -0.01346254 0.01195898 0.00156612]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "house_2 = features_train[9]\n", 413 | "print house_2" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 199, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "0.0597235937167\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "distance = np.sqrt(np.sum((house_1-house_2)**2))\n", 433 | "print distance" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Compute multiple distances" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "To do nearest neighbor regression, we need to compute the distance between our query house and *all* houses in the training set. " 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### Function to calculate euclidean distance" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 200, 460 | "metadata": { 461 | "collapsed": false 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "def euclidean_distance(query_house, houses):\n", 466 | " distance_list = []\n", 467 | " for house in houses:\n", 468 | " distance = np.sqrt(np.sum((query_house-house)**2))\n", 469 | " distance_list.append(distance)\n", 470 | " return(distance_list)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "* Lets compute distance for query house no.1 from test set against first 10 houses of the train set" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 201, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "[0.0602747091729555, 0.085468811488270832, 0.061499464371202843, 0.053402739788200579, 0.058444840639381393, 0.059879215101840008, 0.054631404972615261, 0.055431083241597921, 0.052383627840972731, 0.059723593716661257]\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "distance_list = euclidean_distance(features_test[0], features_train[0:10])\n", 497 | "print distance_list" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "Among the first 10 training houses, the closest house to the test query house is:" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 222, 510 | "metadata": { 511 | "collapsed": false 512 | }, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "Train house number: 8\n", 519 | "Distance of test query house: 0.052383627841\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "distance, train_house_number = min((val, idx) for (idx, val) in enumerate(distance_list))\n", 525 | "print \"Train house number: \" ,train_house_number\n", 526 | "print \"Distance of test query house: \" ,distance" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "## Perform 1-nearest neighbor regression\n", 534 | "\n", 535 | "Looping to calculate distance is not efficient in python so let us use two single line expressions instead of using the previously defined euclidean distance function to calculate distance of 1 test house from all train houses.\n", 536 | "\n", 537 | "First step is to calculate the difference between 
the features of each training house and the query (test) house" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 203, 543 | "metadata": { 544 | "collapsed": false 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "diff = features_train[0:len(features_train)] - features_test[0]" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "Second step is to take these feature-by-feature differences in `diff`, square each, take their sum and finally perform square root" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 204, 561 | "metadata": { 562 | "collapsed": false 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "distances = np.sqrt(np.sum(diff**2, axis=1))" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "### Function that computes the distances from a query house to all training houses\n", 574 | "We will use the previous two single line expressions and modify them so that they can be used in this function to calculate the distance" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 205, 580 | "metadata": { 581 | "collapsed": false 582 | }, 583 | "outputs": [], 584 | "source": [ 585 | "def compute_distances(features_instances, features_query):\n", 586 | " diff = features_instances[0:len(features_instances)] - features_query\n", 587 | " distances = np.sqrt(np.sum(diff**2, axis=1))\n", 588 | " return(distances)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "### Compute 1 nearest neighbor regression for a single query house" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 206, 601 | "metadata": { 602 | "collapsed": false 603 | }, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "[ 0.01954476 0.06861035 0.02165079 ..., 0.02433478 0.02622734\n", 610 | " 0.02637942]\n" 611 | ] 612 | } 613 | ], 614 | "source": [ 615 | "query_house = features_test[2]\n", 616 | "distances = compute_distances(features_train, query_house)\n", 617 | "print distances" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": { 623 | "collapsed": true 624 | }, 625 | "source": [ 626 | "Closest house to the query house" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 223, 632 | "metadata": { 633 | "collapsed": false 634 | }, 635 | "outputs": [ 636 | { 637 | "name": "stdout", 638 | "output_type": "stream", 639 | "text": [ 640 | "Train house number: 382\n", 641 | "Distance of test query house: 0.00286049526751\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "distance, train_house_number = min((val, idx) for (idx, val) in enumerate(distances))\n", 647 | "print \"Train house number: \" ,train_house_number\n", 648 | "print \"Distance of test query house: \" ,distance" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "### Predicted value of the query house" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 226, 661 | "metadata": { 662 | "collapsed": false 663 | }, 664 | "outputs": [ 665 | { 666 | "name": "stdout", 667 | "output_type": "stream", 668 | "text": [ 669 | "249000\n" 670 | ] 671 | } 672 | ], 673 | "source": [ 674 | "predicted_value = output_train[382]\n", 675 | "print predicted_value" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | 
"source": [ 682 | "## Perform k-nearest neighbor regression" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": {}, 688 | "source": [ 689 | "### Function to calculate k-nearest neighbor of a single query house" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 227, 695 | "metadata": { 696 | "collapsed": true 697 | }, 698 | "outputs": [], 699 | "source": [ 700 | "def compute_k_distances(k, features_matrix, feature_vector):\n", 701 | " all_distances = compute_distances(features_matrix, feature_vector)\n", 702 | " house_numbers = np.argsort(all_distances) # sorts distances in ascending order and inserts their indexes in the array\n", 703 | " return house_numbers[0:k]" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### Compute k-nearest neighbors of a single query house" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 228, 716 | "metadata": { 717 | "collapsed": false 718 | }, 719 | "outputs": [ 720 | { 721 | "name": "stdout", 722 | "output_type": "stream", 723 | "text": [ 724 | "[ 382 1149 4087 3142]\n" 725 | ] 726 | } 727 | ], 728 | "source": [ 729 | "query_house = features_test[2]\n", 730 | "house_numbers = compute_k_distances(4, features_train, query_house)\n", 731 | "print house_numbers # four closest houses from the training set to the house present in the test set" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": {}, 737 | "source": [ 738 | "## Make a single prediction by averaging k nearest neighbor outputs" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": { 744 | "collapsed": true 745 | }, 746 | "source": [ 747 | "### Function to predict price of a house using k-nearest neighbors" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 211, 753 | "metadata": { 754 | "collapsed": true 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "def predict_price(k, features_matrix, feature_vector, prices):\n", 759 | " all_distances = compute_distances(features_matrix, feature_vector)\n", 760 | " house_numbers = np.argsort(all_distances) # sorts distances in ascending order and inserts their indexes in the array\n", 761 | " k_house_numbers = house_numbers[0:k] # closest k houses to the query house\n", 762 | " total_price = 0\n", 763 | " for house_number in k_house_numbers:\n", 764 | " total_price += prices[house_number] # sum prices of all the k closest houses\n", 765 | " predicted_price = total_price / k # average out the total price\n", 766 | " return(predicted_price)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "markdown", 771 | "metadata": {}, 772 | "source": [ 773 | "### Compute price for a single query house" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 212, 779 | "metadata": { 780 | "collapsed": false 781 | }, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "413987\n" 788 | ] 789 | } 790 | ], 791 | "source": [ 792 | "predicted_price = predict_price(4, features_train, features_test[2], output_train)\n", 793 | "print predicted_price" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "On comparing price obtained using 4-nearest neighbors to the price obtained using 1-nearest neighbor computed earlier of house number 3 of test set, it is clear that the 4-nearest neighbors gives us a much reasonable price estimate." 
801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "### Function to predict prices of multiple houses using k-nearest neighbors" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 229, 813 | "metadata": { 814 | "collapsed": false 815 | }, 816 | "outputs": [], 817 | "source": [ 818 | "def predict_prices(k, features_matrix, features_matrix_query, prices):\n", 819 | " predicted_prices = []\n", 820 | " for i in range(0, len(features_matrix_query)):\n", 821 | " features_array = features_matrix_query[i]\n", 822 | " predicted_price = predict_price(k, features_matrix, features_array, prices)\n", 823 | " predicted_prices.append(predicted_price)\n", 824 | " return(predicted_prices)" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "### Computer prices for first 10 houses of the test set with k = 10" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": 230, 837 | "metadata": { 838 | "collapsed": false 839 | }, 840 | "outputs": [ 841 | { 842 | "name": "stdout", 843 | "output_type": "stream", 844 | "text": [ 845 | "[881300, 431860, 460595, 430200, 766750, 667420, 350032, 512800, 484000, 457235]\n" 846 | ] 847 | } 848 | ], 849 | "source": [ 850 | "predicted_prices = predict_prices(10, features_train, features_test[0:10], output_train)\n", 851 | "print predicted_prices" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "### House with the lowest predicted price from the query set" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 231, 864 | "metadata": { 865 | "collapsed": false 866 | }, 867 | "outputs": [ 868 | { 869 | "name": "stdout", 870 | "output_type": "stream", 871 | "text": [ 872 | "The house number is: 6\n", 873 | "The predicted house price is: 350032\n" 874 | ] 875 | } 876 | ], 877 | "source": [ 878 | "house_number = predicted_prices.index(min(predicted_prices))\n", 879 | "print \"The house number is: \" ,house_number\n", 880 | "print \"The predicted house price is: \" , min(predicted_prices)" 881 | ] 882 | }, 883 | { 884 | "cell_type": "markdown", 885 | "metadata": {}, 886 | "source": [ 887 | "## Choosing the best value of k using a validation set" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 232, 893 | "metadata": { 894 | "collapsed": false 895 | }, 896 | "outputs": [], 897 | "source": [ 898 | "rss_all = []\n", 899 | "for k in range(1, 16):\n", 900 | " predicted_prices = predict_prices(k, features_train, features_valid, output_train)\n", 901 | " residual = predicted_prices - output_valid\n", 902 | " rss = (residual*residual).sum()\n", 903 | " rss_all.append(rss)" 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "metadata": {}, 909 | "source": [ 910 | "### Best value of k that reported lowest RSS" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 233, 916 | "metadata": { 917 | "collapsed": false 918 | }, 919 | "outputs": [ 920 | { 921 | "name": "stdout", 922 | "output_type": "stream", 923 | "text": [ 924 | "7\n" 925 | ] 926 | } 927 | ], 928 | "source": [ 929 | "k = rss_all.index(min(rss_all))\n", 930 | "print k" 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": { 936 | "collapsed": false 937 | }, 938 | "source": [ 939 | "Visualize the performance as a function of `k`, plot the RSS on the VALIDATION set for each considered `k` value:" 940 | ] 941 | }, 942 | { 943 | 
"cell_type": "code", 944 | "execution_count": 234, 945 | "metadata": { 946 | "collapsed": false 947 | }, 948 | "outputs": [ 949 | { 950 | "data": { 951 | "text/plain": [ 952 | "[]" 953 | ] 954 | }, 955 | "execution_count": 234, 956 | "metadata": {}, 957 | "output_type": "execute_result" 958 | }, 959 | { 960 | "data": { 961 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEGCAYAAACJnEVTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XuYVNWZ7/Hvr0W8gHfUcNEG2vsdNKgQTRnHiCZq4mQy\nKM4E4y0XiXMyTtQkpJun84w642QSL2fymBgZJyi5mRNz4iSQoz0RFcWAgApya1tuY2JEBBIzSL/n\nj10NRXd1V3VT1bu66/d5nnq69t5r7/327d2r1lp7bUUEZmZWHWrSDsDMzHqPk76ZWRVx0jczqyJO\n+mZmVcRJ38ysijjpm5lVkYpK+pIekPSGpMVFlD1H0m8lbZN0eZ7t+0laI+nu8kRrZtb3VFTSBx4E\nLiyybAvwKWBmJ9sbgf8qRVBmZv1FRSX9iJgLbMxdJ2m0pP+UNF/Sf0k6Jlv29Yh4Cehwd5mk04HD\ngNm9EbeZWV9RUUm/E/cDN0bE+4F/AP6tq8KSBNwF3Ayo/OGZmfUdA9IOoCuSBgHjgR9lkznAngV2\n+xzwi4hYn93Fid/MLKuikz7JJ5GNETG2G/ucDXxA0ueA/YA9JW2OiC+XJUIzsz6kYPNOoRE1ko6V\n9IykdyV9sd22iZKWSVou6ZYiY1L2RURsBpolfSLnmKd0sg/Zfa6KiJERMZqkiechJ3wzs0QxbfqF\nRtT8AZgK/HPuSkk1wL3ZfU8ErpB0XFcnkvQw8AxwjKTXJV0NTAaukfSipJeAS7Nlz5C0BvgE8G1J\nS4r4XszMqpqKmVpZUi3w84jIV8tuK1MPbI6Ib2SXzwLqI+Ki7PKtQETEnSWJ3MzMuq2co3eGA2ty\nltdm15mZWUr6wpBNMzMrkXKO3lkHHJmzPCK7Li9JfoSXmVk3RUS3hqUXW9PfMaKmiHJt5gNHSaqV\nNBCYBDzW1c4RUdGv+vr61GNwnI7TcTrOtldPFKzpZ0fUZIBDJL0O1AMDkxwd90s6HHiBZEx8q6Sb\ngBMiYoukG0mmQqgBHoiIpT2K0szMSqJg0o+IKwtsfwM4opNtvwSO7VloZmZWau7I7YZMJpN2CEVx\nnKXlOEvLcaarqHH6vUFSVEosZmZ9gSSiTB25ZmbWDzjpm5lVESd9M7Mq4qRvZlZFnPTNzKqIk76Z\nWRVx0jczqyJO+mZmVcRJ38ysijjpm5lVESd9M7Mq4qRvZlZFnPTNzKpIOR+XmJrm5hamTZvBunWt\nDB9eQ2PjFEaNqk07LDOz1BWcWlnSA8BHgTci4pROytwNXARsBa6OiIXZ9a8Bm4BWYFtEjOviPCWZ\nWrm5uYULLriHVaumA4OArdTV1TNnzlQnfjPrV8o1tfKDwIVdnPQioC4ijgZuAP4tZ3MrkImIMV0l\n/FKaNm1GTsIHGMSqVdOZNm1Gb5zezKyiFUz6ETEX2NhFkcuAh7JlnwMOyD43F5IHpfdqv8G6da3s\nTPhtBrF+fWtvhmFmVpFKkZCHA2tyltdl1wEEMEfSfEnXleBchYMZXkPSypRrK8OGuc/azKzcHbkT\nImKDpENJkv/S7CeHvBoaGna8z2QyPXpGZWPjFObNq+/Qpt/YOLXbxzIzqyRNTU00NTXt1jGKekau\npFrg5/k6ciV9G3gyIn6QXV4GfDAi3mhXrh7YHBHf6OQcJXtGbnNzC1deOYPVq1u54AKP3jGz/qkn\nHbnFJv2RJEn/5DzbLgY+HxEfkXQW8M2IOEvSvkBNRGyRNAiYDUyPiNmdnKOkD0ZfvBg++UlYtqxk\nhzQzqyg9SfoFm3ckPQxkgEMkvQ7UAwOBiIj7I+JxSRdLWkl2yGZ218OBn0qK7Hlmdpbwy+H44+H1\n12HLFhg8uLfOamZW2Yqq6feGUtf0Ac44A+6+G8aPL+lhzcwqQrnG6fdZY8bAwoVpR2FmVjmc9M3M\nqki/T/oLFqQdhZlZ5ejXbfpbt8KQIbBpEwwcWNJDm5mlzm367QwaBCNHwiuvpB2JmVll6NdJH2Ds\nWLfrm5m16fdJ3525ZmY7OembmVWRft2RC/DWW0m7/ttvQ02/v8SZWTVxR24eBx8MBx0Eq1alHYmZ\nWfr6fdIHN/GYmbVx0jczqyJO+mZmVaSqkn6F9FmbmaWmKpL+iBGwfTts2JB2JGZm6aqKpC+5icfM\nDIpI+pIekPSGpMVdlLlb0gpJL0o6LWf9REnLJC2XdEupgu4JJ30zs+Jq+g8CF3a2UdJFQF1EHA3c\nAHw7u74GuDe774nAFZKO2+2Ie8jTLJuZFZH0I2IusLGLIpcBD2XLPgccIOlwYBywIiJaImIbMCtb\nNhWu6ZuZlaZNfziwJmd5bXZdZ+tTcfTR8OabsLGry5eZWT83oAzH7NY8ELkaGhp2vM9kMmQymRKE\nk9hjDzjlFHjxRTjvvJId1sys1zQ1NdHU1LRbxyhqwjVJtcDPI+KUPNu+DTwZET/ILi8DPgiMAhoi\nYmJ2/a1ARMSdnZyjLBOu5brxRhg9Gr74xbKexsysV5RzwjXReQ3+MeBvswGcBbwdEW8A84GjJNVK\nGghMypZNjdv1zazaFWzekfQwkAEOkfQ6UA8MJKm13x8Rj0u6WNJKYCtwNcnG7ZJuBGaTXFweiIil\nZfo+ijJmDPzrv6YZgZlZuvr9fPq5/vxnOPDAZI79ffYp66nMzMrO8+kXsNdecOyxsGRJ2pGYmaWj\nqpI+uF3fzKqbk76ZWRVx0jczqyJV1ZEL8M47MHQobNoEA8pxa5qZWS9xR24R9t8fhg2DV19NOxIz\ns95XdUkf3MRjZtWrKpP+2LGeZtnMqlNVJn3X9M2sWlVdRy7A736X3KT11lvJoxTNzPoid+QW6bDD\nYN994bXX0o7EzKx3VWXSBzfxmFl1ctI3M6siTvpmZlXESd/MrIpUbdIfORL++MdkJI+ZWbUoKulL\nmihpmaTlkm7Js/1ASY9KWiRpnqQTcra9ll2/UNLzpQx+d0hw2mmu7ZtZdSmY9CXVAPcCFwInAldI\nOq5dsS8DCyPiVOBTwN0521qBTESMiYhxpQm7NNzEY2bVppia/jhgRUS0RMQ2YBZwWbsyJwBPAETE\nq8BISYdmt6nI8/Q6J30zqzbFJOPhwJqc5bXZdbkWAZcDSBoHHAmMyG4LYI6k+ZKu271wS8tJ38yq\nTalmlL8D+JakBcAS
YCGwPbttQkRsyNb850haGhFz8x2koaFhx/tMJkMmkylRePkddxysWwebN8N+\n+5X1VGZmu62pqYmmpqbdOkbBuXcknQU0RMTE7PKtQETEnV3s0wycHBFb2q2vBzZHxDfy7NNrc+/k\nOvNM+Jd/gQ98oNdPbWa2W8o198584ChJtZIGApOAx9qd+ABJe2bfXwf8V0RskbSvpMHZ9YOADwMv\ndSfAcnMTj5lVk4LNOxGxXdKNwGySi8QDEbFU0g3J5rgfOB74d0mtwMvANdndDwd+Kimy55oZEbPL\n8Y301JgxMG9e2lGYmfWOqpxaOdfzz8P118OLL/b6qc3MdktPmneqPun/6U9w8MHw9tuw1169fnoz\nsx7zfPo9sM8+UFcHL7+cdiRmZuVX9Ukf3JlrZtXDSR8nfTOrHk76OOmbWfWo+o5cgI0b4cgjk87c\nPfZIJQQzs25zR24PHXQQDBkCK1emHYmZWXk56We5icfMqoGTfpaTvplVAyf9LCd9M6sGTvpZY8cm\nSb9C+rXNzMrCST9r6FCoqUnm1zcz66+c9LMkN/GYWf/npJ9jzBhYsCDtKMzMysdJP4dr+mbW3xWV\n9CVNlLRM0nJJt+TZfqCkRyUtkjRP0gnF7ltJnPTNrL8r5hm5NcBy4HxgPcnjEydFxLKcMv9E8uzb\nRknHAvdFxF8Us2/OMVKbhqFNaysceCA0N8Mhh6QaiplZQeWahmEcsCIiWiJiGzALuKxdmROAJwAi\n4lVgpKRDi9y3YtTUwKmn+ilaZtZ/FZP0hwNrcpbXZtflWgRcDiBpHHAkMKLIfSuKm3jMrD8rVUfu\nHcBBkhYAnwcWAttLdOxe5aRvZv3ZgCLKrCOpubcZkV23Q0RsBj7dtiypGVgN7Fto31wNDQ073mcy\nGTKZTBHhldaYMfDP/9zrpzUzK6ipqYmmpqbdOkYxHbl7AK+SdMZuAJ4HroiIpTllDgD+GBHbJF0H\nTIiIKcXsm3OM1DtyAf7nf5LO3DffhH33TTsaM7POlaUjNyK2AzcCs4GXgVkRsVTSDZKuzxY7HnhJ\n0lLgQuCmrvbtToC9beBAOO44WLw47UjMzErPT87K49pr4fTT4bOfTTsSM7PO+clZJeLOXDPrr5z0\n83DSN7P+ys07eWzZAocfnjwofc89047GzCw/N++UyODBcMQRsKzDZBFmZn2bk34nPM2ymfVHTvqd\ncLu+mfVHTvqdcNI3s/7IHbmdePNNqKuDjRuT2TfNzCqNO3JLaMgQ2H//ZG59M7P+wkm/C27iMbP+\nxkm/C076ZtbfOOl3wUnfzPobJ/0uOOmbWX/jpN+FI49M5tf/7/9OOxIzs9Jw0u+C5Nq+mfUvTvoF\nOOmbWX9SVNKXNFHSMknLJd2SZ/v+kh6T9KKkJZKm5Gx7TdIiSQslPV/C2HuFk76Z9SfFPCO3BlhO\n8pzb9cB8YFJELMspcxuwf0TcJmkIyXNxD4+I9yStBk6PiI0FzlNRd+S2WboULrkEVq5MOxIzs12V\n647cccCKiGiJiG3ALOCydmUC2C/7fj/gDxHxXltcRZ6nIh1zTNKRu2lT2pGYme2+YpLxcGBNzvLa\n7Lpc9wInSFoPLCL7YPSsAOZImi/put0JNg177AEnnwyLFqUdiZnZ7itVDfxCYGFEDAPGAPdJGpzd\nNiEixgIXA5+X9IESnbPXeG59M+svBhRRZh1wZM7yiOy6XFcDtwNExCpJzcBxwAsRsSG7/veSfkrS\nXDQ334kaGhp2vM9kMmQymaK+iXIbMwbm5o3YzKz3NDU10dTUtFvHKKYjdw+SjtnzgQ3A88AVEbE0\np8x9wO8iYrqkw4EXgFOBd4GaiNgiaRAwG5geEbPznKciO3IBXngBPv1pWLw47UjMzHbqSUduwZp+\nRGyXdCNJwq4BHoiIpZJuSDbH/cDXgRmS2tLilyLiLUmjgJ9Kiuy5ZuZL+JXupJNgxQp4913Ye++0\nozEz6zk/RKVIp5wC3/senHFG2pGYmSV6UtN30i9Cc3ML558/g4EDWznjjBoaG6cwalRt2mGZWZVz\n0i+D5uYWLrjgHlatmg4MArZSV1fPnDlTnfjNLFV+XGIZTJs2IyfhAwxi1arpTJs2I8WozMx6xkm/\ngHXrWtmZ8NsMYv361jTCMTPbLU76BQwfXgNsbbd2K8OG+UdnZn2PM1cBjY1TqKurZ2fiT9r0Gxun\npBaTmVlPuSO3CM3NLUybNoN161p54YUaHnpoCh//uDtxzSxdHr3TC26/HZqb4f77047EzKqdk34v\n2LABTjgB1qyBwYMLlzczKxcP2ewFQ4fCBz8IP/hB2pGYmXWfk34PXHcdfOc7aUdhZtZ9Tvo9cOGF\nsHYtLFmSdiRmZt3jpN8DAwYkUy1/97tpR2Jm1j3uyO2h115LZtxcu9bTLZtZOtyR24tGjoSxY+HR\nR9OOxMyseE76u+G669zEY2Z9S1FJX9JEScskLZd0S57t+0t6TNKLkpZImlLsvn3ZpZfCSy/BypVp\nR2JmVpxinpFbAywneUbuemA+MCkiluWUuQ3YPyJukzSE5Jm6hwOthfbNOUafatNvc/PNsOeeyZ26\nZma9qVxt+uOAFRHREhHbgFnAZe3KBLBf9v1+wB8i4r0i9+3TrrkGZsyAbdvSjsTMrLBikv5wYE3O\n8trsulz3AidIWg8sAm7qxr592vHHw1FHwS9+kXYkZmaFDSjRcS4EFkbEhyTVAXMkndLdgzQ0NOx4\nn8lkyGQyJQqvvK69NunQ/djH0o7EzPqzpqYmmpqadusYxbTpnwU0RMTE7PKtQETEnTll/i9we0Q8\nnV3+f8AtJBeVLvfNOUafbNMH2LoVjjgCFi+GESPSjsbMqkW52vTnA0dJqpU0EJgEPNauTAvwF9kg\nDgeOAVYXuW+fN2gQTJoEDz6YdiRmZl0r6o5cSROBb5FcJB6IiDsk3UBSa79f0lBgBjA0u8vtEfFI\nZ/t2co4+W9MHWLAALr8cVq+GGt/9YGa9wPPpp+z005Ohmx/+cNqRmFk18DQMKbv2Wk+5bGaVzTX9\nEtq0CWprYflyOOywtKMxs/7ONf2UHXBAMmzzP/4j7UjMzPJz0i+xtiaePv6hxcz6KSf9EpswASR4\n+um0IzEz68hJv8Qkd+iaWeVyR24Z/P73cPTRydO1Djww7WjMrL9yR26FOPTQZKz+ww+nHYmZ2a6c\n9MvET9Uys0rkpF8m558Pb70Fv/1t2pGYme3kpF8mNTXJA1Zc2zezSuKO3DJauxZOOQXWrElm4jQz\nKyV35FaYESNg/Hj40Y/SjsTMLOGkX2bu0DWzSuKkX2YXXwyrVsErr6QdiZmZk37Z7bknTJkCDzyQ\ndiRmZt17ctY32fn0qzvbbb8ZmAwEsCdwPDAkIt6W9BqwCWgFtkXEuE7O0e86ctusXJm07a9ZA3vt\nlXY0ZtZflOXJWZJqgOXA+cB6kufeToqIZZ2U/yjwdxHR9szc1cDpEbGxwHn6b
dIH+NCH4DOfgU9+\nMu1IzKy/KNfonXHAiohoiYhtwCzgsi7KXwE8khtXkefp1667zpOwmVn6iknGw4E1Octrs+s6kLQP\nMBH4Sc7qAOZImi/pup4G2td9/OOwcCE0N6cdiZlVswElPt4lwNyIeDtn3YSI2CDpUJLkvzQi5ubb\nuaGhYcf7TCZDJpMpcXjp2XtvuOqqpEP3619POxoz64uamppoamrarWMU06Z/FtAQEROzy7cC0b4z\nN7vtUeCHETGrk2PVA5sj4ht5tvXrNn2AJUtg4kRoaYEBpb7cmlnVKVeb/nzgKEm1kgYCk4DH8pz8\nAOCDwM9y1u0raXD2/SDgw8BL3QmwPzn5ZDjiCPjlL9OOxMyqVcGkHxHbgRuB2cDLwKyIWCrpBknX\n5xT9GPCriPhTzrrDgbmSFgLzgJ9HxOzShd/3uEPXzNLkCdd62ZYtSW3/5Zdh2LC0ozGzvswTrvUB\ngwfDX/0VzJiRdiRmVo1c00/B88/DFVfAihXJvPtmZj3hmn4f8f73JzX+J59MOxIzqzZO+imQ4Npr\nPeWymfU+N++kZOHCFs48cwbjxrUycmQNjY1TGDWqNu2wzKwPKcuEa72lmpJ+c3MLF1xwD6tWTQcG\nAVupq6tnzpypTvxmVjS36fcR06bNyEn4AINYtWo6U6bMYPVqaG1NMzoz6888GUAK1q1rZWfCbzOI\nl15q5dxzYdMmOOmk5A7ek09OHq5+8slw8MFdH7e5uYVp02awbl0rw4e7ycjMOnLST8Hw4TXAVnZN\n/Fu56KIavv99eOsteOmlZK6exYvhkUeS5f3263ghOP745MEs+ZqM5s1zk5GZ7cpt+inoSZt+RDJR\nW9uFYMmS5LV6NYwaBVu3Tuf112+m/YVk8uS7+P7363vhuzKz3taTNn3X9FMwalQtc+ZMZdq0u1i/\nvpVhw2pobOy6Ri7ByJHJ65JLdq7/859h2TK44or8TUbr17uDwMx2ctJPyahRtSWpge+1F5x6Kowd\nW8PSpR2bjF5/vYbVq2H06N0+lZn1Ax690080Nk6hrq6epK8AYCsjR9Zz0UVTGDcOrrkmaQoys641\nN7dw1VXTOe+8eq66ajrNzS0Vdbzd5Tb9fqRt9M7OJqNk9M7GjfDNb8J998Gll8JXvgJ1dWlHa1Z5\nSn0PTbnvyfHNWdaljRvhW9+Ce+9N+gW+8hU46qi0ozKrHFddNZ2ZMzsOiJg48S5uu62eLVvo1mvl\nyuls3ly+ARbuyLUuHXQQNDTA3/1dkvzPOgs+8hH46lfh6KPTjs4sHa2tyWCI556DJ5/MPyBi7txW\nvvrVZKLEfK/3va/juv32g89+tpXnnqusARZFJX1JE4FvkvQBPND++biSbgYmAwHsCRwPDImItwvt\na73vwAOhvh5uugnuvhvGj4eLLkqS/zHHpB2dWXmtX59Mb/7cc8nXF16AQw+FceNg6NAa1q/vOCDi\nssuSe2i666ijanjuuY7HGzYsxe7UiOjyRZKsVwK1JAn9ReC4Lsp/FPh1d/dNQrE0vP12RGNjxJAh\nEVddFbFsWdoRmRVn9erXYvLkhshkvhaTJzfE6tWv7bJ98+aIJ5+MuPPOiMsvjxgxIuKQQyIuuiii\nvj7i8ccjfv/7XY9XV/f3AVsiuTtmS9TV/X2H43YnvlIer71s3iyYx3NfBdv0JZ0F1EfERdnlW7Mn\nyltjlzQTeCIiHujOvm7TT98778A99ySdvhdemNT899rLUztYZcrXSTpiRD2f+cxUVq+u5fnnkxFr\np56a1OLPPDP5Onp0ct9LV8fNNyBid+Is5fFylaUjV9JfAhdGxPXZ5auAcRHxhTxl9wHWAnWRNO10\nZ18n/QrxzjtJZ+9dd7Wwbds9bNni2UCrTV+Yx+kv/3I6jz7asZN01Ki7uPnmes48M5mqZODAtCIs\nv0royL0EmBsRb/dk54aGhh3vM5kMmUymNFFZt+y/P3z5y7Bo0Qx++MOOs4HedttdzJrlqR0qRakT\ndCXO47R9O7z8Mjz9NDzzTPJqacnf6TpyZCuf+1waUZZfU1MTTU1Nu3eQQu0/wFnAL3OWbwVu6aTs\no8CkHu5bkjYuK51M5mvZdshdX9LX4v3vj/jCFyJmzYpoaYlobU072upU6jbj1taIv/7rhpzjxY7j\nTp7cUOLoO7dpU8Ts2RENDREXXBCx//4Rxx4bcfXVEd/5TsTLL0dceWX6caaNHrTpF1PTnw8cJakW\n2ABMAq5oX0jSAcAHSUbxdGtfq0ydzQb6yU/W8PnPJ7WtRx6BqVOTj9Bnn52MBDr7bBgzJpkiIp++\n0HTQV3T2bIYrr7yLv/mbZFz51q10+Jpv3ZYt8Mc/AuSvQT/+eCvXXANHHLHzNWJE8nW//QrH2tnv\nPQJee23XWvzKlTB2LEyYkPx9PfwwDBmy6/G+/vUpPPdcfYcbnxobp+7Oj7TfK5j0I2K7pBuB2ewc\ndrlU0g3J5rg/W/RjwK8i4k+F9i35d2Fl0dg4hXnzOv5T3X77VEaNgnPOScpFJB1mzz6b/MM+9BAs\nXw6nnbbzInD22TB0aGU2HfQ1774Lv/1t8rP+1a/yJ+jm5laWLEnGiw8atHMc+aBBXX/dd1/41Kdq\nmDmz48X+1FNrOPNMWLMG5s5Nvra9Bg7seCHIfb33XguXXrrr7/2Xv6znjDOmsmhRLVKS4MePhylT\nkr+dQm3xPZm40HxHrhXQ05EHmzfD/PlJYnr22eR1wAEQMZ2WFk8B3R1vvJH8HNtqwosWJc9RGD8e\nFiyYztNPl/bn2d2pAyKSu73bLgBr1+56QVi7Flavnk5ra8c4x4+/i5kz66mt7XpEjeXnaRisYrW2\nJrX/j3+8nmXLpnfYPnRoPVOnTmf0aHa8Dj64+ETQV5qMCsW5fTu88squTR1/+EPySamtJjxuXFIz\nbzteOeZ2KfUww/POq6epqePv/bzz6nniiY7rrTiVMHrHLK+aGjjuODj99BqWLevYdDBiRA0bN8KP\nfpQ0Fa1endQgR49OHhKTezEYPRpqa3f2GfSVJqN8cT7zTD3Tp0+lubmWZ56BefPgsMOSBH/OOXDr\nrcnPraaTGzjL1cRRqqm/23TWP5TqnanVqrs9v+V64dE7VaE7o03eeivihRcifvjDiDvuiLjhhmQk\nR11dxMCByd2V554bMXp03xjFMXly/jgPPbQhvvSliJ/9LOJ3v0s7yvIo952p1Yoyjd4xK5nu1EwP\nOghOPz15tffee21txfC5z+XvzHzmmVaeeCK5E3NQ+829ICJp0nrqKfjNb+DHP84f50kntXJnP5+R\nyp2ulcNJ33pdKZoOBgzY+fjIM86o4dVXOzYd7L13DdOm7ez4nDABPvCB5OvQobt1+ry2b0/O9dRT\nO197750005x7Lrz1Vg2/+EX1NnGUusnIesYdudbnFerMfPfdZCbFp59Ohho+/XTyKSL3InD88R3b\nzQt1uv75z8kIpd/8Jknwzz4Lw4Yl
Sb7tVVtbfJxm3eXRO1a1ujPapG3+9LYLwNy5yZDD8eN3XggO\nPbSFj3501wQ9alQ9X/vaVFasqOWpp2DBguRi0Zbgk/1KF6dZIU76Zj20YUNyAWi7CCxcOJ3t2zuO\nKz/ssLu4/vp6zjknGUZZzJ2oZuXiIZtmPTR0KHziE8kL4NxzW3nqqY6driee2EpjY6+HZ1Yy1dGD\nZNZNRx7ZNq48V/V0ulr/5b9gszwaG6dQV1fPzsTfNpnXlNRiMisFt+mbdcKdrlbp3JFrZlZFepL0\n3bxjZlZFnPTNzKpIUUlf0kRJyyQtl3RLJ2UykhZKeknSkznrX5O0KLvt+VIFbmZm3Vcw6UuqAe4F\nLgROBK6QdFy7MgcA9wEfjYiTgL/K2dwKZCJiTESMK1nkKdjtBxL3EsdZWo6ztBxnuoqp6Y8DVkRE\nS0RsA2YBl7UrcyXwk4hYBxARb+ZsU5HnqXh95Y/AcZaW4ywtx5muYpLxcGBNzvLa7LpcxwAHS3pS\n0nxJf5OzLYA52fXX7V64Zma2O0o1DcMAYCzwIZLJSp6V9GxErAQmRMQGSYeSJP+lETG3ROc1M7Nu\nKDhOX9JZQENETMwu30rytJY7c8rcAuwdEdOzy98F/jMiftLuWPXA5oj4Rp7zeJC+mVk3lWPCtfnA\nUZJqgQ3AJOCKdmV+BtwjaQ9gL+BM4BuS9gVqImKLpEHAh4G8T0HubuBmZtZ9BZN+RGyXdCMwm6QP\n4IGIWCrphmRz3B8RyyT9ClgMbAfuj4hXJI0CfpqtxQ8AZkbE7PJ9O2Zm1pWKmYbBzMzKL/WhlMXc\n+JU2SSMkPSHpZUlLJH0h7Zi6IqlG0gJJj6UdS2ckHSDpR5KWZn+uZ6YdU3uS/lf2ZsPFkmZKGph2\nTG0kPSDpDUmLc9YdJGm2pFcl/Sp7/0ylxfhP2d/5i5J+Imn/NGPMxtQhzpxtfy+pVdLBacTWLpa8\ncUqamv3G0spKAAADg0lEQVSZLpF0R6HjpJr0i7nxq0K8B3wxIk4EzgY+X6FxtrkJeCXtIAr4FvB4\nRBwPnAosTTmeXUgaBkwFxkbEKSTNk5PSjWoXD5L83+S6Ffh1RBwLPAHc1utR7SpfjLOBEyPiNGAF\n6ccI+eNE0gjgAqCl1yPKr0OckjLAJcDJEXEycFehg6Rd0y/mxq/URcR/R8SL2fdbSBJU+3sVKkL2\nD/Vi4Ltpx9KZbO3unIh4ECAi3ouId1IOK589gEGSBgD7AutTjmeH7LDnje1WXwb8e/b9vwMf69Wg\n2skXY0T8OiJas4vzgBG9Hlg7nfwsAf4V+IdeDqdTncT5WeCOiHgvW+bNDju2k3bSL+bGr4oiaSRw\nGvBcupF0qu0PtZI7a0YBb0p6MNsMdb+kfdIOKldErAf+BXgdWAe8HRG/Tjeqgg6LiDcgqagAh6Uc\nTyGfBv4z7SDykXQpsCYilqQdSwHHAOdKmpe9OfaMQjuknfT7FEmDgR8DN2Vr/BVF0keAN7KfSpR9\nVaK2m/nui4ixwB9JmiYqhqQDSWrOtcAwYLCkK9ONqtsq9sIv6SvAtoh4OO1Y2stWQL4M1OeuTimc\nQgYAB0XEWcCXgB8W2iHtpL8OODJneUR2XcXJfsT/MfAfEfGztOPpxATgUkmrgUeA8yQ9lHJM+awl\nqUW9kF3+MclFoJL8BbA6It6KiO3Ao8D4lGMq5A1JhwNIeh/wu5TjyUvSFJImyEq9iNYBI4FFkppJ\n8tJvJVXiJ6c1JH+bRMR8oFXSIV3tkHbS33HjV3ZkxCSgUkecfA94JSK+lXYgnYmIL0fEkRExmuRn\n+URE/G3acbWXbYJYI+mY7KrzqbyO59eBsyTtLUkkMVZUZzMdP809BkzJvv8UyU2TadslRkkTSZof\nL42IP6cWVUc74oyIlyLifRExOiJGkVRSxkREJVxE2//O/w/J9Ddk/5/2jIg/dHWAVJN+tgbVduPX\ny8CsiKi0fywkTQAmAx/KPhdgQfaP13ruC8BMSS+SjN75x5Tj2UVEPE/yCWQhsIjkH+3+VIPKIelh\n4BngGEmvS7oauAO4QNKrJBepgsP3UojxHmAwyTxcCyT97zRjhE7jzBVUQPNOJ3F+DxgtaQnwMFCw\nkuebs8zMqkjazTtmZtaLnPTNzKqIk76ZWRVx0jczqyJO+mZmVcRJ38ysijjpm5lVESd9M7Mq8v8B\nvQBp8jPaPM8AAAAASUVORK5CYII=\n", 962 | "text/plain": [ 963 | "" 964 | ] 965 | }, 966 | "metadata": {}, 967 | "output_type": "display_data" 968 | } 969 | ], 970 | "source": [ 971 | "import matplotlib.pyplot as plt\n", 972 | "%matplotlib inline\n", 973 | "\n", 974 | "kvals = range(1, 16)\n", 975 | "plt.plot(kvals, rss_all,'bo-')" 976 | ] 977 | } 978 | ], 979 | "metadata": { 980 | "kernelspec": { 981 | "display_name": "Python 2", 982 | "language": "python", 983 | "name": "python2" 984 | }, 985 | "language_info": { 986 | "codemirror_mode": { 987 | "name": "ipython", 988 | "version": 2 989 | }, 990 | "file_extension": ".py", 991 | "mimetype": "text/x-python", 992 | "name": "python", 993 | "nbconvert_exporter": "python", 994 | "pygments_lexer": "ipython2", 995 | "version": "2.7.13" 996 | } 997 | }, 998 | "nbformat": 4, 999 | "nbformat_minor": 0 1000 | } 1001 | -------------------------------------------------------------------------------- /ridge-regression/ridge-regression-gradient-descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Ridge regression using gradient descent on house sales data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 
13 | "source": [ 14 | "### Fire up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 91, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "sales = graphlab.SFrame('kc_house_data.gl/')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Function to convert SFrame to Numpy data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 92, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import numpy as np" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 93, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "def get_numpy_data(data_sframe, features, output):\n", 75 | " \n", 76 | " data_sframe['constant'] = 1 # new constant column in the sframe signifying intercept\n", 77 | " \n", 78 | " features = ['constant'] + features # prepend constant to features list\n", 79 | " \n", 80 | " features_sframe = data_sframe[features] # new sframe selecting columns from data_sframe mentioned in features list\n", 81 | "\n", 82 | " feature_matrix = features_sframe.to_numpy() # convert sframe to numpy matrix\n", 83 | "\n", 84 | " output_sarray = data_sframe[output] # an sarray consisting of the output column\n", 85 | "\n", 86 | " output_array = output_sarray.to_numpy() # converts sarray to a numpy array\n", 87 | "\n", 88 | " return(feature_matrix, output_array)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "source": [ 97 | "### Function to predict output given feature matrix and weight vector" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 94, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "def predict_output(feature_matrix, weights):\n", 109 | " predictions = np.dot(feature_matrix, weights)\n", 110 | " return(predictions)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Computing the Derivative" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "We are now going to move to computing the derivative of the regression cost function. The cost function is the sum over the data points of the squared difference between an observed output and a predicted output, plus the L2 penalty term.\n", 125 | "```\n", 126 | "Cost(w)\n", 127 | "= SUM[ (prediction - output)^2 ]\n", 128 | "+ l2_penalty*(w[0]^2 + w[1]^2 + ... + w[k]^2).\n", 129 | "```\n", 130 | "\n", 131 | "Since the derivative of a sum is the sum of the derivatives, we can take the derivative of the first part (the RSS) and add the derivative of the regularization part. 
The derivative of the RSS with respect to `w[i]` can be written as: \n", 132 | "```\n", 133 | "2*SUM[ error*[feature_i] ].\n", 134 | "```\n", 135 | "The derivative of the regularization term with respect to `w[i]` is:\n", 136 | "```\n", 137 | "2*l2_penalty*w[i].\n", 138 | "```\n", 139 | "Summing both, we get\n", 140 | "```\n", 141 | "2*SUM[ error*[feature_i] ] + 2*l2_penalty*w[i].\n", 142 | "```\n", 143 | "That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself, plus `2*l2_penalty*w[i]`. \n", 144 | "\n", 145 | "**We dont have to regularize the constant.** Thus, in the case of the constant, the derivative is just twice the sum of the errors (without the `2*l2_penalty*w[0]` term).\n", 146 | "\n", 147 | "Twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors, plus `2*l2_penalty*w[i]`.\n", 148 | "\n", 149 | "The following derivative function computes the derivative of the weight given the value of the feature (over all data points) and the errors (over all data points)." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Function to compute derivative of weight" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 95, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def feature_derivative_ridge(errors, feature, weight, l2_penalty, feature_is_constant):\n", 168 | " derivative = 0\n", 169 | " # If feature_is_constant is True, derivative is twice the dot product of errors and feature\n", 170 | " if feature_is_constant:\n", 171 | " total_error = errors.sum()\n", 172 | " derivative = 2 * total_error\n", 173 | " # Otherwise, derivative is twice the dot product plus 2*l2_penalty*weight\n", 174 | " else:\n", 175 | " dot_product = np.dot(errors, feature)\n", 176 | " rss_part = 2 * dot_product\n", 177 | " regularized_part = 2 * l2_penalty * weight\n", 178 | " derivative = rss_part + regularized_part\n", 179 | " return derivative" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "### Gradient Descent" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "Now we will write a function that performs a gradient descent. The basic premise is simple. Given a starting point we update the current weights by moving in the negative gradient direction. Recall that the gradient is the direction of *increase* and therefore the negative gradient is the direction of *decrease* and we're trying to *minimize* a cost function. \n", 194 | "\n", 195 | "The amount by which we move in the negative gradient *direction* is called the 'step size'. We stop when we are 'sufficiently close' to the optimum. \n", 196 | "\n", 197 | "We will set a **maximum number of iterations** and take gradient steps until we reach this maximum number. If no maximum number is supplied, the maximum should be set 100 by default." 
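A quick way to validate the derivative described above is to compare `feature_derivative_ridge` (as defined in the cell above) against a central finite-difference approximation of the cost on a tiny synthetic example. The arrays and `eps` below are made up purely for this check.

```python
import numpy as np

# Tiny synthetic problem: 3 houses, a constant column plus one feature.
feature_matrix = np.array([[1., 2.], [1., 3.], [1., 5.]])
output = np.array([3., 4., 7.])
weights = np.array([1., 1.5])
l2_penalty = 10.0

def ridge_cost(w):
    errors = np.dot(feature_matrix, w) - output
    return np.dot(errors, errors) + l2_penalty * np.sum(w[1:] ** 2)  # w[0] is not regularized

errors = np.dot(feature_matrix, weights) - output
analytic = feature_derivative_ridge(errors, feature_matrix[:, 1], weights[1], l2_penalty, False)

eps = 1e-6
w_plus, w_minus = weights.copy(), weights.copy()
w_plus[1] += eps
w_minus[1] -= eps
numeric = (ridge_cost(w_plus) - ridge_cost(w_minus)) / (2 * eps)

print(abs(analytic - numeric))  # ~0: the cost is quadratic, so the central difference is nearly exact
```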
198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Gradient descent algorithm with Ridge regression" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 96, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "def ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations=100):\n", 216 | " print 'Starting gradient descent with l2_penalty = ' + str(l2_penalty)\n", 217 | " \n", 218 | " weights = np.array(initial_weights) # a numpy array\n", 219 | " iteration = 0 # iteration counter\n", 220 | " print_frequency = 1 # for adjusting frequency of debugging output\n", 221 | " \n", 222 | " # while not reached maximum number of iterations \n", 223 | " while iteration < max_iterations: \n", 224 | " iteration += 1 # increment iteration counter\n", 225 | " \n", 226 | " ### === code section for adjusting frequency of debugging output. ===\n", 227 | " if iteration == 10:\n", 228 | " print_frequency = 10\n", 229 | " if iteration == 100:\n", 230 | " print_frequency = 100\n", 231 | " if iteration%print_frequency==0:\n", 232 | " print('Iteration = ' + str(iteration))\n", 233 | " ### === end code section ===\n", 234 | " \n", 235 | " # compute the predictions based on feature_matrix and weights using your predict_output() function\n", 236 | " predictions = predict_output(feature_matrix, weights)\n", 237 | "\n", 238 | " # compute the errors in prediction\n", 239 | " errors = predictions - output\n", 240 | "\n", 241 | " # from time to time, print the value of the cost function\n", 242 | " if iteration%print_frequency==0:\n", 243 | " print 'Cost function = ', str(np.dot(errors,errors) + l2_penalty*(np.dot(weights,weights) - weights[0]**2))\n", 244 | " \n", 245 | " for i in xrange(len(weights)): # loop over each weight\n", 246 | " \n", 247 | " # feature column associated with weights[i]\n", 248 | " feature_column = feature_matrix[:,i]\n", 249 | " \n", 250 | " # computing derivative of weight[i]\n", 251 | " if i == 0: # feature is constant\n", 252 | " derivative = feature_derivative_ridge(errors, feature_column, weights[i], l2_penalty, True)\n", 253 | " else: # feature is not constant\n", 254 | " derivative = feature_derivative_ridge(errors, feature_column, weights[i], l2_penalty, False)\n", 255 | " \n", 256 | " # subtracting the step size times the derivative from the current weight\n", 257 | " weights[i] = weights[i] - (step_size * derivative)\n", 258 | " \n", 259 | " print 'Done with gradient descent at iteration ', iteration\n", 260 | " print 'Learned weights = ', str(weights)\n", 261 | " return weights" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "### Visualizing effect of L2 penalty" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Simple model with no L2 penalty (No regularization)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 97, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "simple_features = ['sqft_living']\n", 287 | "my_output = 'price'" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Let us split the dataset into training set and test set. 
Make sure to use `seed=0`:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 98, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "train_data,test_data = sales.random_split(.8,seed=0)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Get numpy versions by using `get_numpy_data` of your data with only this feature, for both the `train_data` and the `test_data`. " 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 99, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)\n", 324 | "(simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "Let's set the parameters for our optimization:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 100, 337 | "metadata": { 338 | "collapsed": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "initial_weights = np.array([0., 0.])\n", 343 | "step_size = 1e-12\n", 344 | "max_iterations=1000" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "#### Learned weights with no regulariztion" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 101, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "Starting gradient descent with l2_penalty = 0.0\n", 366 | "Iteration = 1\n", 367 | "Cost function = 7.43305185103e+15\n", 368 | "Iteration = 2\n", 369 | "Cost function = 5.39426721314e+15\n", 370 | "Iteration = 3\n", 371 | "Cost function = 4.0232377365e+15\n", 372 | "Iteration = 4\n", 373 | "Cost function = 3.10125618392e+15\n", 374 | "Iteration = 5\n", 375 | "Cost function = 2.48124764451e+15\n", 376 | "Iteration = 6\n", 377 | "Cost function = 2.06430807789e+15\n", 378 | "Iteration = 7\n", 379 | "Cost function = 1.78392709737e+15\n", 380 | "Iteration = 8\n", 381 | "Cost function = 1.59537820315e+15\n", 382 | "Iteration = 9\n", 383 | "Cost function = 1.46858399105e+15\n", 384 | "Iteration = 10\n", 385 | "Cost function = 1.38331819148e+15\n", 386 | "Iteration = 20\n", 387 | "Cost function = 1.2115621405e+15\n", 388 | "Iteration = 30\n", 389 | "Cost function = 1.20831376268e+15\n", 390 | "Iteration = 40\n", 391 | "Cost function = 1.20825232625e+15\n", 392 | "Iteration = 50\n", 393 | "Cost function = 1.20825116361e+15\n", 394 | "Iteration = 60\n", 395 | "Cost function = 1.20825114092e+15\n", 396 | "Iteration = 70\n", 397 | "Cost function = 1.20825113978e+15\n", 398 | "Iteration = 80\n", 399 | "Cost function = 1.20825113905e+15\n", 400 | "Iteration = 90\n", 401 | "Cost function = 1.20825113832e+15\n", 402 | "Iteration = 100\n", 403 | "Cost function = 1.2082511376e+15\n", 404 | "Iteration = 200\n", 405 | "Cost function = 1.20825113037e+15\n", 406 | "Iteration = 300\n", 407 | "Cost function = 1.20825112315e+15\n", 408 | "Iteration = 400\n", 409 | "Cost function = 1.20825111592e+15\n", 410 | "Iteration = 500\n", 411 | "Cost function = 1.2082511087e+15\n", 412 | "Iteration = 600\n", 413 | "Cost function = 1.20825110147e+15\n", 414 | "Iteration = 700\n", 415 | "Cost function = 1.20825109424e+15\n", 416 | "Iteration = 800\n", 417 | "Cost 
function = 1.20825108702e+15\n", 418 | "Iteration = 900\n", 419 | "Cost function = 1.20825107979e+15\n", 420 | "Iteration = 1000\n", 421 | "Cost function = 1.20825107257e+15\n", 422 | "Done with gradient descent at iteration 1000\n", 423 | "Learned weights = [ -1.63113501e-01 2.63024369e+02]\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "l2_penalty = 0.0\n", 429 | "simple_weight_0_penalty = ridge_regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, \n", 430 | " l2_penalty, max_iterations)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "#### Learned weights with regulariztion" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 102, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "Starting gradient descent with l2_penalty = 1e+11\n", 452 | "Iteration = 1\n", 453 | "Cost function = 7.43305185103e+15\n", 454 | "Iteration = 2\n", 455 | "Cost function = 5.61830389841e+15\n", 456 | "Iteration = 3\n", 457 | "Cost function = 4.92061327812e+15\n", 458 | "Iteration = 4\n", 459 | "Cost function = 4.65238194261e+15\n", 460 | "Iteration = 5\n", 461 | "Cost function = 4.54925876401e+15\n", 462 | "Iteration = 6\n", 463 | "Cost function = 4.50961239088e+15\n", 464 | "Iteration = 7\n", 465 | "Cost function = 4.49437005028e+15\n", 466 | "Iteration = 8\n", 467 | "Cost function = 4.48850998403e+15\n", 468 | "Iteration = 9\n", 469 | "Cost function = 4.48625698853e+15\n", 470 | "Iteration = 10\n", 471 | "Cost function = 4.48539075267e+15\n", 472 | "Iteration = 20\n", 473 | "Cost function = 4.48484886803e+15\n", 474 | "Iteration = 30\n", 475 | "Cost function = 4.48484788048e+15\n", 476 | "Iteration = 40\n", 477 | "Cost function = 4.48484693108e+15\n", 478 | "Iteration = 50\n", 479 | "Cost function = 4.48484598169e+15\n", 480 | "Iteration = 60\n", 481 | "Cost function = 4.48484503229e+15\n", 482 | "Iteration = 70\n", 483 | "Cost function = 4.4848440829e+15\n", 484 | "Iteration = 80\n", 485 | "Cost function = 4.48484313351e+15\n", 486 | "Iteration = 90\n", 487 | "Cost function = 4.48484218411e+15\n", 488 | "Iteration = 100\n", 489 | "Cost function = 4.48484123472e+15\n", 490 | "Iteration = 200\n", 491 | "Cost function = 4.48483174082e+15\n", 492 | "Iteration = 300\n", 493 | "Cost function = 4.48482224696e+15\n", 494 | "Iteration = 400\n", 495 | "Cost function = 4.48481275314e+15\n", 496 | "Iteration = 500\n", 497 | "Cost function = 4.48480325936e+15\n", 498 | "Iteration = 600\n", 499 | "Cost function = 4.48479376562e+15\n", 500 | "Iteration = 700\n", 501 | "Cost function = 4.48478427191e+15\n", 502 | "Iteration = 800\n", 503 | "Cost function = 4.48477477825e+15\n", 504 | "Iteration = 900\n", 505 | "Cost function = 4.48476528463e+15\n", 506 | "Iteration = 1000\n", 507 | "Cost function = 4.48475579105e+15\n", 508 | "Done with gradient descent at iteration 1000\n", 509 | "Learned weights = [ 9.76730383 124.57217565]\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "l2_penalty = 1e11\n", 515 | "simple_weight_high_penalty = ridge_regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, \n", 516 | " l2_penalty, max_iterations)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "### Plotting the two learned models\n", 524 | "The blue line is for the model with no regularization and the red line is for the one 
with high regularization." 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 103, 530 | "metadata": { 531 | "collapsed": false 532 | }, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "[,\n", 538 | " ,\n", 539 | " ,\n", 540 | " ,\n", 541 | " ,\n", 542 | " ]" 543 | ] 544 | }, 545 | "execution_count": 103, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | }, 549 | { 550 | "data": { 551 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ0AAAEACAYAAABoJ6s/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJztvXt8VdWZ//9+ck5CRo0IXkCIECIEEmKxWsGxhW+KF7Cd\n0bZW5duL2vHb33iZ1ttMFdsp2su30m8dhbZS1NZqq0SrnUn6GkWSCgMzgChqlYCB1ooYCjqiJK2j\nEvL8/jhrH/Y52efk5ORcc57367Vf2Vl7rb2fvc8567PXWs96lqgqhmEYhpELyvJtgGEYhlE6mOgY\nhmEYOcNExzAMw8gZJjqGYRhGzjDRMQzDMHKGiY5hGIaRM1ISHRG5TkS2iMiLIvKgiFSIyCgRWSUi\nnSLypIiM9OVfKCI7RGSbiJzjSz/FnWO7iNzpS68QkWZXZoOITPAdu9Tl7xSRS3zpNSKy0R1bISLh\noT8OwzAMI5sMKDoiMg74CnCKqn4ICAP/G7gJaFfVqcBTwEKXvwG4CKgHzgXuEhFxp1sGXK6qdUCd\niMxz6ZcD+1R1CnAn8H13rlHAN4HTgFnAIp+4LQZud+d6x53DMAzDKGBS7V4LAYe71sRfAV3A+cD9\n7vj9wKfc/nlAs6r2quqrwA5gpoiMBapU9RmX7wFfGf+5HgXmuv15wCpV3a+q7wCrgPnu2FzgMd/1\nP53ivRiGYRh5YkDRUdXdwO3Aa0TEZr+qtgNjVHWvy7MHOM4VGQ/s8p2iy6WNB173pb/u0mLKqOpB\nYL+IjE50LhE5GnhbVft85xqXyg0bhmEY+SOV7rWjiLREJhKp2A8Xkc8D8fFzMhlPRwbOklIewzAM\no4BIZfD9LOAVVd0HICL/CpwB7BWRMaq613WdveHydwEn+MpXu7RE6f4yu0UkBBypqvtEpAtoiiuz\nWlXfEpGRIlLmWjv+c8UgIhZczjAMIw1UNeMv96mM6bwGnC4ilc4h4ExgK9AKXObyXAq0uP1WYIHz\nSJsETAY2uS64/SIy053nkrgyl7r9C4k4JgA8CZztBGYUcLZLA1jt8sZfvx+qWrTbokWL8m5DKdpu\n9ud/M/vzu2WLAVs6qrpJRB4FngcOuL93A1XAIyLyd8BOIh5rqOpWEXmEiDAdAK7SQ3dwNfBzoBJ4\nXFVXuvSfAr8QkR3AW8ACd663ReTbwLNEuu9u1YhDAUS855rd8efdOQzDMIwCJqW5Lap6K3BrXPI+\nIl1vQfm/B3wvIH0zcFJA+vs40Qo49nMiQhWf/kcibtSGYRhGkWARCQqcpqamfJuQNsVsO5j9+cbs\nH55INvvuCgER0eF+j4ZhGJlGRNA8ORIUPT09Pfk2wTCMEqOnp4cNGzZY/RNHSYjO7Nmz7YM3DCNn\n9PT0MHv2bObMmWP1TxwlITpbt26lo6Mj32YYhlEibNmyhY6ODnp7e63+iaMkRKehoYHp06fn2wzD\nMEqExsZGpk+fTnl5udU/cZSEI0F3dzdVVVX5NsUwjBKip6eHjo4Opk+fXpT1T7YcCUpCdIb7PRqG\nYWQa814zDMMwih4THcMwDCNnmOgYhmEYOcNExzAMw8gZJjqGYRhGzjDRMQzDMHKGiY5hGIaRM0x0\nDMMwjJxhomMYhmHkDBMdwzAMI2eY6BiGYRg5Y0DREZE6EXleRJ5zf/eLyFdFZJSIrBKRThF5UkRG\n+sosFJEdIrJNRM7xpZ8iIi+KyHYRudOXXiEiza7MBhGZ4Dt2qcvfKSKX+NJrRGSjO7ZCRMKZeSSG\nYRhGthhQdFR1u6p+WFVPAU4F/gL8K3AT0K6qU4GngIUAItIAXATUA+cCd4mIFzRuGXC5qtYBdSIy\nz6VfDuxT1SnAncD33blGAd8ETgNmAYt84rYYuN2d6x13jkBsASXDKD5s5c3hyWC7184C/qCqu4Dz\ngftd+v3Ap9z+eUCzqvaq6qvADmCmiIwFqlT1GZfvAV8Z/7keBea6/XnAKlXdr6rvAKuA+e7YXOAx\n3/U/nchoW7nPMIoLW3lz+DJY0bkYeMjtj1HVvQCqugc4zqWPB3b5ynS5tPHA6770111aTBlVPQjs\nF5HRic4lIkcDb6tqn+9c4xIZbSv3GUZxYStvDl9SFh0RKSfSivmVS4pfpCaTi9aksoZDyus82Mp9\nhlFc2Mqbw5fBDL6fC2xW1f92/+8VkTGqutd1nb3h0ruAE3zlql1aonR/md0iEgKOVNV9ItIFNMWV\nWa2qb4nISBEpc60d/7n6G37uudx+++0ANDU10dTUlCirYRgFQFVVFevWrSvqlTeLjTVr1rBmzZqs\nXyfllUNFZAWwUlXvd/8vJjL4v1hEbgRGqepNzpHgQSID/+OBNmCKqqqIbAS+CjwD/DuwVFVXishV\nQKOqXiUiC4BPqeoC50jwLHAKkVbZs8CpqvqOiDwM/FpVHxaRZcDvVPUnAXbbyqGGYRiDJK/LVYvI\nYcBOoFZVe1zaaOARIi2UncBFbrAfEVlIxJvsAHCNqq5y6acCPwcqgcdV9RqXPgL4BfBh4C1ggXNC\nQEQuA75OpPvuO6r6gEufBDQDo4DngS+o6oEA2010DMMwBkleRaeYMdExDMMYPNkSHYtIYBiGYeQM\nEx3DMAwjZ5joGIZhGDnDRMcwDMPIGSY6hmEYRs4w0TEMwzByhomOYRiGkTNMdAzDMIycYaJjGIZh\n5AwTHcMwDCNnmOgYhmEYOcNExzAMw8gZJjqGYRhGzjDRMQzDMHKGiY5hGIaRM0x0DMMwjJxhomMY\nhmHkDBMdwzAMI2eY6BiGYRg5IyXREZGRIvIrEdkmIh0iMktERonIKhHpFJEnRWSkL/9CEdnh8p/j\nSz9FRF4Uke0icqcvvUJEml2ZDSIywXfsUpe/U0Qu8aXXiMhGd2yFiISH/jgMwzCMbJJqS2cJ8Liq\n1gMzgJeBm4B2VZ0KPAUsBBCRBuAioB44F7hLRMSdZxlwuarWAXUiMs+lXw7sU9UpwJ3A9925RgHf\nBE4DZgGLfOK2GLjdnesddw7DMAyjgBlQdETkSGC2qt4H
oKq9qrofOB+432W7H/iU2z8PaHb5XgV2\nADNFZCxQparPuHwP+Mr4z/UoMNftzwNWqep+VX0HWAXMd8fmAo/5rv/plO/aMAzDyAuptHQmAf8t\nIveJyHMicreIHAaMUdW9AKq6BzjO5R8P7PKV73Jp44HXfemvu7SYMqp6ENgvIqMTnUtEjgbeVtU+\n37nGpXLDhmEYRv5IZRwkDJwCXK2qz4rIHUS61jQuX/z/Q0EGzpJSHgBuueWW6H5TUxNNTU2Dt8gw\nDGMYs2bNGtasWZP166QiOq8Du1T1Wff/Y0REZ6+IjFHVva7r7A13vAs4wVe+2qUlSveX2S0iIeBI\nVd0nIl1AU1yZ1ar6lnNuKHOtHf+5+uEXHcMwDKM/8S/kt956a1auM2D3mutC2yUidS7pTKADaAUu\nc2mXAi1uvxVY4DzSJgGTgU2uC26/iMx0jgWXxJW51O1fSMQxAeBJ4GwnMKOAs10awGqXN/76hmEY\nRoEiqgP3ionIDOBeoBx4BfgSEAIeIdJC2Qlc5Ab7EZGFRLzJDgDXqOoql34q8HOgkog33DUufQTw\nC+DDwFvAAueEgIhcBnydSPfdd1T1AZc+CWgGRgHPA19Q1QMBtmsq92gYhmEcQkRQ1ZSHMVI+73Cv\nkE10DMMwBk+2RMciEhiGYRg5w0THMAzDyBkmOoZhGEbOMNExDMMwcoaJjmEYhpEzSkJ0enp68m2C\nYQx7enp62LBhg/3ejKSUhOjMnj3bfgiGkUV6enqYPXs2c+bMsd+bkZSSEJ2tW7fS0dGRbzMMY9iy\nZcsWOjo66O3ttd+bkZSSEJ2GhgamT5+ebzMMY9jS2NjI9OnTKS8vt9+bkZSSiEjQ3d1NVVVVvk0x\njGFNT08PHR0dTJ8+3X5vwwALg5MmFgbHMAxj8FgYHMMwDKPoKQnRMU8ao9Qw92WjUCkJ0TnjjDPs\nx2cUJemIh7kvG4VMSYjOli1b2LRpU77NMIxBka54mPuyUciUhOgYRjGSrniY+7JRyJSE91pjYyPr\n1683N06jqPBaOlu3bqWhoYF169al/B0292VjqJjLdJrYPB2jmDHxMPKFiU6amOgYpUhPTw9btmyh\nsbHRvvtGWuR1no6IvCoivxOR50Vkk0sbJSKrRKRTRJ4UkZG+/AtFZIeIbBORc3zpp4jIiyKyXUTu\n9KVXiEizK7NBRCb4jl3q8neKyCW+9BoR2eiOrRCRcCL7zYPHKCXMe80oZFJ1JOgDmlT1w6o606Xd\nBLSr6lTgKWAhgIg0ABcB9cC5wF0i4qnlMuByVa0D6kRknku/HNinqlOAO4Hvu3ONAr4JnAbMAhb5\nxG0xcLs71zvuHIGYB49RSpj3mlHIpCo6EpD3fOB+t38/8Cm3fx7QrKq9qvoqsAOYKSJjgSpVfcbl\ne8BXxn+uR4G5bn8esEpV96vqO8AqYL47Nhd4zHf9Tycy3jx4jFLCvNeMQiZhl1QcCrSJyEFguare\nC4xR1b0AqrpHRI5zeccDG3xlu1xaL/C6L/11l+6V2eXOdVBE9ovIaH+6/1wicjTwtqr2+c41LpHx\ng/H6MYxip6qqinXr1pkDglGQpCo6H1XVP4nIscAqEekkIkR+MumRkMrgVcoDXLfffnt0v6mpiaam\npjRMMozioaqqitNPPz3fZhhFxJo1a1izZk3Wr5OS6Kjqn9zfN0Xk34CZwF4RGaOqe13X2Rsuexdw\ngq94tUtLlO4vs1tEQsCRqrpPRLqAprgyq1X1LREZKSJlrrXjP1c/brnlllRu0zAMo2SJfyG/9dZb\ns3KdAcd0ROQwETnC7R8OnAO8BLQCl7lslwItbr8VWOA80iYBk4FNqroH2C8iM51jwSVxZS51+xcS\ncUwAeBI42wnMKOBslwaw2uWNv75hGIZRoAw4T8cJx78S6T4LAw+q6m1uzOURIi2UncBFbrAfEVlI\nxJvsAHCNqq5y6acCPwcqgcdV9RqXPgL4BfBh4C1ggXNCQEQuA77urv8dVX3AZ1czMAp4HviCqh4I\nsN/W0zEMwxgkNjk0TUx0DMMwBo8t4mYYhmEUPSY6hmEYRs4w0TEMwzByRkmIjsWeMozCwZbSLm1K\nQnQs6KFhFAYWjNQoCdGxoIeGURhYMFKjJETHgh4aRmFgwUiNkpin09XVxbhxCeOBGkbGsUXUEmOr\noRYHNk9nCMybN8/6jo2sET8wbuMWyfGCkZrglCYlITpbtmxh06ZN+TbDGIYECYyNWxhGYkpCdAwj\nWwQJjI1bGEZiSmJMp7GxkfXr11tz3sg4Xktn69atNDQ0RBcMtHELo9ixgJ9pIiLa3d1tP3wja5jA\nGMMRE500sSjThmEYg8e81wzDMIyipyREx1xWjWLGYpUZw4mSEB2bK2EUKzbnxxhulITo2FwJo1ix\nOT/GcKMkRMfmShjFis35MYYbKYuOiJSJyHMi0ur+HyUiq0SkU0SeFJGRvrwLRWSHiGwTkXN86aeI\nyIsisl1E7vSlV4hIsyuzQUQm+I5d6vJ3isglvvQaEdnojq0QkXAi2725E4ZRbFRVVbFu3TrWrl1r\n32NjWDCYls41wFbf/zcB7ao6FXgKWAggIg3ARUA9cC5wl4h4bnfLgMtVtQ6oE5F5Lv1yYJ+qTgHu\nBL7vzjUK+CZwGjALWOQTt8XA7e5c77hzBGI/VKOYsVhlxnAiJdERkWrgE8C9vuTzgfvd/v3Ap9z+\neUCzqvaq6qvADmCmiIwFqlT1GZfvAV8Z/7keBea6/XnAKlXdr6rvAKuA+e7YXOAx3/U/nch+G3w1\nChXzTDNKjVRbOncA/wT4Z1mOUdW9AKq6BzjOpY8Hdvnydbm08cDrvvTXXVpMGVU9COwXkdGJziUi\nRwNvq2qf71wJ1y4wrx+jEDHPNKMUSTgO4iEinwT2quoLItKUJGsmp/2nMgs25ZmyL730Etdddx3V\n1dU0NTXR1NSUvmWGkSGCPNNOP/30fJtllChr1qxhzZo1Wb/OgKIDfBQ4T0Q+AfwVUCUivwD2iMgY\nVd3rus7ecPm7gBN85atdWqJ0f5ndIhICjlTVfSLSBTTFlVmtqm+JyEgRKXOtHf+5+nHSSSdxxx13\nWJ+4UVB4nmlesFDzTDPySfwL+a233pqV6wzYvaaqN6vqBFWtBRYAT6nqF4HfAJe5bJcCLW6/FVjg\nPNImAZOBTa4Lbr+IzHSOBZfElbnU7V9IxDEB4EngbCcwo4CzXRrAapc3/vr9ePzxx01wjITka1zF\nPNOMUmRQAT9F5H8BN6jqeW7M5REiLZSdwEVusB8RWUjEm+wAcI2qrnLppwI/ByqBx1X1Gpc+AvgF\n8GHgLWCBc0JARC4Dvk6k++47qvqAS58ENAOjgOeBL6jqgQCbdcaMGfajNgLxxlW8KNH2PTGMCBZl\nOk1EREOhEP/5n/9p/eUljreqZ2NjY1RYNmzYwJw5c+jt7aW8vJy1a9fm7HsSZI9hFAoWZXoI9PX1\nMXr06HybYeS
RRJ5i+Zrxb55rRqlSEqKjqqxduzbfZhh5xO8p1tHRwaZNm4D8jatYTDWjVCkJ0RER\n5syZk28zjDzS2NjItGnTAOjt7eXaa6+Nti7yMePfYqoZpUrJiM6+ffvybYYxBII8zAbrdfblL3+Z\nsrLIV76zszOvrQvzXDNKlVTm6RQ9fX19VFRU5NsMI02CPMwAzjjjDF5++WWmTZvG+vXrE1bc/vIV\nFRX09vYWROvCa2EZRilREi0dgF/+8pf5NsFIE//4x5YtW9i0aRNPP/00W7ZsiUlLpfzBgwdZtmyZ\ntS4MI0+UjOh86EMfyrcJRpr4x2MOHjzItddey7vvvjuo8v7xk4svvtgExyg5Dh7MtwURSkZ0Tjjh\nhIEzGQVJVVUVd9xxB+FwpDe4s7OTww8/nMbGRkKhEI2NjcycOTNpeRs/MUqRp56CSZNABMJheO+9\nfFtUIpNDGxsbk/b5G4WPNy7jxSnzxnW8cR77bA0j0ppZvhyuvjo2/fLL4bbb4JhjUj+XRSRIExHR\n7u5uq5SGAT09PSYyhhHH/v3wjW/Aj34Um/7//h9ccw2Ul6d3XhOdNBERHe73aBhGabF9O1x1Ffz2\nt4fSxoyBe++Fv/mbzFzDwuAMAQsxUprYqpzGcOLJJ6G6OjI+M3VqRHCammDrVlCFPXsyJzjZpCRE\nx2JblR4W28wodnp74c47IyIjAvPnQ1cXXHEF7NsXEZrVq6G+Pt+WDo6SEJ2Ojg6LbVUg5Kr1kY3Y\nZkOxPRv3PdA5raVXfOzbB3//9xGRKS+H666LpC9ZAgcORIRm2TIYNSq/dg4JVR3WG5F1eHTlypVq\n5Jfu7m6dMWOGhsNhnTFjhnZ3d2f9WuXl5Rm51lBsz8Z9D3TOXD5rY2h0dKjOmaMakZTIdsIJqvmu\nsiLykIU6ORsnLaTNE5358+en89yNDLJ+/XoNh8MKaHl5uW7YsCHlst3d3bp+/fpBV/YbNmzISIU7\nFNuHUjbdc2bjmkbmaG1VPfbYWKE5+2zVzs58W3YIE50hio61dPKPv/XR2NiobW1tgW/o8eJSCG/t\n3d3d2tjYqOFwWBsbG9Nq6fhbXf77TFdQk7XkMt3SM4bGBx+oLl4cKzKg+tWvqr7zTr6tC8ZEZ4ii\ns2zZsnSeu5Fhuru7tb29PVqB+yvEROJSCG/tnuiEQqFBi45X3mt1+e+zsbEx8FkM9pzpHDeyy5tv\nqn7pS/2F5q67VHt7823dwJjoDFF0zjrrrHSeu5EGA725B4lId3e3Ll++PFBcUnmrH2xLYbBkUvj8\n5wqHwxoKhawbbJjwu9+pnn56rMjU1qo+9VS+LRs8eRMdYATwNPA88BKwyKWPAlYBncCTwEhfmYXA\nDmAbcI4v/RTgRWA7cKcvvQJodmU2ABN8xy51+TuBS3zpNcBGd2wFEE5gvwL64IMPZuFjMeJJ1A0V\n353kF5Guri6dMWOGhkIhHTFiRGAXVqK39lx1vWWyuyq+m7GxsdG6wYqUvj7VRx9VHTkyVmg++UnV\nP/wh39YNjby2dIDD3N+Qq+hnAouBr7n0G4Hb3H6DE6iwE4bfcyjywdPAaW7/cWCe278SuMvtXww0\n6yFh+wMwEjjK23fHHgYudPvLgL9PYLsC+uUvfzkbn4sRR1tbm3rPHND29vZAYfCLiP/NH9CysrKU\nu7By2fWWye6q+O426wYrHt57T/U734kVGVC94QbV4fQRZkt0Upqno6peHPkRTkwUOB+436XfD3zK\n7Z/nRKNXVV8l0nqZKSJjgSpVfcble8BXxn+uR4G5bn8esEpV96vqO0RaVvPdsbnAY77rfzrZPYhk\nPJqDkSJBc2b8C5j95S9/Ydq0adEo0n19fWzbtq3fGjlB8068ZQ9CoRBTp06NWZgt0/NUBrOs9UDX\n9p8rH8tlG4Njzx74whci82cqKyOxziASdubgwYjs/OAHYB/hwKQkOiJSJiLPA3uANiccY1R1L4Cq\n7gGOc9nHA7t8xbtc2njgdV/66y4tpoyqHgT2i8joROcSkaOBt1W1z3euccnuIVnoeyNzNDQ0UFtb\nS1lZWXTJgfj1bDxh8KIGnHvuuQD8+te/pt5Nr/bWzfEq7Z6eHs444wzmzJnDGWec0a8yj3+pyFZE\nglSELJPXtgme+eO55+AjH4kIzfHHw4MPwrRpsG7dofbN5ZdDWUlMsc8gg2kWAUcCvwWmA/vijr3l\n/v4Q+Jwv/V7gM8CpRFotXvrHgFa3/xIwznfs98Bo4AbgZl/6N4DrgaOBHb70auDFBDYroMcee6wu\nWrRIV69ePeRmpxGMvxtt8uTJ2tXVFXMsvgupra2tX9dYS0tL4MB6ULedauLutWx0u6U6fpSpaxeC\nq3gp0denumKF6mGHxXabfepTqjt35tu67LN69WpdtGhRdKNQvNeAf3ZisI1IawdgLLDN7d8E3OjL\nvxKY5c/j0hcAy/x59NC40Ru+PD/xlfkJcLHbfwMoc/unA08ksFcBPeOMMzL7CRlRvHGZIBFJVqax\nsTEqIo2NjdrV1dUvzatoE4lOV1eXTp48ud/cn8G4OHd1deny5ctjRDKIVMUkU04H+XQVz4VHYCHw\nP/+j+s1v9h+fWbhQ9c9/zrd1+SVvogMcw6HB+78C1gKfIOJIcKNLD3IkqAAmEetI4DkhCBFHgvku\n/SoOORIsINiRwNs/yh172CdAy4ArEtivgF5wwQXZ+WRKnKA5J6lUtv4KNRQKaXt7ez9XYk9YvOvE\ni4gnOGVlZVpbW6v19fXRVoEnYANN5uzq6tLKykoFtLKyMqnwDEZMMuEckK8JnsO9hdXVpXrRRbEi\nEw6rPvBApLVjRMin6JwEPAe8QMTd+esufTTQTsSVeZUnBu7YQic28S7TpxLpStsBLPGljwAecekb\ngRrfsctc+nZiXaYnEfGG2+4EqDyB/Qro5MmTs/G5lDzxb+Pt7e0pVbaJZukPNB/H7/E1efLkaMsn\nFArF2HH33Xen1EpYvnx5TAvqnnvuGdDuXHqa5cOzrRAm42aajRtVTzopVmhOOkl1GNxa1sib6BT7\n5lUm3/ve99J57sYADOVtPKhC9SIWBIXI8RPvZl1bWxvTyvLm/gxk12BaOqXCcAih09enev/9kRaM\nX2guukj19dfzbV1xYKIzBNEREatMskgqb+PJxgiCJo4O1LXjrxg9p4V4O1JtJXR1dek999yT8nek\nFMY7inHu0F/+EhmLiR+f+eY3Vd99N9/WFR8mOkMQHUCbm5vTee5GigwkKv5xH38rJl5kBuuMkKhi\nzJYwxNvb1dVVMAJUCmIYz86dEe8yv8gcdljEC83GZ4aGic4QReeiiy5K57kbKTBQ6yQo4oDXOgka\nExpq1042B8LjnR0mT55cEAPuw33w38+6dapTp8YKzamnqj77bL4tG16Y6AxRdP76r/86nedupMBA\nA8/+CtE/aO8JT5BDwYYNG9JuRWRzIDy+W69QBtyH4+C/x8GDqvfeGysy
oPqFL6ju2ZNv64YvJjpD\nFJ077rgjnedupEAqA8+eg0BtbW1UdPwRpoMcCtJ9c/fP3cnGW79fFAtlwH04DP77+dOf+osMRGKe\nvfdevq0rDUx0hig6ra2t6Tz3kiSdsYHBDNoHCUL8NeO7se6+++6U7PEq31AoFBgVIdNjHoU04F5I\ntqTDb38bLDSPPmrjM/nARGeIomOLuKXGYFsYqc7mj7+Gv/vMazHER6H20iorKwPtCRKReLFqbW1N\neI18UYoD/olYtChYaB54IN+WGSY6QxSdj370o+k895IjKCpAd3e3trW19Zs7k2yOy0AVq9fi8Vok\niRZvSzTJM5E4epELvM99xIgRGgqFtLa2NiamW6otp0xTSgP+QXzwQf9Jmt62dWu+rTP8mOgMUXSO\nOeaYdJ57yRFfadfX12t9fX30f39ImUSz+ZNVrJ6A+cd2ysrKtLq6OrDLraWlRWtra/sdW79+fVRE\nwuFwzMB5W1ublpWVxdjm3zwhykSl74nrQE4PXr7BuIQPF3btChaZY4+NzK0xChMTnSGKzsiRI9N5\n7iVJW1tbTIXur8D9FXyilk4iTyr/eEu8EIhIzBhMvPjV1tbGtKSStbL8x0Qkxv5QKBT9f6iVvv9+\nEnUB+vMNNj5dOvYUSrfdv/97sNDYWorFg4nOEEXnsMMOS+e5D2sSVVJ+T6iGhgatqanp19LxynZ2\nduo999yjnZ2dSZejjn/LD9r83mzLly+PEadQKBSz0miyFkO86P3whz/UhoYGDYfD2tDQkLDSH0yl\nHWRjIiFLNz7dYCiEbrt//MdgofnVr3JuipEBTHSGKDof//jH03nuw5ZkYyJed1F7e3s0UnNtba02\nNzdrW1tbv0H5RI4Afrfi+Lf8+vr6aLeZv5Xg5Q+FQjpixIgYsYs/V319feDSBUGi50Worq+v15aW\nluhY1UDPY6BnV1lZGW3pJAtUmm135nzM03nvPdUTTwwWmh07sn55I8uY6AxRdKqqqtJ57sOWoEoq\nWUga/+x7zwHAPyifaqvD/5bvCVNnZ6cuXbpUW1pa+l1z6dKlUYGId3LwnAPiu95UY92H4yMiBAmt\nv9UyUKU6jyQcAAAgAElEQVQdf09e3LZkrZdsuzPnap7OK68Ei8zEiZG1aYzhg4nOEEWnoaEhnec+\nbAmqpOLXuGlubo4RGn9F659rkyiqQFtbm9533306ZswYLSsri7ZW/N1wbW1tMY4K9fX1Sbu/gqIB\neNENEnWVBY0lBQltsnGZgZ5dIZAtYfv1r4OF5pprMnoZI8sMdszPRGeIojNlypSUHnQpETRfJt7d\nuKysTMePH6+bN2/u12UVFNG5q6urn5D4PwP/wmrevj+Pt6BbUOXpiVR7e3vU5drfelmyZElg958n\nPF53YSKh9VotqY7pZKvlUggOAVdeGSw0v/lN3kwyhkA6Y34mOkMUnXA4POBDLkXiv4wtLS2Bg/3H\nHnusrl27VpcsWaIrVqzoFyk6fgJmkKOA35PMv+iaf6uvr49pBfmXn47/0XjCEw6HY8Z/amtrE3b3\nxYtFfOvJ7z2Xj4o/Xw4B776rOmZMsNDs3JkTE4wsks6Yn4nOEEWnvLx8wIdciiSL8lxeXp7Q0yxo\ncN8/1hO0+YXB60aLjz7tjd/4W1wNDQ0JvdW8CaT+63pRrFPt/vJPVE3kGJErcukQ8PLLwSLT0KD6\n/vtZu6yRB9LpEjbRGaLoVFRUDPiQS5GgL6PXFVVXV5dUdLyYaP5K0pvk6blah0IhHTt2rH7729/u\nF+mgq6tLlyxZovX19f26vOInd7a2tgb+aLwJpNXV1THi1tnZmXL3V3xFn+pS19kg2+NFDz0ULDQ3\n35zRyxgFyGC7hE10hig6M2bMSOlBlyLx4zHeFt/9JSIJWzqe67N/rRz/lzzIjdnv/ux3Ye7u7o6J\nWABEj8d3jflbREFOAql0kSWyLV+OApkcL+rrU73kkmChefLJDBhrDFvyJjpANfAU0AG8BHzVpY8C\nVgGdwJPASF+ZhcAOYBtwji/9FOBFYDtwpy+9Amh2ZTYAE3zHLnX5O4FLfOk1wEZ3bAUQTmC/Avrh\nD384G5/LsCGoS8s/p2bp0qW6efNmra2t1bKyshg35fgurkStA68FFS9qQfm7urqiLtHx83A84l2h\nvc2/Tk+qXWRBglasEZv37QsWGVC1VduNVMmn6IwFTnb7R7jKfxqwGPiaS78RuM3tNwDPA2EnDL8H\nxB17GjjN7T8OzHP7VwJ3uf2LgWY9JGx/AEYCR3n77tjDwIVufxnw9wnsV0Dnzp2blQ9muOCPZebv\nAvNXvMninSXrFop3NPDm1sR3q8XT1dWld999d8II1kEtHU8Mh/OiZkGsWBEsMqedpnrgQL6tM4qR\nguleA/4NOAt4GRijh4TpZbd/E3CjL/8TwCyXZ6svfQGwzO2vBGa5/RDwRnwePSQuF7v9N4Eyt386\nsDKBvQrookWLMvqBFDKJupWC0v2C4K/Ag2b5t7S0RJ0BgqJKt7W1aXNzsy5ZsiSmFeR3NPCP1dTU\n1ES7zfx2DGYpAq/11NraGh0n8pdPFiUg327JQ+WII4KFZt68fFtmDAcKQnRcy+VV1+J5O+7YPvf3\nh8DnfOn3Ap8BTgVW+dI/BrS6/ZeAcb5jO4DRwA3Azb70bwDXA0cD233p1cCLCWyOVqLDHa/i97zC\n4gfcE61Z4y0v0NnZqe3t7TGVt9812S8YiZYZ8MZ9KioqdPPmzbpkyZKY1tHYsWNjWibXXXeddnZ2\nRsPUJFqKoL29fUCRiL/HRFEC0nVLzodQ+a/Z1xcsMhBZUdMwMkneRccJzbPA+eoTGd/xtzRzovP7\nFERnhy99QNHxWjurV6/O5OdSMPgr0iBhCOpuivcSq62t7TcnprGxsd+gfnxlnWhsJch92u9l5m0V\nFRX90vxbXV1dwjhrflLtUkun6y0f82e6u7t12rS/SSg0mzYNjxabURisXr1aFy1aFN3yKjpExmdW\nAtf40rYR2722ze3Hd6+t5FD32jZfeqrdaz/xlfkJh7rX3iC2e+2JBLZHK6/hTHzFHx+TzBv/8Ffc\nnZ2d/YShtbU1ZmmDUCgUk6e2tjbaJeatGtrZ2RntykomHqlu8e7Sxx9/fMz/7e3t/e7f38obyOss\nkZt4sso7FaHKlABcfXXiFs0HH/S/j1JdEM7ILvkWnQeAf4lLW+yJC8GOBBXAJGIdCTYCMwEh4kgw\n36VfxSFHggUEOxJ4+0e5Yw/7BGgZcEUC20tCdPwVaXx0Zq9S9USnpqZGV6xYEbNkgbdNmDBBGxoa\nov/7Y6FVV1drZ2enqkYG+b3rVFRUaHNzs7a3t+uPf/zjIQnOhAkTtLm5OaZ1Fd+Kihcdf+VbX18f\nM6YU9Jy8MZ94d+5klfdA82eGKgCJRAY04TVLzVnCyC15Ex3go8BB4AUnJs8B84l0f7UT8WZb5YmB\nK7PQiU28y/SpRLrSdgBLfOk
jgEdc+kagxnfsMpe+nViX6UlEvOG2OwEqT2B/SYiOavDsfG8dmkRd\nYPGbiMQE/Wxtbe03Y7+7u1uXLFnSr3XS2NioK1asGJLoiIg2NjZqZ2dnNKqAt4yBF1k6XlAGiiLt\nfz5BwpBs0Tl/yyXIrdo7HhTZIVmrp7c3sch8/vOxNidy3c72RFKjtMmb6BT7VkqioxqpiIKWl+7q\n6gocT4nf6uvrY1o6jY2NgXNqWlpaErZUvP0RI0ZoOBxOGBqnrKxMjzvuuMBj8ZNB/c4M/hhp3j3H\nR5H2BNM73tbWpkuWLEm6oml8d1uylkt8d6XfW84fzNRfduPGxELzzDPpf97FOp/IKGxMdEx0+pHI\nBdqrDL1WgVeBlpWVxYy71NXVaXNzs37/+9+Pdo95FXS855hXidbU1GhLS4t2dXXp1KlTkwrY8ccf\nr+PGjUurxdPa2hpzb/FOD0FLGbS3t+u0adNiRM/zjPOnBYXSaWtri45nxbdcvDlLftra2hKKpF+k\nRR5OKDS9vTn4khhGmpjomOjEEOQeHP8271WW/kmdZWVlOnbsWC0rK9OpU6fGvJF7btKdnZ1aWVmp\nEJmP09nZ2W+pgtraWv3Zz37Wb9A/2bZgwQKtq6tLGhQU0HHjxum4ceNiuvO6urpivNzKysp06dKl\n/YTnmmuuiTnXTTfd1K8F5F++wP8cPXdtr+WSbN5SkOh4JBKZYfYVNIY5JjomOjHEjyHEuzX7u8v8\nIhLvnuyJhncOr4XkpZeVlemNN97YL+6aV2YwrZdQKKR1dXU6atSopHmCIiMsXrw4MH9DQ0NM95t/\nrlBQSydePBKNdXnhehKF9vFalOFwWBsaTk4oMldcYeGajeLERMdEJwb/OESyJQVCoZB+61vfiopI\nfAXrRYIePXp0oKCIiIZCoUDRycU2btw4nTJlStI8ra2tMQu6eWLpX5snaNKrasQLzxPkeNFJNlD/\n298GiwyodnTk61thGI6331Z95BHVSy9VPeaYQ1/OQawpbqJjotMPLzZZ/Nt8ssmWJ554YkwLZeLE\niSl5tWVzO/rooxMKZiou2EuXLg0U3aDWSbxzQHzMufgJqP6B+rlzEwtNX1++vgVGyfLnP6u2tqpe\ncYXqhAmJv5ygOmOG6tKlgzq9iY6JTozjQNCYjv9tfunSpYGtk6qqqiGLxGDGcVLZQqGQVlRUqIhE\nl8j2vN4SOSJ4xz33aq+1Ul5erhMmTAhsnQQ5B/hbM7W1tdra2hpTJtnv2DCyzvvvq7a1qV57rWpd\nXfIvZF1dJF9bm+p77w350iY6JS468SKTbGmA7u5uXbFiRcYiBGRqi+/C828iotdff712dnbGLKLm\niZLnALFs2TJdunRpzCJtbW1tMULoxZKLn8/jjcN4+fzu5F4XZWPjRxP+pm+6aeg/ZMPox4EDqv/5\nn5GV9GbMSC4sEyZEWjatrZGWThYx0Slx0Um2rHRDQ0N0Fn78PB2vZTJy5MiUWhypiEc64zuVlZW6\ndu3aAVtJU6ZM0cWLF/eLluBFQwjy2PPPK/K2RDP0W1paYpwnNmzYoFdf/WqS3/mhcweF3/EzUBgc\ni5NWwvT1qW7erPqtb6mefnpyYTnmmMhYzCOPRMZm8oSJTomKjj9sS9AExtbW1phwNEFjILW1tXrz\nzTcPKAzl5eVZcxi46667dMWKFSmfPxwOx0wcDYVCunTp0phoCTU1NTEi5kUsSBaq5lBLJ/Fv3iuX\nzC066HMaaDKpxUkb5vT1qW7dqvqDH6h+/OPJheWII1Qvukj15z9X3bMn35YHYqJTgqITX1F1dnbq\n0qVLtaWlJVppLV++PKZijG+tlJWV6bHHHjskwfDHchvKNlhBO/roo2MiHDQ0NGhdXV1CkUq2nIFq\n8jogaGZ/UJDURAwUB83ipA0jXnlF9Uc/Uv3EJ5J/qcrKVP/2b1V/8hPVnTvzbfWgMdEpAdGJ736J\nH/j2z8XxR4rOt/dZNrdjjjkmxt07kWNB/ERRVdVduxLXB0cd9Xi0e86LQpDoM0klzEyqAUEtTlqR\nsHu36k9/qvqZz6hWViYXl7POUv2Xf1F1wXCHCyY6w1x0grpfurq6ohM1vSgC3r143U0NDQ0Z9ybL\n1ea5dofD4RhxjW+thUIhDYfDOn78+ECB9dYBUlW98MLEdcNzz/VoW1tb1MMv0aTRdMdeBhIoi5NW\nYLz1lupDD6l+4Quqo0YlF5aPfUz1//5f1RdeKBn/eBOdYS46QY4C/kpxOGzJAn/68wSJqDdBNb6r\nL+JwkLiuSBTA0+/959ngCULQCquJRMicAwqc7m7VX/9a9f/8H9Vx45ILyymnqP7zP0cis1pgPBOd\ntG+wgEUnaN6N1/0S7wZcTNu4ceP6LUud6lZXV5cwpI8nDhHxSlx3JFol1b9sdnt7e4yXnxcyZ/ny\n5YFegkFu2OYcUCD8z/+orlyp+pWvqNbWJheWhgbVf/xH1aeeisyBMRJiojPMRMcfu8sLMNnS0qJL\nly6Nuj77B9EHsyWbD5OrLdGSBQNtS5cu1c2bN/drFZWXn5qkLvltdCmDIE8/79l6raTKysroeI5/\nhVQv9lxlZWWM+Ptt8Ue3TnUdHiMDfPCB6po1ql/7mmpjY3JhqamJLL/6+OOq776bb8uLFhOdYSI6\n3mTG+ACW/nkp3pIE3/rWtwZdaZeVlelXvvKVvItOOltZWZmuXLnSN6fozYT1yjHHzOpX3osS7fdi\n27x5s1ZXV8cIhycQ8ZNFvS0cDkcjUXd3d8fEdRvqOjxGEg4cUP3xj5MLireNHav6d3+n+uijqvv3\n59vyYYmJzjAQnUSVnFfhxotQui2dYt2OOuqopPVM0GJtQZsnDPHLIQAxyyWoar+xnfjjqhqNWOBv\nOfm7Rf3OAeYaPQB9faoPPjjwwL23ffzjqr/8peqbb+bb8pLDRGcYiE78QmQQmbtSV1enEydOTFiJ\nFuvYTupbsnonkmft2rUJlyGoqanR8vJyraysjJnT9PnPfz4m37hx46Lx1jz8rZWGhoZo997y5cu1\ns7Ozn7h4XXcDTQJN5jqdStdb0XfRPfGE6qRJqQkLqM6apfr00/m22vCRN9EBfgrsBV70pY0CVgGd\nwJPASN+xhcAOYBtwji/9FOBFYDtwpy+9Amh2ZTYAE3zHLnX5O4FLfOk1wEZ3bAUQTmJ/wYhOd3d3\nv/AuY8eO1YaGhuiqnEGtm1TD0xTP9ukk9c9jgWVCoVA0DE55eXm0K62hoUFbWlqibtAbNmyICQDq\nbRUVFdrp5lHEV+je0geNjY1aVlYWncTqLevgF49UWjKJXKNT7Xrz5wuKIVcwbNyoetppqQvLlCmR\nAX+jKMin6HwMODlOdBYDX3P7NwK3uf0G4Hkg7ITh94C4Y08Dp7n9x4F5bv9K4C63fzHQrIeE7Q/A\nSOAob98dexi40O0vA/4+if0FJTr+5ZS9ytRfiV111VUxM/eHTysnWX2UmqfbxRdfrD/7
2c90yZIl\n2tnZGbOMtr/ba8mSJTHlvvjFL0Yr7kQVf6JWVLy4DGWSZ6pdb/G2xC/NnVO2blU9++zUheWYY1Qf\nfrhk5rIMZ/ImOpFrMzFOdF4Gxrj9scDLbv8m4EZfvieAWS7PVl/6AmCZ218JzHL7IeCN+Dx6SFwu\ndvtvAmVu/3RgZRLb8yY6/rhp69ev15aWln4VWk1NjTY0NETf3vMvDrkSmoHLJ2vh1dfXx3iWea2C\ncDis9fX10Wc5YsSImJZCMo8zb7zI39IJapWkO8kzVcGKd14Ih8PZHRt67TXVBQtSFxYR1bvuigz8\nG8OWQhOdfXHH97m/PwQ+50u/F/gMcCqwypf+MaDV7b8EjPMd2wGMBm4AbvalfwO4Hjga2O5Lr/bb\nFmB7XkTHX4l5Yw1B809ERCdNmqTnnXde3lbnzNwmQxYa/5ZsWWtAm5ubY5bh9keP9oKBequH+j+X\nRPHUPDHp7OzUe+65J2b5hEx+L1I5p3/57Yx4wf33f6teeWXqwgKq3/2uuRyXMNkSnTCZQTN0HgDJ\nUJ688vTTT7NlyxYOHjzIwYMHAdi5c2e/fKrKH//4R/74xz/m2sQMcRXw4wTHHgc+mfaZ33777aTH\nX3vtNQ4cOABEnmNtbS07d+5k4sSJvPLKK/T19bFjxw46OjqYPn06W7ZsYeLEidHy7777Lj09PVRV\nVQFQVVXF6aefDkBdXV3adifDf41kjBs3jueeey5qu2djUt5+Gz75SdiwIXWDrr8evvlNGDky9TKG\nMQTSFZ29IjJGVfeKyFjgDZfeBZzgy1ft0hKl+8vsFpEQcKSq7hORLqAprsxqVX1LREaKSJmq9sWd\nKym33HILTU1NNDU1DZg3XXp6enj66af5yle+EhWb8vJy+vr6OO644/jTn/6UtWvnjmTvGKOB5GKR\nLmVlZfT19QEwbdo07rvvvugzPuGEE3j44Yf5r//6L4444giuvPJK3n//fcLhMKNHj2b27Nl0dHQw\nduxYdu/eTV9fH6+88gqnnXYazzzzDOPGjQu8Zk9PD1u2bKGxsTG1ij+DBApUTw8ceeTgTvSlL8F3\nvwvHH58544xhx5o1a1izZk32L5RKc4iIU8BLvv8X48ZuCHYkqAAmEetIsBGYSaSV8jgw36VfxSFH\nggUEOxJ4+0e5Yw9zaHxnGXBFEtuz3r3mTfj82c9+pmPGjOnXDeTFEzvmmGMKoAss3S1z3Wbpbv7o\nAf61dSDicBE0JlZeXh64Eqk/jz9gaPznmouJnv3coz/4YHDdYN72xBNZsc8oTchS91oqgvMQsBt4\nH3gN+JITgXYirsyrPDFw+RcSEZt4l+lTiYzf7ACW+NJHAI+49I1Aje/YZS59O7Eu05OIeMNtJyJA\n5Unsz6roJJvwWdxbRUEIjbdNmDAhOhYzfvx43bx584ATRf1zdqqrq6Pp8WNnoVAocKA+qxM9+/oi\nnl6DFZbvfCdzNhhGEsiX6BT7lm3R8S9/XPzbE0nquyfzatvxxx+va9eujVkOYe3atTFu0/6WTn19\nfXTuzowZM6KeaF6sO/9icIkWaMvIGjjz5w9eWObMCbSlqCeLGkWHiU4Bik5QmJXi25LVf4cXgH2H\ntvjuyXA4HA2O6kULaG9vj0Yd6O7u1uXLl8e8FFRXV0fLxOcNqtRT8jb72tcGLyxHHqna15eSsFk8\nNyMfmOgUoOjET0Isni1ZfZhv2xJvXoQAf9o999wT+Nn4K2r/i0FQN1lKlfq99w5eWED1vfcG/B4N\nJGwWz83IByY6BSQ6XV1dumTJEp00aVLeK+LUtmOKVmj8W21trf74xz+OCo+3REEQ8Ut9V1dXJ2xN\neHkXpCMqEJkDk0VsqWsjH2RLdDzPsmGLGzQGIBP3un37dhobG6PzQwqXZPe6GpibK0NiGDlyJBUV\nFbz55psplwmHw0yYMIERI0awY8cOpkyZwpVXXskFF1yQ0NV59+7dnHjiibz33ntUVlbyu9/9jn37\n9vGh/fs5bP78wRu+fTtMmTL4chmip6dncHN2DGOIiAiqmvE5kSY6g6Cnp4epU6cW8FybZPdXDvTm\nypCMUV9fzw9/+ENUlXPPPZfe3l7Ky8tZu3ZtdA5Lv7k027fD1KmDvtaOb3+bsddcY5W6YZA90clU\nRIKS4Omnn2bPnj35NiOOZEJT8IEbkjJhwgRuu+02Zs6cCcD06dPZunUrU6dO5b09e0Ai91cF/HWq\nJ73hBvjBDwIP5a8dYxilg7V0BsB7iz766KOZO3cuXV0pBT/IIvXA1iTHi1towuEwvb2RFlkZcDCd\nk4wcCe+8Q09PD5s2beLaa6/l5ZdfZvr06axbt85aMoaRAtlq6ZRl+oTDiZ6eHmbPns2cOXOYPn16\nHgXHP54eLzi/JCI03lZ8+O/uQG9vdH8gwakoL+e37e2cPGMGFeXlnDxjBj3d3fDOO0AkjMxhhx3G\ntm3b6O3tpaOjg46Ojqzei2EYyTHRScLq1at56aWX6O3tjb595w5/VRyPX2S+mEuj0iaZW1oyykMh\nTp4xg/a2NsrD4Zg7nzp1KjNnzmTdunWsXbs2sBUzceJEysvLgUMOCYZh5A8TnQTs3r2bCy+8MBpg\nMjekKjSFSbrCUkns3VWOGEF5OMxJjY2sfPJJ1q1bx6xZs6ipqYmWKSsr48477wRgy5YtCb26du7c\nGX1hOHjwIK+99trQbtIwjCFhopOAX/7yl3zwwQdZvkoTxSY06QrLZGIFZfKJJ0bv7H1fvlAoxK9+\n9SvWrVvH+vXrOfPMM6mqqqKqqor/+I//YPLkyYTDYU466STq6+uj3Z+zZ8+mp6en33UbGxuZPn06\n5eXlNDQ0MH369Aw8BcMw0iYbk38KacNXLw6EFwpl7dq1WZzkmGye4TdyPuEyaHsozUmSfxt3njlz\n5sT8f/XVVydcEhpIKcyLf/Z+qjP1013p0zBKGVdnZr5OzsZJC2lLVXSyGy06WV2dH2G5Pk1heXQQ\n17jsssuiIWgqKiq0s7MzGm4mPu8JJ5wQjYOWKjZT3zCyR7ZEx7rXHN5Kn5nDX6fGk5tus4+QWBFu\nH6CsJNg+m+K1RYR58+ZFF1nr6+tj3759rFu3jpUrV1JfXx/NO2HCBDZu3BjtSkuVqqqqpE4EhmEU\nHiU/T8db6XPNmjV897vfHcKVLgaak1kyhHMnpgroTrNspi0KhULccsst7Nu3jyuuuILjjz+e2bNn\ns3XrVhoaGmKEwZtDAzBz5kwTDMMoMCwMTpokE52enh5mzpzJyy+/nObZkz27i4msTZcZ0v2UstWW\n+vKXv8wVV1zBBRdcwKuvvgpEBu3Xr18fIyAWM8wwihMTnTRJJjqtra2cf/75gzxjsuc1tM+n0IQl\nEXV1dTz77LNUVVVZi8UwhikWey0LeG/oA5M5oUlXWEYA2XTgvv7669m9ezeXXXYZ//AP/8Crr77K\n2LFjeeihh+js7GTOnDns2rULiBWXqqoqzjz
zzCxaZhjGcKJkWzo9PT2sXr2az372swHLFNQAf0xw\nxm1AQ9JrpvtE64F0O/oGQ1lZxH/kxBNP5Ktf/Sqf+cxnYpYIsC4xwzCsey0AEZkP3ElkkutPVXVx\nQJ5+ouPFVOvo6KCuro4TTzyR3/zmaOC+BFc6Dohd/2UTcFoaNv9/wD1plBuIRYsWcdddd/Hmm28y\nevRoPvvZz3LDDTcAcN9993HhhRfy9ttvA5HlAl577TUTFcMwEmKiE4eIlAHbgTOB3cAzwAJVfTku\nXz/R2bBhA3PmzKG3tx54MdEVOCnJ0WT8B5FYA5nirLPOYtasWYwfP54XXniBkSNHUlVVxYQJE+jp\n6Ym2VAqthbJmzRqamprybUbamP35xezPLzam05+ZwA5V3QkgIs3A+aTQQ9XY2MiIERvo7f0IFbzP\na0xgDG8M6uLrgY+mYbTHlClT6Orq4t1330VEmDVrFocffjjV1dWcc845/OhHP6KqqoqamhqWL1+e\n0jmrqqqiC5sVAsX+ozP784vZPzwpZtEZD+zy/f86ESEakKqqKvbMv4MjHnsoab4bge+nbR4cfvjh\nfOMb3+Avf/kL27dvZ//+/XzkIx/hqquuGrBl8rnPfQ6AW265ZQgWGIZhFBbFLDqDYu3atTH/H/H1\nf4S/PYc/f+xjfOyCC6IrUt5www2sX7+eo446igceeAD27u13rqqqKmbOnMkpp5xCQ0MDu3bt4oMP\nPqCiooKTTz6ZU089NaUxk0JrmRiGYWSbYh7TOR24RVXnu/9vIhIraHFcvuK8QcMwjDxjjgQ+RCQE\ndBJxJPgTEYey/62q2/JqmGEYhpGQou1eU9WDIvIPwCoOuUyb4BiGYRQwRdvSMQzDMIqPYbu0gYjM\nF5GXRWS7iNyYb3s8RKRaRJ4SkQ4ReUlEvurSR4nIKhHpFJEnRWSkr8xCEdkhIttE5Bxf+iki8qK7\nxztzeA9lIvKciLQWoe0jReRXzp4OEZlVZPZfJyJb3LUfFJGKQrZfRH4qIntF5EVfWsbsdfff7Mps\nEJEJObD/+86+F0TkMRE5spjs9x27QUT6RGR0Tu3PxiI9+d6IiOnvgYlAOfACMC3fdjnbxgInu/0j\niIxLTQMWA19z6TcCt7n9BuB5Il2hNe6+vBbq08Bpbv9xYF6O7uE64JdAq/u/mGz/OfAltx8GRhaL\n/cA44BWgwv3/MHBpIdsPfAw4GXjRl5Yxe4Ergbvc/sVAcw7sPwsoc/u3Ad8rJvtdejWwkki8r9Eu\nrT4X9mf9R56PDTgdeML3/03Ajfm2K4Gt/+a+xC8DY1zaWODlINuBJ4BZLs9WX/oCYFkO7K0G2ogE\nXfBEp1hsPxL4Q0B6sdg/DtgJjHIVQ2sxfHeIvPz5K+2M2Uuk4pzl9kPAm9m2P+7Yp4BfFJv9wK+A\nk4gVnZzYP1y714Imjo7Pky0JEZEaIm8hG4n8CPcCqOoeIgHfoP+9dLm08UTuyyNX93gH8E/ExjUt\nFtsnAf8tIve57sG7ReQwisR+Vd1NZNHX15wt+1W1nSKx38dxGbQ3WkZVDwLv+LuLcsDfEXnzj7HF\nUZD2i8h5wC5VfSnuUE7sH66iU/CIyBHAo8A1qvpn+genLjgPDxH5JLBXVV8g+ZoOBWe7IwycAvxY\nVVTp2ZMAAAJHSURBVE8B/kLk7a7gnz2AiBxFJNTTRCKtnsNF5PMUif1JyKS9OVteSkS+DhxQ1RWZ\nPG0Gz9X/5CJ/BdwMLMrWJQbKMFxFpwvwD2hVu7SCQETCRATnF6ra4pL3isgYd3wsRIPBdQEn+Ip7\n95IoPZt8FDhPRF4BVgBzReQXwJ4isB0ib2i7VPVZ9/9jRESoGJ49RLrSXlHVfe6t8l+BMyge+z0y\naW/0mETm7h2pqvuyZ3oEEbkM+ATwOV9yMdh/IpHxmt+JyB+dLc+JyHEkrjczav9wFZ1ngMkiMlFE\nKoj0Qbbm2SY/PyPSR7rEl9YKXOb2LwVafOkLnJfIJGAysMl1S+wXkZkiIsAlvjJZQVVvVtUJqlpL\n5Jk+papfBH5T6LY7+/cCu0SkziWdCXRQBM/e8RpwuohUuuueCWwtAvuF2DfgTNrb6s4BcCHwVLbt\nl8iSKv8EnKeq7/vyFbz9qrpFVceqaq2qTiLyIvZhVX3D2XJx1u3P9KBVoWzAfCKeYTuAm/Jtj8+u\njwIHiXjUPQ8852wdDbQ7m1cBR/nKLCTiSbINOMeXfirwkrvHJTm+j//FIUeCorEdmEHkpeQF4NdE\nvNeKyf5FzpYXgfuJeGcWrP3AQ0SWHnmfiGh+iYgjREbsJbKo7iMufSNQkwP7dxBx6HjObXcVk/1x\nx1/BORLkyn6bHGoYhmHkjOHavWYYhmEUICY6hmEYRs4w0TEMwzByhomOYRiGkTNMdAzDMIycYaJj\nGIZh5AwTHcMwDCNnmOgYhmEYOeP/B6B1gHehiEZaAAAAAElFTkSuQmCC\n", 552 | "text/plain": [ 553 | "" 554 | ] 555 | }, 556 | "metadata": {}, 557 | "output_type": "display_data" 558 | } 559 | ], 560 | "source": [ 561 | "import matplotlib.pyplot as plt\n", 562 | "%matplotlib inline\n", 563 | "plt.plot(simple_feature_matrix,output,'k.',\n", 564 | " simple_feature_matrix,predict_output(simple_feature_matrix, simple_weight_0_penalty),'b-',\n", 565 | " simple_feature_matrix,predict_output(simple_feature_matrix, simple_weight_high_penalty),'r-')" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "### RSS function" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 104, 578 | "metadata": { 579 | "collapsed": false 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "def RSS (predicted_output, true_output):\n", 584 | " residuals = predicted_output - true_output\n", 585 | " residuals_squared = residuals * residuals\n", 586 | " residuals_sum_of_squares = residuals_squared.sum()\n", 587 | " return 
residuals_sum_of_squares" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "### RSS on the TEST data: (Simple model)\n", 595 | "1. The initial weights (all zeros)\n", 596 | "2. The weights learned with no regularization\n", 597 | "3. The weights learned with high regularization" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 105, 603 | "metadata": { 604 | "collapsed": false 605 | }, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "For simple model and initial weights:\n", 612 | "Weight (Coefficients): [ 0. 0.]\n", 613 | "RSS: 1.78427328252e+15\n" 614 | ] 615 | } 616 | ], 617 | "source": [ 618 | "predictions = predict_output(simple_test_feature_matrix, initial_weights)\n", 619 | "print \"For simple model and initial weights:\" \n", 620 | "print \"Weight (Coefficients): \" + str(initial_weights)\n", 621 | "print \"RSS: \" + str(RSS(predictions, test_output))" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 106, 627 | "metadata": { 628 | "collapsed": false 629 | }, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "For simple model and weights with no regularization:\n", 636 | "Weight (Coefficient): [ -1.63113501e-01 2.63024369e+02]\n", 637 | "RSS: 2.75723634598e+14\n" 638 | ] 639 | } 640 | ], 641 | "source": [ 642 | "predictions = predict_output(simple_test_feature_matrix, simple_weight_0_penalty)\n", 643 | "print \"For simple model and weights with no regularization:\"\n", 644 | "print \"Weight (Coefficient): \" + str(simple_weight_0_penalty)\n", 645 | "print \"RSS: \" + str(RSS(predictions, test_output))" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 107, 651 | "metadata": { 652 | "collapsed": false 653 | }, 654 | "outputs": [ 655 | { 656 | "name": "stdout", 657 | "output_type": "stream", 658 | "text": [ 659 | "For simple model and weights with regularization:\n", 660 | "Weight (Coefficient): [ 9.76730383 124.57217565]\n", 661 | "RSS: 6.94642100914e+14\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "predictions = predict_output(simple_test_feature_matrix, simple_weight_high_penalty)\n", 667 | "print \"For simple model and weights with regularization:\"\n", 668 | "print \"Weight (Coefficient): \" + str(simple_weight_high_penalty)\n", 669 | "print \"RSS: \" + str(RSS(predictions, test_output))" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "### Which weights perform the best?\n", 677 | "#### The weights with no regularization seem to perform the best for now!" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "### Multiple regression with L2 penalty (Regularization)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "Let us now consider a model with 2 features: `['sqft_living', 'sqft_living15']`." 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "First, create Numpy versions of your training and test data with these two features. " 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 108, 704 | "metadata": { 705 | "collapsed": true 706 | }, 707 | "outputs": [], 708 | "source": [ 709 | "model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
\n", 710 | "my_output = 'price'\n", 711 | "(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)\n", 712 | "(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "We need to re-inialize the weights, since we have one extra parameter. Let us also set the step size and maximum number of iterations." 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 109, 725 | "metadata": { 726 | "collapsed": true 727 | }, 728 | "outputs": [], 729 | "source": [ 730 | "initial_weights = np.array([0.0,0.0,0.0])\n", 731 | "step_size = 1e-12\n", 732 | "max_iterations = 1000" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "#### Learned weights with no regulariztion" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 110, 745 | "metadata": { 746 | "collapsed": false 747 | }, 748 | "outputs": [ 749 | { 750 | "name": "stdout", 751 | "output_type": "stream", 752 | "text": [ 753 | "Starting gradient descent with l2_penalty = 0.0\n", 754 | "Iteration = 1\n", 755 | "Cost function = 7.43305185103e+15\n", 756 | "Iteration = 2\n", 757 | "Cost function = 4.0567523315e+15\n", 758 | "Iteration = 3\n", 759 | "Cost function = 2.52956511433e+15\n", 760 | "Iteration = 4\n", 761 | "Cost function = 1.83855669428e+15\n", 762 | "Iteration = 5\n", 763 | "Cost function = 1.52567557521e+15\n", 764 | "Iteration = 6\n", 765 | "Cost function = 1.38378949867e+15\n", 766 | "Iteration = 7\n", 767 | "Cost function = 1.31923260628e+15\n", 768 | "Iteration = 8\n", 769 | "Cost function = 1.28964887203e+15\n", 770 | "Iteration = 9\n", 771 | "Cost function = 1.27588472408e+15\n", 772 | "Iteration = 10\n", 773 | "Cost function = 1.26927880758e+15\n", 774 | "Iteration = 20\n", 775 | "Cost function = 1.25781238632e+15\n", 776 | "Iteration = 30\n", 777 | "Cost function = 1.25195457127e+15\n", 778 | "Iteration = 40\n", 779 | "Cost function = 1.24675542316e+15\n", 780 | "Iteration = 50\n", 781 | "Cost function = 1.24213950875e+15\n", 782 | "Iteration = 60\n", 783 | "Cost function = 1.23804140114e+15\n", 784 | "Iteration = 70\n", 785 | "Cost function = 1.23440301346e+15\n", 786 | "Iteration = 80\n", 787 | "Cost function = 1.23117277498e+15\n", 788 | "Iteration = 90\n", 789 | "Cost function = 1.22830490006e+15\n", 790 | "Iteration = 100\n", 791 | "Cost function = 1.22575873926e+15\n", 792 | "Iteration = 200\n", 793 | "Cost function = 1.21173888142e+15\n", 794 | "Iteration = 300\n", 795 | "Cost function = 1.20747308096e+15\n", 796 | "Iteration = 400\n", 797 | "Cost function = 1.20617512577e+15\n", 798 | "Iteration = 500\n", 799 | "Cost function = 1.20578019023e+15\n", 800 | "Iteration = 600\n", 801 | "Cost function = 1.20566001447e+15\n", 802 | "Iteration = 700\n", 803 | "Cost function = 1.20562343925e+15\n", 804 | "Iteration = 800\n", 805 | "Cost function = 1.20561230098e+15\n", 806 | "Iteration = 900\n", 807 | "Cost function = 1.20560890236e+15\n", 808 | "Iteration = 1000\n", 809 | "Cost function = 1.20560785866e+15\n", 810 | "Done with gradient descent at iteration 1000\n", 811 | "Learned weights = [ -0.35743482 243.0541689 22.41481594]\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "l2_penalty = 0.0\n", 817 | "multiple_weights_0_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, \n", 818 | " l2_penalty, max_iterations)" 819 | ] 820 | }, 
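Both the one-feature and two-feature runs build their matrices with `get_numpy_data`, another helper defined earlier in the notebook and not shown in this section. Judging from how it is called, and from the fact that `weights[0]` is treated as an intercept attached to a constant feature, it is assumed to look roughly like the sketch below (the exact body may differ):

```python
def get_numpy_data(data_sframe, features, output):
    # Assumed reconstruction: prepend a column of 1s so weights[0] acts as the intercept.
    data_sframe['constant'] = 1
    features = ['constant'] + list(features)
    feature_matrix = data_sframe[features].to_numpy()   # shape (n_rows, n_features + 1)
    output_array = data_sframe[output].to_numpy()       # shape (n_rows,)
    return (feature_matrix, output_array)
```

The returned matrix has one more column than the feature list passed in, which is why `initial_weights` needs two entries for the one-feature model and three for the two-feature model.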
821 | { 822 | "cell_type": "markdown", 823 | "metadata": {}, 824 | "source": [ 825 | "#### Learned weights with regulariztion" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 111, 831 | "metadata": { 832 | "collapsed": false 833 | }, 834 | "outputs": [ 835 | { 836 | "name": "stdout", 837 | "output_type": "stream", 838 | "text": [ 839 | "Starting gradient descent with l2_penalty = 1e+11\n", 840 | "Iteration = 1\n", 841 | "Cost function = 7.43305185103e+15\n", 842 | "Iteration = 2\n", 843 | "Cost function = 4.46048979029e+15\n", 844 | "Iteration = 3\n", 845 | "Cost function = 3.79667446884e+15\n", 846 | "Iteration = 4\n", 847 | "Cost function = 3.64831953044e+15\n", 848 | "Iteration = 5\n", 849 | "Cost function = 3.61509110322e+15\n", 850 | "Iteration = 6\n", 851 | "Cost function = 3.60760274251e+15\n", 852 | "Iteration = 7\n", 853 | "Cost function = 3.60588632216e+15\n", 854 | "Iteration = 8\n", 855 | "Cost function = 3.60547487453e+15\n", 856 | "Iteration = 9\n", 857 | "Cost function = 3.60536516777e+15\n", 858 | "Iteration = 10\n", 859 | "Cost function = 3.60532940218e+15\n", 860 | "Iteration = 20\n", 861 | "Cost function = 3.60529428102e+15\n", 862 | "Iteration = 30\n", 863 | "Cost function = 3.60529353727e+15\n", 864 | "Iteration = 40\n", 865 | "Cost function = 3.60529308275e+15\n", 866 | "Iteration = 50\n", 867 | "Cost function = 3.60529263111e+15\n", 868 | "Iteration = 60\n", 869 | "Cost function = 3.60529217949e+15\n", 870 | "Iteration = 70\n", 871 | "Cost function = 3.60529172788e+15\n", 872 | "Iteration = 80\n", 873 | "Cost function = 3.60529127626e+15\n", 874 | "Iteration = 90\n", 875 | "Cost function = 3.60529082465e+15\n", 876 | "Iteration = 100\n", 877 | "Cost function = 3.60529037303e+15\n", 878 | "Iteration = 200\n", 879 | "Cost function = 3.6052858569e+15\n", 880 | "Iteration = 300\n", 881 | "Cost function = 3.60528134078e+15\n", 882 | "Iteration = 400\n", 883 | "Cost function = 3.60527682468e+15\n", 884 | "Iteration = 500\n", 885 | "Cost function = 3.60527230859e+15\n", 886 | "Iteration = 600\n", 887 | "Cost function = 3.60526779252e+15\n", 888 | "Iteration = 700\n", 889 | "Cost function = 3.60526327646e+15\n", 890 | "Iteration = 800\n", 891 | "Cost function = 3.60525876041e+15\n", 892 | "Iteration = 900\n", 893 | "Cost function = 3.60525424438e+15\n", 894 | "Iteration = 1000\n", 895 | "Cost function = 3.60524972836e+15\n", 896 | "Done with gradient descent at iteration 1000\n", 897 | "Learned weights = [ 6.7429658 91.48927361 78.43658768]\n" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "l2_penalty = 1e11\n", 903 | "multiple_weights_high_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, \n", 904 | " l2_penalty, max_iterations)" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "### RSS on the TEST data: (Multiple model)\n", 912 | "1. The initial weights (all zeros)\n", 913 | "2. The weights learned with no regularization\n", 914 | "3. The weights learned with high regularization" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": 112, 920 | "metadata": { 921 | "collapsed": false 922 | }, 923 | "outputs": [ 924 | { 925 | "name": "stdout", 926 | "output_type": "stream", 927 | "text": [ 928 | "For multiple model and initial weights:\n", 929 | "Weight (Coefficients): [ 0. 0. 
0.]\n", 930 | "RSS: 1.78427328252e+15\n" 931 | ] 932 | } 933 | ], 934 | "source": [ 935 | "predictions = predict_output(test_feature_matrix, initial_weights)\n", 936 | "print \"For multiple model and initial weights:\" \n", 937 | "print \"Weight (Coefficients): \" + str(initial_weights)\n", 938 | "print \"RSS: \" + str(RSS(predictions, test_output))" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 113, 944 | "metadata": { 945 | "collapsed": false 946 | }, 947 | "outputs": [ 948 | { 949 | "name": "stdout", 950 | "output_type": "stream", 951 | "text": [ 952 | "For multiple model and weights with no regularization:\n", 953 | "Weight (Coefficients): [ -0.35743482 243.0541689 22.41481594]\n", 954 | "RSS: 2.74067618287e+14\n" 955 | ] 956 | } 957 | ], 958 | "source": [ 959 | "predictions = predict_output(test_feature_matrix, multiple_weights_0_penalty)\n", 960 | "print \"For multiple model and weights with no regularization:\" \n", 961 | "print \"Weight (Coefficients): \" + str(multiple_weights_0_penalty)\n", 962 | "print \"RSS: \" + str(RSS(predictions, test_output))" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 114, 968 | "metadata": { 969 | "collapsed": false 970 | }, 971 | "outputs": [ 972 | { 973 | "name": "stdout", 974 | "output_type": "stream", 975 | "text": [ 976 | "For multiple model and weights with regularization:\n", 977 | "Weight (Coefficients): [ 6.7429658 91.48927361 78.43658768]\n", 978 | "RSS: 5.0040480058e+14\n" 979 | ] 980 | } 981 | ], 982 | "source": [ 983 | "predictions = predict_output(test_feature_matrix, multiple_weights_high_penalty)\n", 984 | "print \"For multiple model and weights with regularization:\" \n", 985 | "print \"Weight (Coefficients): \" + str(multiple_weights_high_penalty)\n", 986 | "print \"RSS: \" + str(RSS(predictions, test_output))" 987 | ] 988 | }, 989 | { 990 | "cell_type": "markdown", 991 | "metadata": {}, 992 | "source": [ 993 | "### Which weights perform best?\n", 994 | "\n", 995 | "#### The weights with no regularization seem to perform the best in the multiple model too same as the simple model result!" 
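Note that the comparison above only tries the two extremes `l2_penalty = 0.0` and `l2_penalty = 1e11`, so "no regularization performs best" should be read as "best among these two settings". A natural follow-up, sketched below with an arbitrary, illustrative grid of penalties and reusing only functions and variables already defined in this notebook, is to sweep intermediate values and keep whichever gives the lowest RSS:

```python
# Hypothetical penalty sweep; the grid values are illustrative, not tuned.
best_penalty, best_rss = None, float('inf')
for penalty in [0.0, 1e3, 1e5, 1e7, 1e9, 1e11]:
    weights = ridge_regression_gradient_descent(feature_matrix, output,
                                                initial_weights, step_size,
                                                penalty, max_iterations)
    rss = RSS(predict_output(test_feature_matrix, weights), test_output)
    if rss < best_rss:
        best_penalty, best_rss = penalty, rss
print('Best l2_penalty = ' + str(best_penalty) + ' with test RSS = ' + str(best_rss))
```

Strictly speaking the penalty should be selected on a separate validation split rather than the test set; test-set RSS is used here only to mirror the comparison in the cells above.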
996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": {}, 1001 | "source": [ 1002 | "## House 1 price prediction from test dataset" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": 118, 1008 | "metadata": { 1009 | "collapsed": false 1010 | }, 1011 | "outputs": [ 1012 | { 1013 | "data": { 1014 | "text/plain": [ 1015 | "array([ 1.00000000e+00, 1.43000000e+03, 1.78000000e+03])" 1016 | ] 1017 | }, 1018 | "execution_count": 118, 1019 | "metadata": {}, 1020 | "output_type": "execute_result" 1021 | } 1022 | ], 1023 | "source": [ 1024 | "#House no.1 in the test dataset\n", 1025 | "test_feature_matrix[0]" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 77, 1031 | "metadata": { 1032 | "collapsed": false 1033 | }, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "text/plain": [ 1038 | "['sqft_living', 'sqft_living15']" 1039 | ] 1040 | }, 1041 | "execution_count": 77, 1042 | "metadata": {}, 1043 | "output_type": "execute_result" 1044 | } 1045 | ], 1046 | "source": [ 1047 | "#Features in consideration\n", 1048 | "model_features" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": 80, 1054 | "metadata": { 1055 | "collapsed": false 1056 | }, 1057 | "outputs": [ 1058 | { 1059 | "data": { 1060 | "text/plain": [ 1061 | "'price'" 1062 | ] 1063 | }, 1064 | "execution_count": 80, 1065 | "metadata": {}, 1066 | "output_type": "execute_result" 1067 | } 1068 | ], 1069 | "source": [ 1070 | "#Output feature to predict\n", 1071 | "my_output" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 119, 1077 | "metadata": { 1078 | "collapsed": false 1079 | }, 1080 | "outputs": [ 1081 | { 1082 | "data": { 1083 | "text/plain": [ 1084 | "array([ -0.35743482, 243.0541689 , 22.41481594])" 1085 | ] 1086 | }, 1087 | "execution_count": 119, 1088 | "metadata": {}, 1089 | "output_type": "execute_result" 1090 | } 1091 | ], 1092 | "source": [ 1093 | "#Weights with no regularization learned\n", 1094 | "multiple_weights_0_penalty" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "code", 1099 | "execution_count": 120, 1100 | "metadata": { 1101 | "collapsed": false 1102 | }, 1103 | "outputs": [ 1104 | { 1105 | "data": { 1106 | "text/plain": [ 1107 | "array([ 6.7429658 , 91.48927361, 78.43658768])" 1108 | ] 1109 | }, 1110 | "execution_count": 120, 1111 | "metadata": {}, 1112 | "output_type": "execute_result" 1113 | } 1114 | ], 1115 | "source": [ 1116 | "#Weights with regularization learned\n", 1117 | "multiple_weights_high_penalty" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "execution_count": 122, 1123 | "metadata": { 1124 | "collapsed": false 1125 | }, 1126 | "outputs": [ 1127 | { 1128 | "name": "stdout", 1129 | "output_type": "stream", 1130 | "text": [ 1131 | "The predicted house 1 price (weights with no regularization): \n" 1132 | ] 1133 | }, 1134 | { 1135 | "data": { 1136 | "text/plain": [ 1137 | "387465.47646474396" 1138 | ] 1139 | }, 1140 | "execution_count": 122, 1141 | "metadata": {}, 1142 | "output_type": "execute_result" 1143 | } 1144 | ], 1145 | "source": [ 1146 | "#Predicting the output using weights with no regularization\n", 1147 | "predicted_output = predict_output(test_feature_matrix[0:1], multiple_weights_0_penalty)\n", 1148 | "print \"The predicted house 1 price (weights with no regularization): \"\n", 1149 | "predicted_output[0]" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": 123, 1155 | "metadata": { 1156 | 
"collapsed": false 1157 | }, 1158 | "outputs": [ 1159 | { 1160 | "name": "stdout", 1161 | "output_type": "stream", 1162 | "text": [ 1163 | "The predicted house 1 price (weights with regularization): \n" 1164 | ] 1165 | }, 1166 | { 1167 | "data": { 1168 | "text/plain": [ 1169 | "270453.53030485858" 1170 | ] 1171 | }, 1172 | "execution_count": 123, 1173 | "metadata": {}, 1174 | "output_type": "execute_result" 1175 | } 1176 | ], 1177 | "source": [ 1178 | "#Predicting the output using weights with regularization\n", 1179 | "predicted_output = predict_output(test_feature_matrix[0:1], multiple_weights_high_penalty)\n", 1180 | "print \"The predicted house 1 price (weights with regularization): \"\n", 1181 | "predicted_output[0]" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": 125, 1187 | "metadata": { 1188 | "collapsed": false 1189 | }, 1190 | "outputs": [ 1191 | { 1192 | "name": "stdout", 1193 | "output_type": "stream", 1194 | "text": [ 1195 | "The actual house 1 price is: \n" 1196 | ] 1197 | }, 1198 | { 1199 | "data": { 1200 | "text/plain": [ 1201 | "310000.0" 1202 | ] 1203 | }, 1204 | "execution_count": 125, 1205 | "metadata": {}, 1206 | "output_type": "execute_result" 1207 | } 1208 | ], 1209 | "source": [ 1210 | "#Actual house 1 price\n", 1211 | "print \"The actual house 1 price is: \"\n", 1212 | "test_output[0]" 1213 | ] 1214 | } 1215 | ], 1216 | "metadata": { 1217 | "kernelspec": { 1218 | "display_name": "Python 2", 1219 | "language": "python", 1220 | "name": "python2" 1221 | }, 1222 | "language_info": { 1223 | "codemirror_mode": { 1224 | "name": "ipython", 1225 | "version": 2 1226 | }, 1227 | "file_extension": ".py", 1228 | "mimetype": "text/x-python", 1229 | "name": "python", 1230 | "nbconvert_exporter": "python", 1231 | "pygments_lexer": "ipython2", 1232 | "version": "2.7.13" 1233 | } 1234 | }, 1235 | "nbformat": 4, 1236 | "nbformat_minor": 0 1237 | } 1238 | -------------------------------------------------------------------------------- /simple-linear-regression/simple-linear-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple Linear Regression on House Sales data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 17, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "sales = graphlab.SFrame('kc_house_data.gl/')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Explore house sales data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 18, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
[HTML table rendering of sales[0:1] lost its markup during extraction; the same 1-row, 21-column preview follows in the text/plain output below]
" 127 | ], 128 | "text/plain": [ 129 | "Columns:\n", 130 | "\tid\tstr\n", 131 | "\tdate\tdatetime\n", 132 | "\tprice\tfloat\n", 133 | "\tbedrooms\tfloat\n", 134 | "\tbathrooms\tfloat\n", 135 | "\tsqft_living\tfloat\n", 136 | "\tsqft_lot\tint\n", 137 | "\tfloors\tstr\n", 138 | "\twaterfront\tint\n", 139 | "\tview\tint\n", 140 | "\tcondition\tint\n", 141 | "\tgrade\tint\n", 142 | "\tsqft_above\tint\n", 143 | "\tsqft_basement\tint\n", 144 | "\tyr_built\tint\n", 145 | "\tyr_renovated\tint\n", 146 | "\tzipcode\tstr\n", 147 | "\tlat\tfloat\n", 148 | "\tlong\tfloat\n", 149 | "\tsqft_living15\tfloat\n", 150 | "\tsqft_lot15\tfloat\n", 151 | "\n", 152 | "Rows: 1\n", 153 | "\n", 154 | "Data:\n", 155 | "+------------+---------------------------+----------+----------+-----------+\n", 156 | "| id | date | price | bedrooms | bathrooms |\n", 157 | "+------------+---------------------------+----------+----------+-----------+\n", 158 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900.0 | 3.0 | 1.0 |\n", 159 | "+------------+---------------------------+----------+----------+-----------+\n", 160 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 161 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 162 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 163 | "| 1180.0 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 |\n", 164 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 165 | "+---------------+----------+--------------+---------+-------------+\n", 166 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 167 | "+---------------+----------+--------------+---------+-------------+\n", 168 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 169 | "+---------------+----------+--------------+---------+-------------+\n", 170 | "+---------------+---------------+-----+\n", 171 | "| long | sqft_living15 | ... |\n", 172 | "+---------------+---------------+-----+\n", 173 | "| -122.25677536 | 1340.0 | ... 
|\n", 174 | "+---------------+---------------+-----+\n", 175 | "[1 rows x 21 columns]" 176 | ] 177 | }, 178 | "execution_count": 18, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "sales[0:1]" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Splitting the data" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 15, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "train_data, test_data = sales.random_split(.8,seed=0)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Simple linear regression algorithm - Part 1\n", 210 | "To calculate the intercept and the slope of the regression line" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 19, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# x is the input and y is the output\n", 222 | "def simple_linear_regression(x, y):\n", 223 | " \n", 224 | " # compute the sum of input and output\n", 225 | " sum = x + y\n", 226 | " \n", 227 | " # compute the product of the output and the input and its sum\n", 228 | " product = x * y\n", 229 | " sum_of_product = product.sum()\n", 230 | " \n", 231 | " # compute the squared value of the input and its sum\n", 232 | " x_squared = x * x\n", 233 | " sum_x_squared = x_squared.sum()\n", 234 | " \n", 235 | " # use the formula for the slope\n", 236 | " numerator = sum_of_product - ((x.sum() * y.sum()) / x.size())\n", 237 | " denominator = sum_x_squared - ((x.sum() * x.sum()) / x.size())\n", 238 | " slope = numerator / denominator\n", 239 | " \n", 240 | " # use the formula for the intercept\n", 241 | " intercept = y.mean() - (slope * x.mean())\n", 242 | " \n", 243 | " return (intercept, slope)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "We can test that our function works by passing it something where we know the answer. 
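For reference, the closed-form least-squares estimates that `simple_linear_regression` implements are

$$\hat{w}_1 = \frac{\sum_i x_i y_i - \tfrac{1}{N}\left(\sum_i x_i\right)\left(\sum_i y_i\right)}{\sum_i x_i^2 - \tfrac{1}{N}\left(\sum_i x_i\right)^2}, \qquad \hat{w}_0 = \bar{y} - \hat{w}_1\,\bar{x},$$

where $\hat{w}_0$ is the intercept and $\hat{w}_1$ the slope (the `sum = x + y` computed at the top of the function is never used afterwards and does not affect either estimate). With these formulas in mind, the synthetic test described next should recover an intercept and slope of exactly 1.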
In particular we can generate a feature and then put the output exactly on a line: output = 1 + 1\\*input_feature then we know both our slope and intercept should be 1" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 20, 256 | "metadata": { 257 | "collapsed": false, 258 | "scrolled": true 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "[0L, 1L, 2L, 3L, 4L]\n", 266 | "[1L, 2L, 3L, 4L, 5L]\n", 267 | "Intercept: 1.0\n", 268 | "Slope: 1\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "test_feature = graphlab.SArray(range(5))\n", 274 | "test_output = graphlab.SArray(1 + 1*test_feature)\n", 275 | "(test_intercept, test_slope) = simple_linear_regression(test_feature, test_output)\n", 276 | "print test_feature\n", 277 | "print test_output\n", 278 | "print \"Intercept: \" + str(test_intercept)\n", 279 | "print \"Slope: \" + str(test_slope)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "So now it works let's build a regression model for predicting price based on sqft_living" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 21, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "Intercept: -47116.0765749\n", 301 | "Slope: 281.958838568\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])\n", 307 | "\n", 308 | "print \"Intercept: \" + str(sqft_intercept)\n", 309 | "print \"Slope: \" + str(sqft_slope)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## Simple linear regression algorithm - Part 2\n", 317 | "To calculate the predicted output" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 22, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "def get_regression_predictions(input_feature, intercept, slope):\n", 329 | " # calculate the predicted values:\n", 330 | " predicted_values = intercept + (slope * input_feature)\n", 331 | " return predicted_values" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "**What is the predicted price for a house with 2650 sqft?**" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 23, 344 | "metadata": { 345 | "collapsed": false 346 | }, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "The estimated price for a house with 2650 squarefeet is $700074.85\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "my_house_sqft = 2650\n", 358 | "estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)\n", 359 | "print \"The estimated price for a house with %d squarefeet is $%.2f\" % (my_house_sqft, estimated_price)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## Residual Sum of Squares" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "RSS is the sum of the squares of the residuals which is the difference between the predicted output and the true output." 
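In symbols, for the simple model above,

$$\mathrm{RSS}(w_0, w_1) = \sum_{i=1}^{N}\left(y_i - \hat{y}_i\right)^2 = \sum_{i=1}^{N}\left(y_i - (w_0 + w_1 x_i)\right)^2,$$

which is exactly what `get_residual_sum_of_squares` in the next cell computes from an input feature column, the true prices, and a fitted intercept and slope.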
374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 24, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "def get_residual_sum_of_squares(input_feature, actual_output, intercept, slope):\n", 385 | " # First get the predictions\n", 386 | " predicted_output = intercept + (slope * input_feature)\n", 387 | "\n", 388 | " # then compute the residuals (since we are squaring it doesn't matter which order you subtract)\n", 389 | " residuals = actual_output - predicted_output\n", 390 | "\n", 391 | " # square the residuals and add them up\n", 392 | " residuals_squared = residuals * residuals\n", 393 | " residual_sum_squares = residuals_squared.sum()\n", 394 | "\n", 395 | " return(residual_sum_squares)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 25, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "The RSS of predicting Prices based on Square Feet is : 1.20191835632e+15\n" 410 | ] 411 | } 412 | ], 413 | "source": [ 414 | "rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], sqft_intercept, sqft_slope)\n", 415 | "print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "### Function to predict the squarefeet of a house from a given price" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 26, 428 | "metadata": { 429 | "collapsed": true 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "def inverse_regression_predictions(output, intercept, slope):\n", 434 | " # solve output = intercept + slope*input_feature for input_feature. Use this equation to compute the inverse predictions:\n", 435 | " estimated_feature = (output - intercept) / slope\n", 436 | " return estimated_feature" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "**What is the estimated square-feet for a house costing $800,000?**" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 27, 449 | "metadata": { 450 | "collapsed": false 451 | }, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "The estimated squarefeet for a house worth $800000.00 is 3004\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "my_house_price = 800000\n", 463 | "estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)\n", 464 | "print \"The estimated squarefeet for a house worth $%.2f is %d\" % (my_house_price, estimated_squarefeet)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Estimate house price from no. 
of bedrooms" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 28, 477 | "metadata": { 478 | "collapsed": false 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Intercept: 109473.180469\n", 486 | "Slope: 127588.952175\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "# Estimate the slope and intercept for predicting 'price' based on 'bedrooms'\n", 492 | "bedrm_intercept, bedrm_slope = simple_linear_regression(train_data['bedrooms'], train_data['price'])\n", 493 | "print \"Intercept: \" + str(bedrm_intercept)\n", 494 | "print \"Slope: \" + str(bedrm_slope)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "### Test Linear Regression Algorithm for Square feet and Bedrooms Model" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "**Which model (square feet or bedrooms) has lowest RSS on TEST data?**" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 29, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "275402936247141.3" 522 | ] 523 | }, 524 | "execution_count": 29, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "# Compute RSS when using bedrooms on TEST data:\n", 531 | "get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 30, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "493364582868287.94" 545 | ] 546 | }, 547 | "execution_count": 30, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "# Compute RSS when using squarefeet on TEST data:\n", 554 | "get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], bedrm_intercept, bedrm_slope)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "# So the Square feet model has a lower RSS than the Bedrooms model." 562 | ] 563 | } 564 | ], 565 | "metadata": { 566 | "kernelspec": { 567 | "display_name": "Python 2", 568 | "language": "python", 569 | "name": "python2" 570 | }, 571 | "language_info": { 572 | "codemirror_mode": { 573 | "name": "ipython", 574 | "version": 2 575 | }, 576 | "file_extension": ".py", 577 | "mimetype": "text/x-python", 578 | "name": "python", 579 | "nbconvert_exporter": "python", 580 | "pygments_lexer": "ipython2", 581 | "version": "2.7.13" 582 | } 583 | }, 584 | "nbformat": 4, 585 | "nbformat_minor": 0 586 | } 587 | --------------------------------------------------------------------------------