├── README.md ├── data ├── kc_house_data.gl │ ├── dir_archive.ini │ ├── m_fb913aaf43c120c4.0000 │ ├── m_fb913aaf43c120c4.frame_idx │ ├── m_fb913aaf43c120c4.sidx │ └── objects.bin └── kc_house_data_small.gl │ ├── dir_archive.ini │ ├── m_ef92e6258b8f7992.0000 │ ├── m_ef92e6258b8f7992.frame_idx │ ├── m_ef92e6258b8f7992.sidx │ └── objects.bin ├── lasso-regression └── lasso-regression.ipynb ├── multiple-linear-regression ├── multiple-regression-gradient-descent.ipynb └── multiple-regression.ipynb ├── nearest-neighbor-regression └── nearest-neighbor-regression.ipynb ├── polynomial-regression └── polynomial-regression.ipynb ├── ridge-regression ├── ridge-regression-gradient-descent.ipynb └── ridge-regression.ipynb └── simple-linear-regression └── simple-linear-regression.ipynb /README.md: -------------------------------------------------------------------------------- 1 | ## Machine Learning Regression: House Sales Price Prediction Models 2 | 3 | ### Description 4 | * Implemented linear regression and k-nearest-neighbors regression with gradient descent optimization to build models that predict house prices on the King County (Seattle, WA) house sales dataset. 5 | * Performed feature engineering and selection using lasso and ridge penalties to eliminate features that had little or no impact on the residual sum of squares (RSS) error. 6 | 7 | ### Code 8 | 1. [Simple Linear Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/simple-linear-regression/simple-linear-regression.ipynb) 9 | 2. [Multiple Linear Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/multiple-linear-regression/multiple-regression.ipynb) 10 | 3. [Multiple Linear Regression with Gradient Descent Optimization](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/multiple-linear-regression/multiple-regression-gradient-descent.ipynb) 11 | 4. [Polynomial Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/polynomial-regression/polynomial-regression.ipynb) 12 | 5. [Ridge Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/ridge-regression/ridge-regression.ipynb) 13 | 6. [Ridge Regression with Gradient Descent Optimization](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/ridge-regression/ridge-regression-gradient-descent.ipynb) 14 | 7. [Lasso Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/lasso-regression/lasso-regression.ipynb) 15 | 8. [Nearest Neighbor Regression](https://github.com/agrawal-priyank/machine-learning-regression/blob/master/nearest-neighbor-regression/nearest-neighbor-regression.ipynb) 16 | 17 | ### [Data](https://github.com/agrawal-priyank/machine-learning-regression/tree/master/data) 18 | 19 | ### Programming Language 20 | Python 21 | 22 | ### Packages 23 | Anaconda, GraphLab Create ([installation guide](https://turi.com/learn/coursera/)) 24 | 25 | ### Tools/IDE 26 | Jupyter Notebook (IPython) 27 | 28 | ### How to use it 29 | 1. Fork this repository to get your own copy 30 | 2. Clone your fork to your local system 31 | 3. Install the necessary packages 32 | 33 | ### Note 34 | This repository does not contain optimal machine learning models! It only compares various models that can be built using different machine learning algorithms (either implemented from scratch or used directly from the GraphLab Create package) for different regression tasks. 
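### Quick example

A minimal sketch (not taken from the notebooks) of loading the bundled data with GraphLab Create; run it from the repository root, and note that the feature list below is only illustrative:

```python
import graphlab

# Load the King County house sales SFrame shipped in the data folder
sales = graphlab.SFrame('data/kc_house_data.gl/')

# Fit a basic least-squares model on a few columns (Python 2 syntax, as in the notebooks)
model = graphlab.linear_regression.create(
    sales, target='price',
    features=['sqft_living', 'bedrooms', 'bathrooms'],
    validation_set=None)

print model['coefficients']
```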
35 | -------------------------------------------------------------------------------- /data/kc_house_data.gl/dir_archive.ini: -------------------------------------------------------------------------------- 1 | [archive] 2 | version=1 3 | num_prefixes=3 4 | [metadata] 5 | contents=sframe 6 | [prefixes] 7 | 0000=dir_archive.ini 8 | 0001=objects.bin 9 | 0002=m_fb913aaf43c120c4 10 | -------------------------------------------------------------------------------- /data/kc_house_data.gl/m_fb913aaf43c120c4.0000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data.gl/m_fb913aaf43c120c4.0000 -------------------------------------------------------------------------------- /data/kc_house_data.gl/m_fb913aaf43c120c4.frame_idx: -------------------------------------------------------------------------------- 1 | [sframe] 2 | version=0 3 | num_segments=0 4 | num_columns=21 5 | nrows=21613 6 | [column_names] 7 | 0000=id 8 | 0001=date 9 | 0002=price 10 | 0003=bedrooms 11 | 0004=bathrooms 12 | 0005=sqft_living 13 | 0006=sqft_lot 14 | 0007=floors 15 | 0008=waterfront 16 | 0009=view 17 | 0010=condition 18 | 0011=grade 19 | 0012=sqft_above 20 | 0013=sqft_basement 21 | 0014=yr_built 22 | 0015=yr_renovated 23 | 0016=zipcode 24 | 0017=lat 25 | 0018=long 26 | 0019=sqft_living15 27 | 0020=sqft_lot15 28 | [column_files] 29 | 0000=m_fb913aaf43c120c4.sidx:0 30 | 0001=m_fb913aaf43c120c4.sidx:1 31 | 0002=m_fb913aaf43c120c4.sidx:2 32 | 0003=m_fb913aaf43c120c4.sidx:3 33 | 0004=m_fb913aaf43c120c4.sidx:4 34 | 0005=m_fb913aaf43c120c4.sidx:5 35 | 0006=m_fb913aaf43c120c4.sidx:6 36 | 0007=m_fb913aaf43c120c4.sidx:7 37 | 0008=m_fb913aaf43c120c4.sidx:8 38 | 0009=m_fb913aaf43c120c4.sidx:9 39 | 0010=m_fb913aaf43c120c4.sidx:10 40 | 0011=m_fb913aaf43c120c4.sidx:11 41 | 0012=m_fb913aaf43c120c4.sidx:12 42 | 0013=m_fb913aaf43c120c4.sidx:13 43 | 0014=m_fb913aaf43c120c4.sidx:14 44 | 0015=m_fb913aaf43c120c4.sidx:15 45 | 0016=m_fb913aaf43c120c4.sidx:16 46 | 0017=m_fb913aaf43c120c4.sidx:17 47 | 0018=m_fb913aaf43c120c4.sidx:18 48 | 0019=m_fb913aaf43c120c4.sidx:19 49 | 0020=m_fb913aaf43c120c4.sidx:20 50 | -------------------------------------------------------------------------------- /data/kc_house_data.gl/m_fb913aaf43c120c4.sidx: -------------------------------------------------------------------------------- 1 | { 2 | "sarray" : { 3 | "version" : 2, 4 | "num_segments" : 1 5 | }, 6 | "segment_files" : { 7 | "0000" : "m_fb913aaf43c120c4.0000" 8 | }, 9 | "columns" : [ 10 | { 11 | "content_type" : "", 12 | "metadata" : { 13 | "__type__" : "2" 14 | }, 15 | "segment_sizes" : { 16 | "0000" : "21613" 17 | } 18 | }, 19 | { 20 | "content_type" : "", 21 | "metadata" : { 22 | "__type__" : "6" 23 | }, 24 | "segment_sizes" : { 25 | "0000" : "21613" 26 | } 27 | }, 28 | { 29 | "content_type" : "", 30 | "metadata" : { 31 | "__type__" : "1" 32 | }, 33 | "segment_sizes" : { 34 | "0000" : "21613" 35 | } 36 | }, 37 | { 38 | "content_type" : "", 39 | "metadata" : { 40 | "__type__" : "1" 41 | }, 42 | "segment_sizes" : { 43 | "0000" : "21613" 44 | } 45 | }, 46 | { 47 | "content_type" : "", 48 | "metadata" : { 49 | "__type__" : "1" 50 | }, 51 | "segment_sizes" : { 52 | "0000" : "21613" 53 | } 54 | }, 55 | { 56 | "content_type" : "", 57 | "metadata" : { 58 | "__type__" : "1" 59 | }, 60 | "segment_sizes" : { 61 | "0000" : "21613" 62 | } 63 | }, 64 | { 65 | "content_type" : "", 66 | "metadata" : { 67 | 
"__type__" : "0" 68 | }, 69 | "segment_sizes" : { 70 | "0000" : "21613" 71 | } 72 | }, 73 | { 74 | "content_type" : "", 75 | "metadata" : { 76 | "__type__" : "2" 77 | }, 78 | "segment_sizes" : { 79 | "0000" : "21613" 80 | } 81 | }, 82 | { 83 | "content_type" : "", 84 | "metadata" : { 85 | "__type__" : "0" 86 | }, 87 | "segment_sizes" : { 88 | "0000" : "21613" 89 | } 90 | }, 91 | { 92 | "content_type" : "", 93 | "metadata" : { 94 | "__type__" : "0" 95 | }, 96 | "segment_sizes" : { 97 | "0000" : "21613" 98 | } 99 | }, 100 | { 101 | "content_type" : "", 102 | "metadata" : { 103 | "__type__" : "0" 104 | }, 105 | "segment_sizes" : { 106 | "0000" : "21613" 107 | } 108 | }, 109 | { 110 | "content_type" : "", 111 | "metadata" : { 112 | "__type__" : "0" 113 | }, 114 | "segment_sizes" : { 115 | "0000" : "21613" 116 | } 117 | }, 118 | { 119 | "content_type" : "", 120 | "metadata" : { 121 | "__type__" : "0" 122 | }, 123 | "segment_sizes" : { 124 | "0000" : "21613" 125 | } 126 | }, 127 | { 128 | "content_type" : "", 129 | "metadata" : { 130 | "__type__" : "0" 131 | }, 132 | "segment_sizes" : { 133 | "0000" : "21613" 134 | } 135 | }, 136 | { 137 | "content_type" : "", 138 | "metadata" : { 139 | "__type__" : "0" 140 | }, 141 | "segment_sizes" : { 142 | "0000" : "21613" 143 | } 144 | }, 145 | { 146 | "content_type" : "", 147 | "metadata" : { 148 | "__type__" : "0" 149 | }, 150 | "segment_sizes" : { 151 | "0000" : "21613" 152 | } 153 | }, 154 | { 155 | "content_type" : "", 156 | "metadata" : { 157 | "__type__" : "2" 158 | }, 159 | "segment_sizes" : { 160 | "0000" : "21613" 161 | } 162 | }, 163 | { 164 | "content_type" : "", 165 | "metadata" : { 166 | "__type__" : "1" 167 | }, 168 | "segment_sizes" : { 169 | "0000" : "21613" 170 | } 171 | }, 172 | { 173 | "content_type" : "", 174 | "metadata" : { 175 | "__type__" : "1" 176 | }, 177 | "segment_sizes" : { 178 | "0000" : "21613" 179 | } 180 | }, 181 | { 182 | "content_type" : "", 183 | "metadata" : { 184 | "__type__" : "1" 185 | }, 186 | "segment_sizes" : { 187 | "0000" : "21613" 188 | } 189 | }, 190 | { 191 | "content_type" : "", 192 | "metadata" : { 193 | "__type__" : "1" 194 | }, 195 | "segment_sizes" : { 196 | "0000" : "21613" 197 | } 198 | } 199 | ] 200 | } -------------------------------------------------------------------------------- /data/kc_house_data.gl/objects.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data.gl/objects.bin -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/dir_archive.ini: -------------------------------------------------------------------------------- 1 | [archive] 2 | version=1 3 | num_prefixes=3 4 | [metadata] 5 | contents=sframe 6 | [prefixes] 7 | 0000=dir_archive.ini 8 | 0001=objects.bin 9 | 0002=m_ef92e6258b8f7992 10 | -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/m_ef92e6258b8f7992.0000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data_small.gl/m_ef92e6258b8f7992.0000 -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/m_ef92e6258b8f7992.frame_idx: 
-------------------------------------------------------------------------------- 1 | [sframe] 2 | version=0 3 | num_segments=0 4 | num_columns=21 5 | nrows=8703 6 | [column_names] 7 | 0000=id 8 | 0001=date 9 | 0002=price 10 | 0003=bedrooms 11 | 0004=bathrooms 12 | 0005=sqft_living 13 | 0006=sqft_lot 14 | 0007=floors 15 | 0008=waterfront 16 | 0009=view 17 | 0010=condition 18 | 0011=grade 19 | 0012=sqft_above 20 | 0013=sqft_basement 21 | 0014=yr_built 22 | 0015=yr_renovated 23 | 0016=zipcode 24 | 0017=lat 25 | 0018=long 26 | 0019=sqft_living15 27 | 0020=sqft_lot15 28 | [column_files] 29 | 0000=m_ef92e6258b8f7992.sidx:0 30 | 0001=m_ef92e6258b8f7992.sidx:1 31 | 0002=m_ef92e6258b8f7992.sidx:2 32 | 0003=m_ef92e6258b8f7992.sidx:3 33 | 0004=m_ef92e6258b8f7992.sidx:4 34 | 0005=m_ef92e6258b8f7992.sidx:5 35 | 0006=m_ef92e6258b8f7992.sidx:6 36 | 0007=m_ef92e6258b8f7992.sidx:7 37 | 0008=m_ef92e6258b8f7992.sidx:8 38 | 0009=m_ef92e6258b8f7992.sidx:9 39 | 0010=m_ef92e6258b8f7992.sidx:10 40 | 0011=m_ef92e6258b8f7992.sidx:11 41 | 0012=m_ef92e6258b8f7992.sidx:12 42 | 0013=m_ef92e6258b8f7992.sidx:13 43 | 0014=m_ef92e6258b8f7992.sidx:14 44 | 0015=m_ef92e6258b8f7992.sidx:15 45 | 0016=m_ef92e6258b8f7992.sidx:16 46 | 0017=m_ef92e6258b8f7992.sidx:17 47 | 0018=m_ef92e6258b8f7992.sidx:18 48 | 0019=m_ef92e6258b8f7992.sidx:19 49 | 0020=m_ef92e6258b8f7992.sidx:20 50 | -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/m_ef92e6258b8f7992.sidx: -------------------------------------------------------------------------------- 1 | { 2 | "sarray" : { 3 | "version" : 2, 4 | "num_segments" : 1 5 | }, 6 | "segment_files" : { 7 | "0000" : "m_ef92e6258b8f7992.0000" 8 | }, 9 | "columns" : [ 10 | { 11 | "content_type" : "", 12 | "metadata" : { 13 | "__type__" : "2" 14 | }, 15 | "segment_sizes" : { 16 | "0000" : "8703" 17 | } 18 | }, 19 | { 20 | "content_type" : "", 21 | "metadata" : { 22 | "__type__" : "6" 23 | }, 24 | "segment_sizes" : { 25 | "0000" : "8703" 26 | } 27 | }, 28 | { 29 | "content_type" : "", 30 | "metadata" : { 31 | "__type__" : "0" 32 | }, 33 | "segment_sizes" : { 34 | "0000" : "8703" 35 | } 36 | }, 37 | { 38 | "content_type" : "", 39 | "metadata" : { 40 | "__type__" : "1" 41 | }, 42 | "segment_sizes" : { 43 | "0000" : "8703" 44 | } 45 | }, 46 | { 47 | "content_type" : "", 48 | "metadata" : { 49 | "__type__" : "1" 50 | }, 51 | "segment_sizes" : { 52 | "0000" : "8703" 53 | } 54 | }, 55 | { 56 | "content_type" : "", 57 | "metadata" : { 58 | "__type__" : "1" 59 | }, 60 | "segment_sizes" : { 61 | "0000" : "8703" 62 | } 63 | }, 64 | { 65 | "content_type" : "", 66 | "metadata" : { 67 | "__type__" : "0" 68 | }, 69 | "segment_sizes" : { 70 | "0000" : "8703" 71 | } 72 | }, 73 | { 74 | "content_type" : "", 75 | "metadata" : { 76 | "__type__" : "1" 77 | }, 78 | "segment_sizes" : { 79 | "0000" : "8703" 80 | } 81 | }, 82 | { 83 | "content_type" : "", 84 | "metadata" : { 85 | "__type__" : "0" 86 | }, 87 | "segment_sizes" : { 88 | "0000" : "8703" 89 | } 90 | }, 91 | { 92 | "content_type" : "", 93 | "metadata" : { 94 | "__type__" : "0" 95 | }, 96 | "segment_sizes" : { 97 | "0000" : "8703" 98 | } 99 | }, 100 | { 101 | "content_type" : "", 102 | "metadata" : { 103 | "__type__" : "0" 104 | }, 105 | "segment_sizes" : { 106 | "0000" : "8703" 107 | } 108 | }, 109 | { 110 | "content_type" : "", 111 | "metadata" : { 112 | "__type__" : "0" 113 | }, 114 | "segment_sizes" : { 115 | "0000" : "8703" 116 | } 117 | }, 118 | { 119 | "content_type" : "", 120 | "metadata" : { 121 | 
"__type__" : "0" 122 | }, 123 | "segment_sizes" : { 124 | "0000" : "8703" 125 | } 126 | }, 127 | { 128 | "content_type" : "", 129 | "metadata" : { 130 | "__type__" : "0" 131 | }, 132 | "segment_sizes" : { 133 | "0000" : "8703" 134 | } 135 | }, 136 | { 137 | "content_type" : "", 138 | "metadata" : { 139 | "__type__" : "0" 140 | }, 141 | "segment_sizes" : { 142 | "0000" : "8703" 143 | } 144 | }, 145 | { 146 | "content_type" : "", 147 | "metadata" : { 148 | "__type__" : "0" 149 | }, 150 | "segment_sizes" : { 151 | "0000" : "8703" 152 | } 153 | }, 154 | { 155 | "content_type" : "", 156 | "metadata" : { 157 | "__type__" : "2" 158 | }, 159 | "segment_sizes" : { 160 | "0000" : "8703" 161 | } 162 | }, 163 | { 164 | "content_type" : "", 165 | "metadata" : { 166 | "__type__" : "1" 167 | }, 168 | "segment_sizes" : { 169 | "0000" : "8703" 170 | } 171 | }, 172 | { 173 | "content_type" : "", 174 | "metadata" : { 175 | "__type__" : "1" 176 | }, 177 | "segment_sizes" : { 178 | "0000" : "8703" 179 | } 180 | }, 181 | { 182 | "content_type" : "", 183 | "metadata" : { 184 | "__type__" : "1" 185 | }, 186 | "segment_sizes" : { 187 | "0000" : "8703" 188 | } 189 | }, 190 | { 191 | "content_type" : "", 192 | "metadata" : { 193 | "__type__" : "1" 194 | }, 195 | "segment_sizes" : { 196 | "0000" : "8703" 197 | } 198 | } 199 | ] 200 | } -------------------------------------------------------------------------------- /data/kc_house_data_small.gl/objects.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/agrawal-priyank/machine-learning-regression/8a58fda3947e28289c9277c74d899f87f0a4a79b/data/kc_house_data_small.gl/objects.bin -------------------------------------------------------------------------------- /lasso-regression/lasso-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Lasso Regression on House Sales Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 98, 40 | "metadata": { 41 | "collapsed": false, 42 | "scrolled": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "sales = graphlab.SFrame('kc_house_data.gl/')" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "### Explore house sales data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 99, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/html": [ 66 | "
\n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront
71293005202014-10-13 00:00:00+00:00221900.03.01.01180.0565010
\n", 90 | "\n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | "
viewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelat
03711800195509817847.51123398
\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
longsqft_living15sqft_lot15
-122.256775361340.05650.0
\n", 126 | "[1 rows x 21 columns]
\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | "Columns:\n", 131 | "\tid\tstr\n", 132 | "\tdate\tdatetime\n", 133 | "\tprice\tfloat\n", 134 | "\tbedrooms\tfloat\n", 135 | "\tbathrooms\tfloat\n", 136 | "\tsqft_living\tfloat\n", 137 | "\tsqft_lot\tint\n", 138 | "\tfloors\tstr\n", 139 | "\twaterfront\tint\n", 140 | "\tview\tint\n", 141 | "\tcondition\tint\n", 142 | "\tgrade\tint\n", 143 | "\tsqft_above\tint\n", 144 | "\tsqft_basement\tint\n", 145 | "\tyr_built\tint\n", 146 | "\tyr_renovated\tint\n", 147 | "\tzipcode\tstr\n", 148 | "\tlat\tfloat\n", 149 | "\tlong\tfloat\n", 150 | "\tsqft_living15\tfloat\n", 151 | "\tsqft_lot15\tfloat\n", 152 | "\n", 153 | "Rows: 1\n", 154 | "\n", 155 | "Data:\n", 156 | "+------------+---------------------------+----------+----------+-----------+\n", 157 | "| id | date | price | bedrooms | bathrooms |\n", 158 | "+------------+---------------------------+----------+----------+-----------+\n", 159 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900.0 | 3.0 | 1.0 |\n", 160 | "+------------+---------------------------+----------+----------+-----------+\n", 161 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 162 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 163 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 164 | "| 1180.0 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 |\n", 165 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 166 | "+---------------+----------+--------------+---------+-------------+\n", 167 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 168 | "+---------------+----------+--------------+---------+-------------+\n", 169 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 170 | "+---------------+----------+--------------+---------+-------------+\n", 171 | "+---------------+---------------+-----+\n", 172 | "| long | sqft_living15 | ... |\n", 173 | "+---------------+---------------+-----+\n", 174 | "| -122.25677536 | 1340.0 | ... 
|\n", 175 | "+---------------+---------------+-----+\n", 176 | "[1 rows x 21 columns]" 177 | ] 178 | }, 179 | "execution_count": 99, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "sales[0:1]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### Import Numpy" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 100, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "import numpy as np" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "### Create new features" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 101, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "from math import log, sqrt\n", 222 | "sales['sqft_living_sqrt'] = sales['sqft_living'].apply(sqrt)\n", 223 | "sales['sqft_lot_sqrt'] = sales['sqft_lot'].apply(sqrt)\n", 224 | "sales['bedrooms_square'] = sales['bedrooms']*sales['bedrooms']\n", 225 | "\n", 226 | "# In the dataset, 'floors' was defined with type string, \n", 227 | "# so we'll convert them to float, before creating a new feature.\n", 228 | "sales['floors'] = sales['floors'].astype(float)\n", 229 | "sales['floors_square'] = sales['floors']*sales['floors']" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "* Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly affect houses with many bedrooms.\n", 237 | "* On the other hand, taking square root of sqft_living will decrease the separation between big house and small house. The owner may not be exactly twice as happy for getting a house that is twice as big." 
238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "### Selected features" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 102, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "all_features = ['bedrooms', 'bedrooms_square',\n", 256 | " 'bathrooms',\n", 257 | " 'sqft_living', 'sqft_living_sqrt',\n", 258 | " 'sqft_lot', 'sqft_lot_sqrt',\n", 259 | " 'floors', 'floors_square',\n", 260 | " 'waterfront', 'view', 'condition', 'grade',\n", 261 | " 'sqft_above',\n", 262 | " 'sqft_basement',\n", 263 | " 'yr_built', 'yr_renovated']" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Model with a choosen l1 penalty" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Linear regression model with a single l1 penalty (lasso)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 103, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "model_all = graphlab.linear_regression.create(sales, target='price', features=all_features,\n", 289 | " validation_set=None, l1_penalty=1e10, l2_penalty=0., verbose=None)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Explore coefficients in the model" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 105, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "+------------------+-------+---------------+--------+\n", 311 | "| name | index | value | stderr |\n", 312 | "+------------------+-------+---------------+--------+\n", 313 | "| (intercept) | None | 274873.05595 | None |\n", 314 | "| bedrooms | None | 0.0 | None |\n", 315 | "| bedrooms_square | None | 0.0 | None |\n", 316 | "| bathrooms | None | 8468.53108691 | None |\n", 317 | "| sqft_living | None | 24.4207209824 | None |\n", 318 | "| sqft_living_sqrt | None | 350.060553386 | None |\n", 319 | "| sqft_lot | None | 0.0 | None |\n", 320 | "| sqft_lot_sqrt | None | 0.0 | None |\n", 321 | "| floors | None | 0.0 | None |\n", 322 | "| floors_square | None | 0.0 | None |\n", 323 | "| waterfront | None | 0.0 | None |\n", 324 | "| view | None | 0.0 | None |\n", 325 | "| condition | None | 0.0 | None |\n", 326 | "| grade | None | 842.068034898 | None |\n", 327 | "| sqft_above | None | 20.0247224171 | None |\n", 328 | "| sqft_basement | None | 0.0 | None |\n", 329 | "| yr_built | None | 0.0 | None |\n", 330 | "| yr_renovated | None | 0.0 | None |\n", 331 | "+------------------+-------+---------------+--------+\n", 332 | "[18 rows x 4 columns]\n", 333 | "\n", 334 | "None\n", 335 | "Number of non zero coefficients: 6\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "print model_all['coefficients'].print_rows(num_rows=18)\n", 341 | "print \"Number of non zero coefficients: \" ,model_all['coefficients']['value'].nnz()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Note that a majority of the weights have been set to zero. So by setting an L1 penalty that's large enough, we are performing a subset selection." 
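, "\n", "\n", "(A rough picture of why: the fit minimizes RSS(w) + l1_penalty\\*(|w[1]| + ... + |w[k]|), so once the penalty is large enough, the cheapest way to lower the objective is to push the weights of weakly useful features exactly to zero, leaving only a small subset of features in the model.)"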
349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Splitting the data" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 106, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "(training_and_validation, testing) = sales.random_split(.9,seed=1)\n", 367 | "(training, validation) = training_and_validation.random_split(0.5, seed=1)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## Model with best selected L1 penalty from a range of l1 penalties" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 141, 380 | "metadata": { 381 | "collapsed": true 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "max_nonzeros = 7 # maximum non zero weights allowed" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "### Exploring the larger range of values for l1 penalty to find a narrow range with the desired sparsity" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 112, 398 | "metadata": { 399 | "collapsed": false 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "l1_penalty_values = np.logspace(8, 10, num=20)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 113, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "non_zeros = []\n", 415 | "l1_penalties = []\n", 416 | "for l1_penalty in l1_penalty_values:\n", 417 | " model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None, \n", 418 | " l1_penalty=l1_penalty, l2_penalty=0., verbose=False)\n", 419 | " non_zeros.append(model['coefficients']['value'].nnz())\n", 420 | " l1_penalties.append(l1_penalty)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "L1 penalties applied to the models and the corresponding coefficients which are non zero in the model" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 114, 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "[100000000.0,\n", 441 | " 127427498.57031322,\n", 442 | " 162377673.91887242,\n", 443 | " 206913808.11147901,\n", 444 | " 263665089.87303555,\n", 445 | " 335981828.62837881,\n", 446 | " 428133239.8719396,\n", 447 | " 545559478.11685145,\n", 448 | " 695192796.17755914,\n", 449 | " 885866790.41008317,\n", 450 | " 1128837891.6846883,\n", 451 | " 1438449888.2876658,\n", 452 | " 1832980710.8324375,\n", 453 | " 2335721469.0901213,\n", 454 | " 2976351441.6313128,\n", 455 | " 3792690190.7322536,\n", 456 | " 4832930238.5717525,\n", 457 | " 6158482110.6602545,\n", 458 | " 7847599703.5146227,\n", 459 | " 10000000000.0]" 460 | ] 461 | }, 462 | "execution_count": 114, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "l1_penalties" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 115, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "[18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 15, 15, 13, 12, 10, 6, 5, 3, 1, 1]" 482 | ] 483 | }, 484 | "execution_count": 115, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "non_zeros" 491 | ] 492 | }, 493 | { 494 | 
"cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "Out of this large range, we want to find the two ends of our desired narrow range of `l1_penalty`. At one end, we will have `l1_penalty` values that have too few non-zeros, and at the other end, we will have an `l1_penalty` that has too many non-zeros. \n", 498 | "\n", 499 | "* The largest `l1_penalty` that has more non-zeros than `max_nonzeros`\n", 500 | "* The smallest `l1_penalty` that has fewer non-zeros than `max_nonzeros'" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 128, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "Min l1 penalty: 2976351441.63\n", 515 | "Max l1 penalty: 3792690190.73\n" 516 | ] 517 | } 518 | ], 519 | "source": [ 520 | "l1_penalty_min = l1_penalties[14]\n", 521 | "l1_penalty_max = l1_penalties[15]\n", 522 | "print \"Min l1 penalty: \" ,l1_penalty_min\n", 523 | "print \"Max l1 penalty: \" ,l1_penalty_max" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "### Explore narrow range of values for l1 penalty to find a solution with the right number of non-zeros that has lowest RSS on the validation set " 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 129, 536 | "metadata": { 537 | "collapsed": true 538 | }, 539 | "outputs": [], 540 | "source": [ 541 | "l1_penalty_values = np.linspace(l1_penalty_min, l1_penalty_max, 20)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 133, 547 | "metadata": { 548 | "collapsed": true 549 | }, 550 | "outputs": [], 551 | "source": [ 552 | "all_rss = []\n", 553 | "all_models = []\n", 554 | "all_penalties = []\n", 555 | "for l1_penalty in l1_penalty_values:\n", 556 | " model = graphlab.linear_regression.create(training, target='price', features=all_features, validation_set=None, \n", 557 | " l1_penalty=l1_penalty, l2_penalty=0., verbose=False)\n", 558 | " predicted_price = model.predict(validation)\n", 559 | " residuals = predicted_price - validation['price']\n", 560 | " rss = (residuals*residuals).sum()\n", 561 | " all_rss.append(rss)\n", 562 | " all_models.append(model)\n", 563 | " all_penalties.append(l1_penalty)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "### Explore all models with number of non zeros equal to max non zeroes and it's corresponding RSS and l1 penalty " 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 135, 576 | "metadata": { 577 | "collapsed": false 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "# Loop to select those models from all models whose number of non zero coefficients are equal to max non zeros allowed that is 7\n", 582 | "selected_models = []\n", 583 | "selected_rss = []\n", 584 | "selected_penalties = []\n", 585 | "index = 0\n", 586 | "for model in all_models:\n", 587 | " if model['coefficients']['value'].nnz() == 7:\n", 588 | " selected_models.append(model)\n", 589 | " selected_rss.append(all_rss[index])\n", 590 | " selected_penalties.append(all_penalties[index])\n", 591 | " index += 1" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 138, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "# Select a model from selected models and a l1 penalty from selected penalties that has the lowest RSS\n", 603 | "lowest_rss, 
index = min((val, idx) for (idx, val) in enumerate(selected_rss))\n", 604 | "best_model = selected_models[index]\n", 605 | "best_l1_penalty = selected_penalties[index]" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "### Best l1 penalty" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 139, 618 | "metadata": { 619 | "collapsed": false 620 | }, 621 | "outputs": [ 622 | { 623 | "data": { 624 | "text/plain": [ 625 | "3448968612.1634364" 626 | ] 627 | }, 628 | "execution_count": 139, 629 | "metadata": {}, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "best_l1_penalty" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "### Explore coefficients in the best model" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 140, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [ 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "+------------------+-------+---------------+--------+\n", 656 | "| name | index | value | stderr |\n", 657 | "+------------------+-------+---------------+--------+\n", 658 | "| (intercept) | None | 222253.192544 | None |\n", 659 | "| bedrooms | None | 661.722717782 | None |\n", 660 | "| bedrooms_square | None | 0.0 | None |\n", 661 | "| bathrooms | None | 15873.9572593 | None |\n", 662 | "| sqft_living | None | 32.4102214513 | None |\n", 663 | "| sqft_living_sqrt | None | 690.114773313 | None |\n", 664 | "| sqft_lot | None | 0.0 | None |\n", 665 | "| sqft_lot_sqrt | None | 0.0 | None |\n", 666 | "| floors | None | 0.0 | None |\n", 667 | "| floors_square | None | 0.0 | None |\n", 668 | "| waterfront | None | 0.0 | None |\n", 669 | "| view | None | 0.0 | None |\n", 670 | "| condition | None | 0.0 | None |\n", 671 | "| grade | None | 2899.42026975 | None |\n", 672 | "| sqft_above | None | 30.0115753022 | None |\n", 673 | "| sqft_basement | None | 0.0 | None |\n", 674 | "| yr_built | None | 0.0 | None |\n", 675 | "| yr_renovated | None | 0.0 | None |\n", 676 | "+------------------+-------+---------------+--------+\n", 677 | "[18 rows x 4 columns]\n", 678 | "\n", 679 | "None\n", 680 | "Number of non zero coefficients: 7\n" 681 | ] 682 | } 683 | ], 684 | "source": [ 685 | "print best_model['coefficients'].print_rows(num_rows=18)\n", 686 | "print \"Number of non zero coefficients: \" ,best_model['coefficients']['value'].nnz()" 687 | ] 688 | } 689 | ], 690 | "metadata": { 691 | "kernelspec": { 692 | "display_name": "Python 2", 693 | "language": "python", 694 | "name": "python2" 695 | }, 696 | "language_info": { 697 | "codemirror_mode": { 698 | "name": "ipython", 699 | "version": 2 700 | }, 701 | "file_extension": ".py", 702 | "mimetype": "text/x-python", 703 | "name": "python", 704 | "nbconvert_exporter": "python", 705 | "pygments_lexer": "ipython2", 706 | "version": "2.7.13" 707 | } 708 | }, 709 | "nbformat": 4, 710 | "nbformat_minor": 0 711 | } 712 | -------------------------------------------------------------------------------- /multiple-linear-regression/multiple-regression-gradient-descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Multiple Regression using gradient descent on house sales data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire 
up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\\Users\\agraw\\AppData\\Local\\Temp\\graphlab_server_1504899765.log.0\n" 49 | ] 50 | }, 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "This non-commercial license of GraphLab Create for academic use is assigned to agrawal.pr@husky.neu.edu and will expire on March 12, 2018.\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "sales = graphlab.SFrame('kc_house_data.gl/')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Explore house sales data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 6, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/html": [ 80 | "
\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront
71293005202014-10-13 00:00:00+00:00221900.03.01.01180.0565010
\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
viewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelat
03711800195509817847.51123398
\n", 128 | "\n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
longsqft_living15sqft_lot15
-122.256775361340.05650.0
\n", 140 | "[1 rows x 21 columns]
\n", 141 | "
" 142 | ], 143 | "text/plain": [ 144 | "Columns:\n", 145 | "\tid\tstr\n", 146 | "\tdate\tdatetime\n", 147 | "\tprice\tfloat\n", 148 | "\tbedrooms\tfloat\n", 149 | "\tbathrooms\tfloat\n", 150 | "\tsqft_living\tfloat\n", 151 | "\tsqft_lot\tint\n", 152 | "\tfloors\tstr\n", 153 | "\twaterfront\tint\n", 154 | "\tview\tint\n", 155 | "\tcondition\tint\n", 156 | "\tgrade\tint\n", 157 | "\tsqft_above\tint\n", 158 | "\tsqft_basement\tint\n", 159 | "\tyr_built\tint\n", 160 | "\tyr_renovated\tint\n", 161 | "\tzipcode\tstr\n", 162 | "\tlat\tfloat\n", 163 | "\tlong\tfloat\n", 164 | "\tsqft_living15\tfloat\n", 165 | "\tsqft_lot15\tfloat\n", 166 | "\n", 167 | "Rows: 1\n", 168 | "\n", 169 | "Data:\n", 170 | "+------------+---------------------------+----------+----------+-----------+\n", 171 | "| id | date | price | bedrooms | bathrooms |\n", 172 | "+------------+---------------------------+----------+----------+-----------+\n", 173 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900.0 | 3.0 | 1.0 |\n", 174 | "+------------+---------------------------+----------+----------+-----------+\n", 175 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 176 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 177 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 178 | "| 1180.0 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 |\n", 179 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 180 | "+---------------+----------+--------------+---------+-------------+\n", 181 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 182 | "+---------------+----------+--------------+---------+-------------+\n", 183 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 184 | "+---------------+----------+--------------+---------+-------------+\n", 185 | "+---------------+---------------+-----+\n", 186 | "| long | sqft_living15 | ... |\n", 187 | "+---------------+---------------+-----+\n", 188 | "| -122.25677536 | 1340.0 | ... 
|\n", 189 | "+---------------+---------------+-----+\n", 190 | "[1 rows x 21 columns]" 191 | ] 192 | }, 193 | "execution_count": 6, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "sales[0:1]" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "### Convert SFrame to Numpy array" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 4, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "import numpy as np" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 1, 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "# function to convert sframe to numpy array (matrix)\n", 229 | "def get_numpy_data(data_sframe, features, output):\n", 230 | " \n", 231 | " data_sframe['constant'] = 1 # new constant column in the sframe signifying intercept\n", 232 | " \n", 233 | " features = ['constant'] + features # prepend constant to features list\n", 234 | " \n", 235 | " features_sframe = data_sframe[features] # new sframe selecting columns from data_sframe mentioned in features list\n", 236 | "\n", 237 | " feature_matrix = features_sframe.to_numpy() # convert sframe to numpy matrix\n", 238 | "\n", 239 | " output_sarray = data_sframe['price'] # an sarray consisting of the output column\n", 240 | "\n", 241 | " output_array = output_sarray.to_numpy() # converts sarray to a numpy array\n", 242 | "\n", 243 | " return(feature_matrix, output_array)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "### Test the function " 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 20, 256 | "metadata": { 257 | "collapsed": false 258 | }, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "[[ 1.00000000e+00 1.18000000e+03]]\n", 265 | "[ 221900.]\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price')\n", 271 | "print example_features[0:1] # the first row of the data\n", 272 | "print example_output[0:1] # and the corresponding output" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "### Predicting output given regression weights" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Suppose we had the weights [1.0, 1.0] and the features [1.0, 1180.0] and we wanted to compute the predicted output 1.0\\*1.0 + 1.0\\*1180.0 = 1181.0 this is the dot product between these two arrays. 
If they're numpy arrayws we can use np.dot() to compute this:" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 21, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "1181.0\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "my_weights = np.array([1., 1.]) # example weights\n", 306 | "my_features = example_features[0,] # first data point\n", 307 | "predicted_value = np.dot(my_features, my_weights)\n", 308 | "print predicted_value" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### Function to predict output given feature matrix and weight vector" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 22, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "def predict_output(feature_matrix, weights):\n", 327 | " predictions = np.dot(feature_matrix, weights)\n", 328 | " return(predictions)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Test the function" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 23, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "1181.0\n", 350 | "2571.0\n" 351 | ] 352 | } 353 | ], 354 | "source": [ 355 | "test_predictions = predict_output(example_features, my_weights)\n", 356 | "print test_predictions[0] # should be 1181.0\n", 357 | "print test_predictions[1] # should be 2571.0" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "### Computing the Derivative" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "RSS (error) for 1 data point is:\n", 372 | "\n", 373 | "(w[0]\\*[CONSTANT] + w[1]\\*[feature_1] + ... + w[i] \\*[feature_i] + ... + w[k]\\*[feature_k] - output)^2\n", 374 | "\n", 375 | "So the derivative with respect to weight w[i] by the chain rule is:\n", 376 | "\n", 377 | "2\\*(w[0]\\*[CONSTANT] + w[1]\\*[feature_1] + ... + w[i] \\*[feature_i] + ... + w[k]\\*[feature_k] - output)\\* [feature_i]\n", 378 | "\n", 379 | "In short:\n", 380 | "\n", 381 | "2\\*error\\*[feature_i]\n", 382 | "\n", 383 | "That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!" 
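, "\n", "\n", "(Tiny worked check with made-up numbers: if the errors are [1, 2] and the feature values are [3, 4], the derivative is 2\\*(1\\*3 + 2\\*4) = 22, which is exactly what the feature_derivative function defined below computes via np.dot.)"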
384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 27, 389 | "metadata": { 390 | "collapsed": true 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "def feature_derivative(errors, feature):\n", 395 | " \n", 396 | " # Assume that errors and feature are both numpy arrays of the same length (number of data points)\n", 397 | " dot_product = np.dot(errors, feature)\n", 398 | " \n", 399 | " # compute twice the dot product of these vectors as 'derivative' and return the value\n", 400 | " derivative = 2 * dot_product\n", 401 | "\n", 402 | " return(derivative)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Test function" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 29, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "-23345850022.0\n", 424 | "-23345850022.0\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price') \n", 430 | "my_weights = np.array([0., 0.]) # this makes all the predictions 0\n", 431 | "test_predictions = predict_output(example_features, my_weights) \n", 432 | "errors = test_predictions - example_output # prediction errors in this case is just the -example_output\n", 433 | "feature = example_features[:,0] # let's compute the derivative with respect to 'constant', the \":\" indicates \"all rows\"\n", 434 | "derivative = feature_derivative(errors, feature)\n", 435 | "print derivative\n", 436 | "print -np.sum(example_output)*2 # should be the same as derivative" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "### Gradient Descent" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "Here is a function that performs a gradient descent. Given a starting point we update the current weights by moving in the negative gradient direction. The gradient is the direction of *increase* and therefore the negative gradient is the direction of *decrease* and we're trying to *minimize* a cost function. \n", 451 | "\n", 452 | "The amount by which we move in the negative gradient *direction* is called the 'step size'. We stop when we are 'sufficiently close' to the optimum. We define this by requiring that the magnitude (length) of the gradient vector to be smaller than a fixed 'tolerance'." 
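, "\n", "\n", "(As an update rule, each iteration sets w[i] := w[i] - step_size\\*partial_i for every weight i, where partial_i = 2\\*sum over data points of error\\*[feature_i], and the stopping test compares sqrt(partial_0^2 + ... + partial_k^2) against the tolerance. This is what the regression_gradient_descent function below implements.)"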
453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 30, 458 | "metadata": { 459 | "collapsed": true 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "from math import sqrt # the magnitude/length of a vector [g[0], g[1], g[2]] is sqrt(g[0]^2 + g[1]^2 + g[2]^2)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 33, 469 | "metadata": { 470 | "collapsed": false 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "def regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance):\n", 475 | " converged = False \n", 476 | " weights = np.array(initial_weights) # converting to a numpy array\n", 477 | " \n", 478 | " while not converged:\n", 479 | " # compute the predictions based on feature_matrix and weights using your predict_output() function\n", 480 | " predictions = predict_output(feature_matrix, weights)\n", 481 | " \n", 482 | " # compute the errors as predictions - output\n", 483 | " errors = predictions - output\n", 484 | "\n", 485 | " gradient_sum_squares = 0 # initialize the gradient sum of squares\n", 486 | " \n", 487 | " # while we haven't reached the tolerance yet, update each feature's weight\n", 488 | " for i in range(len(weights)): # loop over each weight\n", 489 | " \n", 490 | " # compute the derivative for weight[i]:\n", 491 | " derivative_weight_i = feature_derivative(errors, feature_matrix[:, i])\n", 492 | "\n", 493 | " # add the squared value of the derivative to the gradient sum of squares (for assessing convergence)\n", 494 | " gradient_sum_squares = gradient_sum_squares + derivative_weight_i**2\n", 495 | "\n", 496 | " # subtract the step size times the derivative from the current weight\n", 497 | " weights[i] = weights[i] - (step_size * derivative_weight_i)\n", 498 | " \n", 499 | " # compute the square-root of the gradient sum of squares to get the gradient magnitude:\n", 500 | " gradient_magnitude = sqrt(gradient_sum_squares)\n", 501 | " if gradient_magnitude < tolerance:\n", 502 | " converged = True\n", 503 | " return(weights)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "markdown", 508 | "metadata": {}, 509 | "source": [ 510 | "Since the gradient is a sum over all the data points and involves a product of an error and a feature the gradient itself will be very large since the features are large (squarefeet) and the output is large (prices). So while you might expect \"tolerance\" to be small, small is only relative to the size of the features. \n", 511 | "\n", 512 | "For similar reasons the step size will be much smaller than you might expect but this is because the gradient has such large values." 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "# Running the Gradient Descent as Simple Regression (Simple model)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "First let's split the data into training and test data." 
527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 34, 532 | "metadata": { 533 | "collapsed": true 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "train_data,test_data = sales.random_split(.8,seed=0)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 36, 543 | "metadata": { 544 | "collapsed": true 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "simple_features = ['sqft_living']\n", 549 | "my_output= 'price'\n", 550 | "(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)\n", 551 | "initial_weights = np.array([-47000., 1.])\n", 552 | "step_size = 7e-12\n", 553 | "tolerance = 2.5e7" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 38, 559 | "metadata": { 560 | "collapsed": false 561 | }, 562 | "outputs": [ 563 | { 564 | "name": "stdout", 565 | "output_type": "stream", 566 | "text": [ 567 | "[-46999.88716555 281.91211912]\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "simple_weights = regression_gradient_descent(simple_feature_matrix, output,initial_weights, step_size, tolerance)\n", 573 | "print simple_weights" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "### Get predictions for test data using new weights (Simple model)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 39, 586 | "metadata": { 587 | "collapsed": false 588 | }, 589 | "outputs": [], 590 | "source": [ 591 | "(test_simple_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 41, 597 | "metadata": { 598 | "collapsed": false 599 | }, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "[ 356134.44317093 784640.86422788 435069.83652353 ..., 663418.65300782\n", 606 | " 604217.10799338 240550.4743332 ]\n" 607 | ] 608 | } 609 | ], 610 | "source": [ 611 | "simple_predictions = predict_output(test_simple_feature_matrix, simple_weights)\n", 612 | "print simple_predictions" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "**What is the predicted price for the 1st house in the TEST data set for model 1 (round to nearest dollar)?**" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 74, 625 | "metadata": { 626 | "collapsed": false 627 | }, 628 | "outputs": [ 629 | { 630 | "data": { 631 | "text/plain": [ 632 | "356134.44317092974" 633 | ] 634 | }, 635 | "execution_count": 74, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "simple_predictions[0]" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "### RSS function" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 44, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "def RSS (predicted_output, true_output):\n", 660 | " difference = true_output - predicted_output\n", 661 | " squared_difference = difference * difference\n", 662 | " sum_of_squared_difference = squared_difference.sum()\n", 663 | " return (sum_of_squared_difference)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 56, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "277000.0" 677 | ] 678 | }, 
679 | "execution_count": 56, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "output[5000]" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": {}, 691 | "source": [ 692 | "### RSS for Simple model" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 61, 698 | "metadata": { 699 | "collapsed": false 700 | }, 701 | "outputs": [ 702 | { 703 | "name": "stdout", 704 | "output_type": "stream", 705 | "text": [ 706 | "Residual sum of squares error for Simple model: 2.75400047593e+14\n" 707 | ] 708 | } 709 | ], 710 | "source": [ 711 | "rss = RSS(simple_predictions, test_output)\n", 712 | "print \"Residual sum of squares error for Simple model: \" +str(rss)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "# Running a multiple regression" 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "Now we will use more than one actual feature. Use the following code to produce the weights for a second model with the following parameters:" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 62, 732 | "metadata": { 733 | "collapsed": false 734 | }, 735 | "outputs": [], 736 | "source": [ 737 | "model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. \n", 738 | "my_output = 'price'\n", 739 | "(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)\n", 740 | "initial_weights = np.array([-100000., 1., 1.])\n", 741 | "step_size = 4e-12\n", 742 | "tolerance = 1e9" 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "metadata": {}, 748 | "source": [ 749 | "Use the above parameters to estimate the model weights. Record these values for your quiz." 
750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 64, 755 | "metadata": { 756 | "collapsed": false 757 | }, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "[ -9.99999688e+04 2.45072603e+02 6.52795277e+01]\n" 764 | ] 765 | } 766 | ], 767 | "source": [ 768 | "multiple_weights = regression_gradient_descent(feature_matrix, output,initial_weights, step_size, tolerance)\n", 769 | "print multiple_weights" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": {}, 775 | "source": [ 776 | "### Get predictions for test data using new weights (Multiple regression model)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 66, 782 | "metadata": { 783 | "collapsed": true 784 | }, 785 | "outputs": [], 786 | "source": [ 787 | "(test_multiple_feature_matrix, test_multiple_output) = get_numpy_data(test_data, model_features, my_output)" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 68, 793 | "metadata": { 794 | "collapsed": false 795 | }, 796 | "outputs": [ 797 | { 798 | "name": "stdout", 799 | "output_type": "stream", 800 | "text": [ 801 | "[ 366651.41203656 762662.39786164 386312.09499712 ..., 682087.39928241\n", 802 | " 585579.27865729 216559.20396617]\n" 803 | ] 804 | } 805 | ], 806 | "source": [ 807 | "multiple_predictions = predict_output(test_multiple_feature_matrix, multiple_weights)\n", 808 | "print multiple_predictions" 809 | ] 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "**What is the predicted price for the 1st house in the TEST data set for model 2?**" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 73, 821 | "metadata": { 822 | "collapsed": false 823 | }, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "text/plain": [ 828 | "366651.41203655908" 829 | ] 830 | }, 831 | "execution_count": 73, 832 | "metadata": {}, 833 | "output_type": "execute_result" 834 | } 835 | ], 836 | "source": [ 837 | "multiple_predictions[0]" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "**What is the actual price for the 1st house in the test data set?**" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 72, 850 | "metadata": { 851 | "collapsed": false 852 | }, 853 | "outputs": [ 854 | { 855 | "data": { 856 | "text/plain": [ 857 | "310000.0" 858 | ] 859 | }, 860 | "execution_count": 72, 861 | "metadata": {}, 862 | "output_type": "execute_result" 863 | } 864 | ], 865 | "source": [ 866 | "test_multiple_output[0]" 867 | ] 868 | }, 869 | { 870 | "cell_type": "markdown", 871 | "metadata": {}, 872 | "source": [ 873 | "# So the simple model is more closer to the actual price of the house 1" 874 | ] 875 | }, 876 | { 877 | "cell_type": "markdown", 878 | "metadata": {}, 879 | "source": [ 880 | "RSS for Multiple regression model" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": 79, 886 | "metadata": { 887 | "collapsed": false 888 | }, 889 | "outputs": [ 890 | { 891 | "name": "stdout", 892 | "output_type": "stream", 893 | "text": [ 894 | "Residual sum of squares error for Multiple regression model: 2.70263446465e+14\n" 895 | ] 896 | } 897 | ], 898 | "source": [ 899 | "rss_multiple = RSS(multiple_predictions, test_multiple_output)\n", 900 | "print \"Residual sum of squares error for Multiple regression model: \" +str(rss_multiple)" 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 
905 | "metadata": {}, 906 | "source": [ 907 | "# The multiple regression model has lower RSS than Simple model" 908 | ] 909 | } 910 | ], 911 | "metadata": { 912 | "kernelspec": { 913 | "display_name": "Python 2", 914 | "language": "python", 915 | "name": "python2" 916 | }, 917 | "language_info": { 918 | "codemirror_mode": { 919 | "name": "ipython", 920 | "version": 2 921 | }, 922 | "file_extension": ".py", 923 | "mimetype": "text/x-python", 924 | "name": "python", 925 | "nbconvert_exporter": "python", 926 | "pygments_lexer": "ipython2", 927 | "version": "2.7.13" 928 | } 929 | }, 930 | "nbformat": 4, 931 | "nbformat_minor": 0 932 | } 933 | -------------------------------------------------------------------------------- /nearest-neighbor-regression/nearest-neighbor-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Predicting house prices using k-nearest neighbors regression" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire up GraphLab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 187, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 188, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "sales = graphlab.SFrame('kc_house_data_small.gl/')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Explore the house sales data" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 189, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/html": [ 63 | "
\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfront
71293005202014-10-13 00:00:00+00:002219003.01.01180.056501.00
\n", 87 | "\n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | "
viewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelat
03711800195509817847.51123398
\n", 111 | "\n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | "
longsqft_living15sqft_lot15
-122.256775361340.05650.0
\n", 123 | "[1 rows x 21 columns]
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | "Columns:\n", 128 | "\tid\tstr\n", 129 | "\tdate\tdatetime\n", 130 | "\tprice\tint\n", 131 | "\tbedrooms\tfloat\n", 132 | "\tbathrooms\tfloat\n", 133 | "\tsqft_living\tfloat\n", 134 | "\tsqft_lot\tint\n", 135 | "\tfloors\tfloat\n", 136 | "\twaterfront\tint\n", 137 | "\tview\tint\n", 138 | "\tcondition\tint\n", 139 | "\tgrade\tint\n", 140 | "\tsqft_above\tint\n", 141 | "\tsqft_basement\tint\n", 142 | "\tyr_built\tint\n", 143 | "\tyr_renovated\tint\n", 144 | "\tzipcode\tstr\n", 145 | "\tlat\tfloat\n", 146 | "\tlong\tfloat\n", 147 | "\tsqft_living15\tfloat\n", 148 | "\tsqft_lot15\tfloat\n", 149 | "\n", 150 | "Rows: 1\n", 151 | "\n", 152 | "Data:\n", 153 | "+------------+---------------------------+--------+----------+-----------+\n", 154 | "| id | date | price | bedrooms | bathrooms |\n", 155 | "+------------+---------------------------+--------+----------+-----------+\n", 156 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900 | 3.0 | 1.0 |\n", 157 | "+------------+---------------------------+--------+----------+-----------+\n", 158 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 159 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 160 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 161 | "| 1180.0 | 5650 | 1.0 | 0 | 0 | 3 | 7 | 1180 |\n", 162 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 163 | "+---------------+----------+--------------+---------+-------------+\n", 164 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 165 | "+---------------+----------+--------------+---------+-------------+\n", 166 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 167 | "+---------------+----------+--------------+---------+-------------+\n", 168 | "+---------------+---------------+-----+\n", 169 | "| long | sqft_living15 | ... |\n", 170 | "+---------------+---------------+-----+\n", 171 | "| -122.25677536 | 1340.0 | ... 
|\n", 172 | "+---------------+---------------+-----+\n", 173 | "[1 rows x 21 columns]" 174 | ] 175 | }, 176 | "execution_count": 189, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "sales[0:1]" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### Import Numpy" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 190, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "import numpy as np" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Function to convert sframe to numpy matrix and array" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 191, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "def get_numpy_data(data_sframe, features, output): \n", 219 | " data_sframe['constant'] = 1 # new constant column in the sframe signifying intercept\n", 220 | " \n", 221 | " features = ['constant'] + features # prepend constant to features list\n", 222 | " \n", 223 | " features_sframe = data_sframe[features] # new sframe selecting columns from data_sframe mentioned in features list\n", 224 | "\n", 225 | " feature_matrix = features_sframe.to_numpy() # convert sframe to numpy matrix\n", 226 | "\n", 227 | " output_sarray = data_sframe['price'] # an sarray consisting of the output column\n", 228 | "\n", 229 | " output_array = output_sarray.to_numpy() # converts sarray to a numpy array\n", 230 | "\n", 231 | " return(feature_matrix, output_array)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Function to normalize features of the matrix" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 192, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "def normalize_features(features_matrix):\n", 250 | " norms = np.linalg.norm(features_matrix, axis=0)\n", 251 | " normalized_features = features_matrix / norms\n", 252 | " return(normalized_features, norms)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "### Split data into training, test, and validation sets" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 193, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "(train_and_validation, test) = sales.random_split(.8, seed=1) # initial train/test split\n", 271 | "(train, validation) = train_and_validation.random_split(.8, seed=1) # split training set into training and validation sets" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Feature list" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 194, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "feature_list = ['bedrooms', \n", 290 | " 'bathrooms', \n", 291 | " 'sqft_living', \n", 292 | " 'sqft_lot', \n", 293 | " 'floors',\n", 294 | " 'waterfront', \n", 295 | " 'view', \n", 296 | " 'condition', \n", 297 | " 'grade', \n", 298 | " 'sqft_above', \n", 299 | " 'sqft_basement',\n", 300 | " 'yr_built', \n", 301 | " 'yr_renovated', \n", 302 | " 'lat', \n", 303 | " 'long', \n", 304 | " 'sqft_living15', \n", 305 | " 'sqft_lot15']" 306 | ] 307 | }, 308 | { 309 | "cell_type": 
"markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Convert sframe datasets into numpy matrix and output numpy array" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 195, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "features_train, output_train = get_numpy_data(train, feature_list, 'price')\n", 324 | "features_test, output_test = get_numpy_data(test, feature_list, 'price')\n", 325 | "features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "In computing distances, it is crucial to normalize features. Otherwise, for example, the `sqft_living` feature (typically on the order of thousands) would exert a much larger influence on distance than the `bedrooms` feature (typically on the order of ones). We divide each column of the training feature matrix by its 2-norm, so that the transformed column has unit norm.\n", 333 | "\n", 334 | "The features in the test and validation sets must be divided by the same norms used to divide features of train set, so that the training, test, and validation sets are normalized consistently." 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### Normalize features" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 196, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "features_train, norms = normalize_features(features_train) # normalize training set features (columns)\n", 353 | "features_test = features_test / norms # normalize test set by training set norms\n", 354 | "features_valid = features_valid / norms # normalize validation set by training set norms" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "## Compute a single distance" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "* Lets compute distance between house no.1 from test set and house no.10 from train set\n", 369 | "* The features associated with both these houses seen in the 18-dimensional vector have values between 0 to 1" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 197, 375 | "metadata": { 376 | "collapsed": false 377 | }, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "[ 0.01345102 0.01551285 0.01807473 0.01759212 0.00160518 0.017059 0.\n", 384 | " 0.05102365 0.0116321 0.01564352 0.01362084 0.02481682 0.01350306\n", 385 | " 0. 0.01345386 -0.01346927 0.01375926 0.0016225 ]\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "house_1 = features_test[0]\n", 391 | "print house_1" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 198, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "[ 0.01345102 0.01163464 0.00602491 0.0083488 0.00050756 0.01279425\n", 406 | " 0. 0. 0.01938684 0.01390535 0.0096309 0.\n", 407 | " 0.01302544 0. 
0.01346821 -0.01346254 0.01195898 0.00156612]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "house_2 = features_train[9]\n", 413 | "print house_2" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 199, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [ 423 | { 424 | "name": "stdout", 425 | "output_type": "stream", 426 | "text": [ 427 | "0.0597235937167\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "distance = np.sqrt(np.sum((house_1-house_2)**2))\n", 433 | "print distance" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "## Compute multiple distances" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "To do nearest neighbor regression, we need to compute the distance between our query house and *all* houses in the training set. " 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "### Function to calculate euclidean distance" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 200, 460 | "metadata": { 461 | "collapsed": false 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "def euclidean_distance(query_house, houses):\n", 466 | " distance_list = []\n", 467 | " for house in houses:\n", 468 | " distance = np.sqrt(np.sum((query_house-house)**2))\n", 469 | " distance_list.append(distance)\n", 470 | " return(distance_list)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "* Lets compute distance for query house no.1 from test set against first 10 houses of the train set" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 201, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "[0.0602747091729555, 0.085468811488270832, 0.061499464371202843, 0.053402739788200579, 0.058444840639381393, 0.059879215101840008, 0.054631404972615261, 0.055431083241597921, 0.052383627840972731, 0.059723593716661257]\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "distance_list = euclidean_distance(features_test[0], features_train[0:10])\n", 497 | "print distance_list" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "Among the first 10 training houses, the closest house to the test query house is:" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 222, 510 | "metadata": { 511 | "collapsed": false 512 | }, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "Train house number: 8\n", 519 | "Distance of test query house: 0.052383627841\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "distance, train_house_number = min((val, idx) for (idx, val) in enumerate(distance_list))\n", 525 | "print \"Train house number: \" ,train_house_number\n", 526 | "print \"Distance of test query house: \" ,distance" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "## Perform 1-nearest neighbor regression\n", 534 | "\n", 535 | "Looping to calculate distance is not efficient in python so let us use two single line expressions instead of using the previously defined euclidean distance function to calculate distance of 1 test house from all train houses.\n", 536 | "\n", 537 | "First step is to calculate the difference between 
the features of each training house and the query (test) house" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 203, 543 | "metadata": { 544 | "collapsed": false 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "diff = features_train[0:len(features_train)] - features_test[0]" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "Second step is to take these feature-by-feature differences in `diff`, square each, take their sum and finally perform square root" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 204, 561 | "metadata": { 562 | "collapsed": false 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "distances = np.sqrt(np.sum(diff**2, axis=1))" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "### Function that computes the distances from a query house to all training houses\n", 574 | "We will use the previous two single line expressions and modify them so that they can be used in this function to calculate the distance" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 205, 580 | "metadata": { 581 | "collapsed": false 582 | }, 583 | "outputs": [], 584 | "source": [ 585 | "def compute_distances(features_instances, features_query):\n", 586 | " diff = features_instances[0:len(features_instances)] - features_query\n", 587 | " distances = np.sqrt(np.sum(diff**2, axis=1))\n", 588 | " return(distances)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "### Compute 1 nearest neighbor regression for a single query house" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 206, 601 | "metadata": { 602 | "collapsed": false 603 | }, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "[ 0.01954476 0.06861035 0.02165079 ..., 0.02433478 0.02622734\n", 610 | " 0.02637942]\n" 611 | ] 612 | } 613 | ], 614 | "source": [ 615 | "query_house = features_test[2]\n", 616 | "distances = compute_distances(features_train, query_house)\n", 617 | "print distances" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": { 623 | "collapsed": true 624 | }, 625 | "source": [ 626 | "Closest house to the query house" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 223, 632 | "metadata": { 633 | "collapsed": false 634 | }, 635 | "outputs": [ 636 | { 637 | "name": "stdout", 638 | "output_type": "stream", 639 | "text": [ 640 | "Train house number: 382\n", 641 | "Distance of test query house: 0.00286049526751\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "distance, train_house_number = min((val, idx) for (idx, val) in enumerate(distances))\n", 647 | "print \"Train house number: \" ,train_house_number\n", 648 | "print \"Distance of test query house: \" ,distance" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "### Predicted value of the query house" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 226, 661 | "metadata": { 662 | "collapsed": false 663 | }, 664 | "outputs": [ 665 | { 666 | "name": "stdout", 667 | "output_type": "stream", 668 | "text": [ 669 | "249000\n" 670 | ] 671 | } 672 | ], 673 | "source": [ 674 | "predicted_value = output_train[382]\n", 675 | "print predicted_value" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | 
"source": [ 682 | "## Perform k-nearest neighbor regression" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": {}, 688 | "source": [ 689 | "### Function to calculate k-nearest neighbor of a single query house" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 227, 695 | "metadata": { 696 | "collapsed": true 697 | }, 698 | "outputs": [], 699 | "source": [ 700 | "def compute_k_distances(k, features_matrix, feature_vector):\n", 701 | " all_distances = compute_distances(features_matrix, feature_vector)\n", 702 | " house_numbers = np.argsort(all_distances) # sorts distances in ascending order and inserts their indexes in the array\n", 703 | " return house_numbers[0:k]" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### Compute k-nearest neighbors of a single query house" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 228, 716 | "metadata": { 717 | "collapsed": false 718 | }, 719 | "outputs": [ 720 | { 721 | "name": "stdout", 722 | "output_type": "stream", 723 | "text": [ 724 | "[ 382 1149 4087 3142]\n" 725 | ] 726 | } 727 | ], 728 | "source": [ 729 | "query_house = features_test[2]\n", 730 | "house_numbers = compute_k_distances(4, features_train, query_house)\n", 731 | "print house_numbers # four closest houses from the training set to the house present in the test set" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": {}, 737 | "source": [ 738 | "## Make a single prediction by averaging k nearest neighbor outputs" 739 | ] 740 | }, 741 | { 742 | "cell_type": "markdown", 743 | "metadata": { 744 | "collapsed": true 745 | }, 746 | "source": [ 747 | "### Function to predict price of a house using k-nearest neighbors" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 211, 753 | "metadata": { 754 | "collapsed": true 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "def predict_price(k, features_matrix, feature_vector, prices):\n", 759 | " all_distances = compute_distances(features_matrix, feature_vector)\n", 760 | " house_numbers = np.argsort(all_distances) # sorts distances in ascending order and inserts their indexes in the array\n", 761 | " k_house_numbers = house_numbers[0:k] # closest k houses to the query house\n", 762 | " total_price = 0\n", 763 | " for house_number in k_house_numbers:\n", 764 | " total_price += prices[house_number] # sum prices of all the k closest houses\n", 765 | " predicted_price = total_price / k # average out the total price\n", 766 | " return(predicted_price)" 767 | ] 768 | }, 769 | { 770 | "cell_type": "markdown", 771 | "metadata": {}, 772 | "source": [ 773 | "### Compute price for a single query house" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 212, 779 | "metadata": { 780 | "collapsed": false 781 | }, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "413987\n" 788 | ] 789 | } 790 | ], 791 | "source": [ 792 | "predicted_price = predict_price(4, features_train, features_test[2], output_train)\n", 793 | "print predicted_price" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "On comparing price obtained using 4-nearest neighbors to the price obtained using 1-nearest neighbor computed earlier of house number 3 of test set, it is clear that the 4-nearest neighbors gives us a much reasonable price estimate." 
801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "### Function to predict prices of multiple houses using k-nearest neighbors" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 229, 813 | "metadata": { 814 | "collapsed": false 815 | }, 816 | "outputs": [], 817 | "source": [ 818 | "def predict_prices(k, features_matrix, features_matrix_query, prices):\n", 819 | " predicted_prices = []\n", 820 | " for i in range(0, len(features_matrix_query)):\n", 821 | " features_array = features_matrix_query[i]\n", 822 | " predicted_price = predict_price(k, features_matrix, features_array, prices)\n", 823 | " predicted_prices.append(predicted_price)\n", 824 | " return(predicted_prices)" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "### Computer prices for first 10 houses of the test set with k = 10" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": 230, 837 | "metadata": { 838 | "collapsed": false 839 | }, 840 | "outputs": [ 841 | { 842 | "name": "stdout", 843 | "output_type": "stream", 844 | "text": [ 845 | "[881300, 431860, 460595, 430200, 766750, 667420, 350032, 512800, 484000, 457235]\n" 846 | ] 847 | } 848 | ], 849 | "source": [ 850 | "predicted_prices = predict_prices(10, features_train, features_test[0:10], output_train)\n", 851 | "print predicted_prices" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "### House with the lowest predicted price from the query set" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 231, 864 | "metadata": { 865 | "collapsed": false 866 | }, 867 | "outputs": [ 868 | { 869 | "name": "stdout", 870 | "output_type": "stream", 871 | "text": [ 872 | "The house number is: 6\n", 873 | "The predicted house price is: 350032\n" 874 | ] 875 | } 876 | ], 877 | "source": [ 878 | "house_number = predicted_prices.index(min(predicted_prices))\n", 879 | "print \"The house number is: \" ,house_number\n", 880 | "print \"The predicted house price is: \" , min(predicted_prices)" 881 | ] 882 | }, 883 | { 884 | "cell_type": "markdown", 885 | "metadata": {}, 886 | "source": [ 887 | "## Choosing the best value of k using a validation set" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 232, 893 | "metadata": { 894 | "collapsed": false 895 | }, 896 | "outputs": [], 897 | "source": [ 898 | "rss_all = []\n", 899 | "for k in range(1, 16):\n", 900 | " predicted_prices = predict_prices(k, features_train, features_valid, output_train)\n", 901 | " residual = predicted_prices - output_valid\n", 902 | " rss = (residual*residual).sum()\n", 903 | " rss_all.append(rss)" 904 | ] 905 | }, 906 | { 907 | "cell_type": "markdown", 908 | "metadata": {}, 909 | "source": [ 910 | "### Best value of k that reported lowest RSS" 911 | ] 912 | }, 913 | { 914 | "cell_type": "code", 915 | "execution_count": 233, 916 | "metadata": { 917 | "collapsed": false 918 | }, 919 | "outputs": [ 920 | { 921 | "name": "stdout", 922 | "output_type": "stream", 923 | "text": [ 924 | "7\n" 925 | ] 926 | } 927 | ], 928 | "source": [ 929 | "k = rss_all.index(min(rss_all))\n", 930 | "print k" 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": { 936 | "collapsed": false 937 | }, 938 | "source": [ 939 | "Visualize the performance as a function of `k`, plot the RSS on the VALIDATION set for each considered `k` value:" 940 | ] 941 | }, 942 | { 943 | 
"cell_type": "code", 944 | "execution_count": 234, 945 | "metadata": { 946 | "collapsed": false 947 | }, 948 | "outputs": [ 949 | { 950 | "data": { 951 | "text/plain": [ 952 | "[]" 953 | ] 954 | }, 955 | "execution_count": 234, 956 | "metadata": {}, 957 | "output_type": "execute_result" 958 | }, 959 | { 960 | "data": { 961 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEGCAYAAACJnEVTAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XuYVNWZ7/Hvr0W8gHfUcNEG2vsdNKgQTRnHiCZq4mQy\nKM4E4y0XiXMyTtQkpJun84w642QSL2fymBgZJyi5mRNz4iSQoz0RFcWAgApya1tuY2JEBBIzSL/n\nj10NRXd1V3VT1bu66/d5nnq69t5r7/327d2r1lp7bUUEZmZWHWrSDsDMzHqPk76ZWRVx0jczqyJO\n+mZmVcRJ38ysijjpm5lVkYpK+pIekPSGpMVFlD1H0m8lbZN0eZ7t+0laI+nu8kRrZtb3VFTSBx4E\nLiyybAvwKWBmJ9sbgf8qRVBmZv1FRSX9iJgLbMxdJ2m0pP+UNF/Sf0k6Jlv29Yh4Cehwd5mk04HD\ngNm9EbeZWV9RUUm/E/cDN0bE+4F/AP6tq8KSBNwF3Ayo/OGZmfUdA9IOoCuSBgHjgR9lkznAngV2\n+xzwi4hYn93Fid/MLKuikz7JJ5GNETG2G/ucDXxA0ueA/YA9JW2OiC+XJUIzsz6kYPNOoRE1ko6V\n9IykdyV9sd22iZKWSVou6ZYiY1L2RURsBpolfSLnmKd0sg/Zfa6KiJERMZqkiechJ3wzs0QxbfqF\nRtT8AZgK/HPuSkk1wL3ZfU8ErpB0XFcnkvQw8AxwjKTXJV0NTAaukfSipJeAS7Nlz5C0BvgE8G1J\nS4r4XszMqpqKmVpZUi3w84jIV8tuK1MPbI6Ib2SXzwLqI+Ki7PKtQETEnSWJ3MzMuq2co3eGA2ty\nltdm15mZWUr6wpBNMzMrkXKO3lkHHJmzPCK7Li9JfoSXmVk3RUS3hqUXW9PfMaKmiHJt5gNHSaqV\nNBCYBDzW1c4RUdGv+vr61GNwnI7TcTrOtldPFKzpZ0fUZIBDJL0O1AMDkxwd90s6HHiBZEx8q6Sb\ngBMiYoukG0mmQqgBHoiIpT2K0szMSqJg0o+IKwtsfwM4opNtvwSO7VloZmZWau7I7YZMJpN2CEVx\nnKXlOEvLcaarqHH6vUFSVEosZmZ9gSSiTB25ZmbWDzjpm5lVESd9M7Mq4qRvZlZFnPTNzKqIk76Z\nWRVx0jczqyJO+mZmVcRJ38ysijjpm5lVESd9M7Mq4qRvZlZFnPTNzKpIOR+XmJrm5hamTZvBunWt\nDB9eQ2PjFEaNqk07LDOz1BWcWlnSA8BHgTci4pROytwNXARsBa6OiIXZ9a8Bm4BWYFtEjOviPCWZ\nWrm5uYULLriHVaumA4OArdTV1TNnzlQnfjPrV8o1tfKDwIVdnPQioC4ijgZuAP4tZ3MrkImIMV0l\n/FKaNm1GTsIHGMSqVdOZNm1Gb5zezKyiFUz6ETEX2NhFkcuAh7JlnwMOyD43F5IHpfdqv8G6da3s\nTPhtBrF+fWtvhmFmVpFKkZCHA2tyltdl1wEEMEfSfEnXleBchYMZXkPSypRrK8OGuc/azKzcHbkT\nImKDpENJkv/S7CeHvBoaGna8z2QyPXpGZWPjFObNq+/Qpt/YOLXbxzIzqyRNTU00NTXt1jGKekau\npFrg5/k6ciV9G3gyIn6QXV4GfDAi3mhXrh7YHBHf6OQcJXtGbnNzC1deOYPVq1u54AKP3jGz/qkn\nHbnFJv2RJEn/5DzbLgY+HxEfkXQW8M2IOEvSvkBNRGyRNAiYDUyPiNmdnKOkD0ZfvBg++UlYtqxk\nhzQzqyg9SfoFm3ckPQxkgEMkvQ7UAwOBiIj7I+JxSRdLWkl2yGZ218OBn0qK7Hlmdpbwy+H44+H1\n12HLFhg8uLfOamZW2Yqq6feGUtf0Ac44A+6+G8aPL+lhzcwqQrnG6fdZY8bAwoVpR2FmVjmc9M3M\nqki/T/oLFqQdhZlZ5ejXbfpbt8KQIbBpEwwcWNJDm5mlzm367QwaBCNHwiuvpB2JmVll6NdJH2Ds\nWLfrm5m16fdJ3525ZmY7OembmVWRft2RC/DWW0m7/ttvQ02/v8SZWTVxR24eBx8MBx0Eq1alHYmZ\nWfr6fdIHN/GYmbVx0jczqyJO+mZmVaSqkn6F9FmbmaWmKpL+iBGwfTts2JB2JGZm6aqKpC+5icfM\nDIpI+pIekPSGpMVdlLlb0gpJL0o6LWf9REnLJC2XdEupgu4JJ30zs+Jq+g8CF3a2UdJFQF1EHA3c\nAHw7u74GuDe774nAFZKO2+2Ie8jTLJuZFZH0I2IusLGLIpcBD2XLPgccIOlwYBywIiJaImIbMCtb\nNhWu6ZuZlaZNfziwJmd5bXZdZ+tTcfTR8OabsLGry5eZWT83oAzH7NY8ELkaGhp2vM9kMmQymRKE\nk9hjDzjlFHjxRTjvvJId1sys1zQ1NdHU1LRbxyhqwjVJtcDPI+KUPNu+DTwZET/ILi8DPgiMAhoi\nYmJ2/a1ARMSdnZyjLBOu5brxRhg9Gr74xbKexsysV5RzwjXReQ3+MeBvswGcBbwdEW8A84GjJNVK\nGghMypZNjdv1zazaFWzekfQwkAEOkfQ6UA8MJKm13x8Rj0u6WNJKYCtwNcnG7ZJuBGaTXFweiIil\nZfo+ijJmDPzrv6YZgZlZuvr9fPq5/vxnOPDAZI79ffYp66nMzMrO8+kXsNdecOyxsGRJ2pGYmaWj\nqpI+uF3fzKqbk76ZWRVx0jczqyJV1ZEL8M47MHQobNoEA8pxa5qZWS9xR24R9t8fhg2DV19NOxIz\ns95XdUkf3MRjZtWrKpP+2LGeZtnMqlNVJn3X9M2sWlVdRy7A736X3KT11lvJoxTNzPoid+QW6bDD\nYN994bXX0o7EzKx3VWXSBzfxmFl1ctI3M6siTvpmZlXESd/MrIpUbdIfORL++MdkJI+ZWbUoKulL\nmihpmaTlkm7Js/1ASY9KWiRpnqQTcra9ll2/UNLzpQx+d0hw2mmu7ZtZdSmY9CXVAPcCFwInAldI\nOq5dsS8DCyPiVOBTwN0521qBTESMiYhxpQm7NNzEY2bVppia/jhgRUS0RMQ2YBZwWbsyJwBPAETE\nq8BISYdmt6nI8/Q6J30zqzbFJOPhwJqc5bXZdbkWAZcDSBoHHAmMyG4LYI6k+ZKu271wS8tJ38yq\nTalmlL8D+JakBcAS
YCGwPbttQkRsyNb850haGhFz8x2koaFhx/tMJkMmkylRePkddxysWwebN8N+\n+5X1VGZmu62pqYmmpqbdOkbBuXcknQU0RMTE7PKtQETEnV3s0wycHBFb2q2vBzZHxDfy7NNrc+/k\nOvNM+Jd/gQ98oNdPbWa2W8o198584ChJtZIGApOAx9qd+ABJe2bfXwf8V0RskbSvpMHZ9YOADwMv\ndSfAcnMTj5lVk4LNOxGxXdKNwGySi8QDEbFU0g3J5rgfOB74d0mtwMvANdndDwd+Kimy55oZEbPL\n8Y301JgxMG9e2lGYmfWOqpxaOdfzz8P118OLL/b6qc3MdktPmneqPun/6U9w8MHw9tuw1169fnoz\nsx7zfPo9sM8+UFcHL7+cdiRmZuVX9Ukf3JlrZtXDSR8nfTOrHk76OOmbWfWo+o5cgI0b4cgjk87c\nPfZIJQQzs25zR24PHXQQDBkCK1emHYmZWXk56We5icfMqoGTfpaTvplVAyf9LCd9M6sGTvpZY8cm\nSb9C+rXNzMrCST9r6FCoqUnm1zcz66+c9LMkN/GYWf/npJ9jzBhYsCDtKMzMysdJP4dr+mbW3xWV\n9CVNlLRM0nJJt+TZfqCkRyUtkjRP0gnF7ltJnPTNrL8r5hm5NcBy4HxgPcnjEydFxLKcMv9E8uzb\nRknHAvdFxF8Us2/OMVKbhqFNaysceCA0N8Mhh6QaiplZQeWahmEcsCIiWiJiGzALuKxdmROAJwAi\n4lVgpKRDi9y3YtTUwKmn+ilaZtZ/FZP0hwNrcpbXZtflWgRcDiBpHHAkMKLIfSuKm3jMrD8rVUfu\nHcBBkhYAnwcWAttLdOxe5aRvZv3ZgCLKrCOpubcZkV23Q0RsBj7dtiypGVgN7Fto31wNDQ073mcy\nGTKZTBHhldaYMfDP/9zrpzUzK6ipqYmmpqbdOkYxHbl7AK+SdMZuAJ4HroiIpTllDgD+GBHbJF0H\nTIiIKcXsm3OM1DtyAf7nf5LO3DffhH33TTsaM7POlaUjNyK2AzcCs4GXgVkRsVTSDZKuzxY7HnhJ\n0lLgQuCmrvbtToC9beBAOO44WLw47UjMzErPT87K49pr4fTT4bOfTTsSM7PO+clZJeLOXDPrr5z0\n83DSN7P+ys07eWzZAocfnjwofc89047GzCw/N++UyODBcMQRsKzDZBFmZn2bk34nPM2ymfVHTvqd\ncLu+mfVHTvqdcNI3s/7IHbmdePNNqKuDjRuT2TfNzCqNO3JLaMgQ2H//ZG59M7P+wkm/C27iMbP+\nxkm/C076ZtbfOOl3wUnfzPobJ/0uOOmbWX/jpN+FI49M5tf/7/9OOxIzs9Jw0u+C5Nq+mfUvTvoF\nOOmbWX9SVNKXNFHSMknLJd2SZ/v+kh6T9KKkJZKm5Gx7TdIiSQslPV/C2HuFk76Z9SfFPCO3BlhO\n8pzb9cB8YFJELMspcxuwf0TcJmkIyXNxD4+I9yStBk6PiI0FzlNRd+S2WboULrkEVq5MOxIzs12V\n647cccCKiGiJiG3ALOCydmUC2C/7fj/gDxHxXltcRZ6nIh1zTNKRu2lT2pGYme2+YpLxcGBNzvLa\n7Lpc9wInSFoPLCL7YPSsAOZImi/put0JNg177AEnnwyLFqUdiZnZ7itVDfxCYGFEDAPGAPdJGpzd\nNiEixgIXA5+X9IESnbPXeG59M+svBhRRZh1wZM7yiOy6XFcDtwNExCpJzcBxwAsRsSG7/veSfkrS\nXDQ334kaGhp2vM9kMmQymaK+iXIbMwbm5o3YzKz3NDU10dTUtFvHKKYjdw+SjtnzgQ3A88AVEbE0\np8x9wO8iYrqkw4EXgFOBd4GaiNgiaRAwG5geEbPznKciO3IBXngBPv1pWLw47UjMzHbqSUduwZp+\nRGyXdCNJwq4BHoiIpZJuSDbH/cDXgRmS2tLilyLiLUmjgJ9Kiuy5ZuZL+JXupJNgxQp4913Ye++0\nozEz6zk/RKVIp5wC3/senHFG2pGYmSV6UtN30i9Cc3ML558/g4EDWznjjBoaG6cwalRt2mGZWZVz\n0i+D5uYWLrjgHlatmg4MArZSV1fPnDlTnfjNLFV+XGIZTJs2IyfhAwxi1arpTJs2I8WozMx6xkm/\ngHXrWtmZ8NsMYv361jTCMTPbLU76BQwfXgNsbbd2K8OG+UdnZn2PM1cBjY1TqKurZ2fiT9r0Gxun\npBaTmVlPuSO3CM3NLUybNoN161p54YUaHnpoCh//uDtxzSxdHr3TC26/HZqb4f77047EzKqdk34v\n2LABTjgB1qyBwYMLlzczKxcP2ewFQ4fCBz8IP/hB2pGYmXWfk34PXHcdfOc7aUdhZtZ9Tvo9cOGF\nsHYtLFmSdiRmZt3jpN8DAwYkUy1/97tpR2Jm1j3uyO2h115LZtxcu9bTLZtZOtyR24tGjoSxY+HR\nR9OOxMyseE76u+G669zEY2Z9S1FJX9JEScskLZd0S57t+0t6TNKLkpZImlLsvn3ZpZfCSy/BypVp\nR2JmVpxinpFbAywneUbuemA+MCkiluWUuQ3YPyJukzSE5Jm6hwOthfbNOUafatNvc/PNsOeeyZ26\nZma9qVxt+uOAFRHREhHbgFnAZe3KBLBf9v1+wB8i4r0i9+3TrrkGZsyAbdvSjsTMrLBikv5wYE3O\n8trsulz3AidIWg8sAm7qxr592vHHw1FHwS9+kXYkZmaFDSjRcS4EFkbEhyTVAXMkndLdgzQ0NOx4\nn8lkyGQyJQqvvK69NunQ/djH0o7EzPqzpqYmmpqadusYxbTpnwU0RMTE7PKtQETEnTll/i9we0Q8\nnV3+f8AtJBeVLvfNOUafbNMH2LoVjjgCFi+GESPSjsbMqkW52vTnA0dJqpU0EJgEPNauTAvwF9kg\nDgeOAVYXuW+fN2gQTJoEDz6YdiRmZl0r6o5cSROBb5FcJB6IiDsk3UBSa79f0lBgBjA0u8vtEfFI\nZ/t2co4+W9MHWLAALr8cVq+GGt/9YGa9wPPpp+z005Ohmx/+cNqRmFk18DQMKbv2Wk+5bGaVzTX9\nEtq0CWprYflyOOywtKMxs/7ONf2UHXBAMmzzP/4j7UjMzPJz0i+xtiaePv6hxcz6KSf9EpswASR4\n+um0IzEz68hJv8Qkd+iaWeVyR24Z/P73cPTRydO1Djww7WjMrL9yR26FOPTQZKz+ww+nHYmZ2a6c\n9MvET9Uys0rkpF8m558Pb70Fv/1t2pGYme3kpF8mNTXJA1Zc2zezSuKO3DJauxZOOQXWrElm4jQz\nKyV35FaYESNg/Hj40Y/SjsTMLOGkX2bu0DWzSuKkX2YXXwyrVsErr6QdiZmZk37Z7bknTJkCDzyQ\ndiRmZt17ctY32fn0qzvbbb8ZmAwEsCdwPDAkIt6W9BqwCWgFtkXEuE7O0e86ctusXJm07a9ZA3vt\nlXY0ZtZflOXJWZJqgOXA+cB6kufeToqIZZ2U/yjwdxHR9szc1cDpEbGxwHn6b
dIH+NCH4DOfgU9+\nMu1IzKy/KNfonXHAiohoiYhtwCzgsi7KXwE8khtXkefp1667zpOwmVn6iknGw4E1Octrs+s6kLQP\nMBH4Sc7qAOZImi/pup4G2td9/OOwcCE0N6cdiZlVswElPt4lwNyIeDtn3YSI2CDpUJLkvzQi5ubb\nuaGhYcf7TCZDJpMpcXjp2XtvuOqqpEP3619POxoz64uamppoamrarWMU06Z/FtAQEROzy7cC0b4z\nN7vtUeCHETGrk2PVA5sj4ht5tvXrNn2AJUtg4kRoaYEBpb7cmlnVKVeb/nzgKEm1kgYCk4DH8pz8\nAOCDwM9y1u0raXD2/SDgw8BL3QmwPzn5ZDjiCPjlL9OOxMyqVcGkHxHbgRuB2cDLwKyIWCrpBknX\n5xT9GPCriPhTzrrDgbmSFgLzgJ9HxOzShd/3uEPXzNLkCdd62ZYtSW3/5Zdh2LC0ozGzvswTrvUB\ngwfDX/0VzJiRdiRmVo1c00/B88/DFVfAihXJvPtmZj3hmn4f8f73JzX+J59MOxIzqzZO+imQ4Npr\nPeWymfU+N++kZOHCFs48cwbjxrUycmQNjY1TGDWqNu2wzKwPKcuEa72lmpJ+c3MLF1xwD6tWTQcG\nAVupq6tnzpypTvxmVjS36fcR06bNyEn4AINYtWo6U6bMYPVqaG1NMzoz6888GUAK1q1rZWfCbzOI\nl15q5dxzYdMmOOmk5A7ek09OHq5+8slw8MFdH7e5uYVp02awbl0rw4e7ycjMOnLST8Hw4TXAVnZN\n/Fu56KIavv99eOsteOmlZK6exYvhkUeS5f3263ghOP745MEs+ZqM5s1zk5GZ7cpt+inoSZt+RDJR\nW9uFYMmS5LV6NYwaBVu3Tuf112+m/YVk8uS7+P7363vhuzKz3taTNn3X9FMwalQtc+ZMZdq0u1i/\nvpVhw2pobOy6Ri7ByJHJ65JLdq7/859h2TK44or8TUbr17uDwMx2ctJPyahRtSWpge+1F5x6Kowd\nW8PSpR2bjF5/vYbVq2H06N0+lZn1Ax690080Nk6hrq6epK8AYCsjR9Zz0UVTGDcOrrkmaQoys641\nN7dw1VXTOe+8eq66ajrNzS0Vdbzd5Tb9fqRt9M7OJqNk9M7GjfDNb8J998Gll8JXvgJ1dWlHa1Z5\nSn0PTbnvyfHNWdaljRvhW9+Ce+9N+gW+8hU46qi0ozKrHFddNZ2ZMzsOiJg48S5uu62eLVvo1mvl\nyuls3ly+ARbuyLUuHXQQNDTA3/1dkvzPOgs+8hH46lfh6KPTjs4sHa2tyWCI556DJ5/MPyBi7txW\nvvrVZKLEfK/3va/juv32g89+tpXnnqusARZFJX1JE4FvkvQBPND++biSbgYmAwHsCRwPDImItwvt\na73vwAOhvh5uugnuvhvGj4eLLkqS/zHHpB2dWXmtX59Mb/7cc8nXF16AQw+FceNg6NAa1q/vOCDi\nssuSe2i666ijanjuuY7HGzYsxe7UiOjyRZKsVwK1JAn9ReC4Lsp/FPh1d/dNQrE0vP12RGNjxJAh\nEVddFbFsWdoRmRVn9erXYvLkhshkvhaTJzfE6tWv7bJ98+aIJ5+MuPPOiMsvjxgxIuKQQyIuuiii\nvj7i8ccjfv/7XY9XV/f3AVsiuTtmS9TV/X2H43YnvlIer71s3iyYx3NfBdv0JZ0F1EfERdnlW7Mn\nyltjlzQTeCIiHujOvm7TT98778A99ySdvhdemNT899rLUztYZcrXSTpiRD2f+cxUVq+u5fnnkxFr\np56a1OLPPDP5Onp0ct9LV8fNNyBid+Is5fFylaUjV9JfAhdGxPXZ5auAcRHxhTxl9wHWAnWRNO10\nZ18n/QrxzjtJZ+9dd7Wwbds9bNni2UCrTV+Yx+kv/3I6jz7asZN01Ki7uPnmes48M5mqZODAtCIs\nv0royL0EmBsRb/dk54aGhh3vM5kMmUymNFFZt+y/P3z5y7Bo0Qx++MOOs4HedttdzJrlqR0qRakT\ndCXO47R9O7z8Mjz9NDzzTPJqacnf6TpyZCuf+1waUZZfU1MTTU1Nu3eQQu0/wFnAL3OWbwVu6aTs\no8CkHu5bkjYuK51M5mvZdshdX9LX4v3vj/jCFyJmzYpoaYlobU072upU6jbj1taIv/7rhpzjxY7j\nTp7cUOLoO7dpU8Ts2RENDREXXBCx//4Rxx4bcfXVEd/5TsTLL0dceWX6caaNHrTpF1PTnw8cJakW\n2ABMAq5oX0jSAcAHSUbxdGtfq0ydzQb6yU/W8PnPJ7WtRx6BqVOTj9Bnn52MBDr7bBgzJpkiIp++\n0HTQV3T2bIYrr7yLv/mbZFz51q10+Jpv3ZYt8Mc/AuSvQT/+eCvXXANHHLHzNWJE8nW//QrH2tnv\nPQJee23XWvzKlTB2LEyYkPx9PfwwDBmy6/G+/vUpPPdcfYcbnxobp+7Oj7TfK5j0I2K7pBuB2ewc\ndrlU0g3J5rg/W/RjwK8i4k+F9i35d2Fl0dg4hXnzOv5T3X77VEaNgnPOScpFJB1mzz6b/MM+9BAs\nXw6nnbbzInD22TB0aGU2HfQ1774Lv/1t8rP+1a/yJ+jm5laWLEnGiw8atHMc+aBBXX/dd1/41Kdq\nmDmz48X+1FNrOPNMWLMG5s5Nvra9Bg7seCHIfb33XguXXrrr7/2Xv6znjDOmsmhRLVKS4MePhylT\nkr+dQm3xPZm40HxHrhXQ05EHmzfD/PlJYnr22eR1wAEQMZ2WFk8B3R1vvJH8HNtqwosWJc9RGD8e\nFiyYztNPl/bn2d2pAyKSu73bLgBr1+56QVi7Flavnk5ra8c4x4+/i5kz66mt7XpEjeXnaRisYrW2\nJrX/j3+8nmXLpnfYPnRoPVOnTmf0aHa8Dj64+ETQV5qMCsW5fTu88squTR1/+EPySamtJjxuXFIz\nbzteOeZ2KfUww/POq6epqePv/bzz6nniiY7rrTiVMHrHLK+aGjjuODj99BqWLevYdDBiRA0bN8KP\nfpQ0Fa1endQgR49OHhKTezEYPRpqa3f2GfSVJqN8cT7zTD3Tp0+lubmWZ56BefPgsMOSBH/OOXDr\nrcnPraaTGzjL1cRRqqm/23TWP5TqnanVqrs9v+V64dE7VaE7o03eeivihRcifvjDiDvuiLjhhmQk\nR11dxMCByd2V554bMXp03xjFMXly/jgPPbQhvvSliJ/9LOJ3v0s7yvIo952p1Yoyjd4xK5nu1EwP\nOghOPz15tffee21txfC5z+XvzHzmmVaeeCK5E3NQ+829ICJp0nrqKfjNb+DHP84f50kntXJnP5+R\nyp2ulcNJ33pdKZoOBgzY+fjIM86o4dVXOzYd7L13DdOm7ez4nDABPvCB5OvQobt1+ry2b0/O9dRT\nO197750005x7Lrz1Vg2/+EX1NnGUusnIesYdudbnFerMfPfdZCbFp59Ohho+/XTyKSL3InD88R3b\nzQt1uv75z8kIpd/8Jknwzz4Lw4Yl
Sb7tVVtbfJxm3eXRO1a1ujPapG3+9LYLwNy5yZDD8eN3XggO\nPbSFj3501wQ9alQ9X/vaVFasqOWpp2DBguRi0Zbgk/1KF6dZIU76Zj20YUNyAWi7CCxcOJ3t2zuO\nKz/ssLu4/vp6zjknGUZZzJ2oZuXiIZtmPTR0KHziE8kL4NxzW3nqqY6driee2EpjY6+HZ1Yy1dGD\nZNZNRx7ZNq48V/V0ulr/5b9gszwaG6dQV1fPzsTfNpnXlNRiMisFt+mbdcKdrlbp3JFrZlZFepL0\n3bxjZlZFnPTNzKpIUUlf0kRJyyQtl3RLJ2UykhZKeknSkznrX5O0KLvt+VIFbmZm3Vcw6UuqAe4F\nLgROBK6QdFy7MgcA9wEfjYiTgL/K2dwKZCJiTESMK1nkKdjtBxL3EsdZWo6ztBxnuoqp6Y8DVkRE\nS0RsA2YBl7UrcyXwk4hYBxARb+ZsU5HnqXh95Y/AcZaW4ywtx5muYpLxcGBNzvLa7LpcxwAHS3pS\n0nxJf5OzLYA52fXX7V64Zma2O0o1DcMAYCzwIZLJSp6V9GxErAQmRMQGSYeSJP+lETG3ROc1M7Nu\nKDhOX9JZQENETMwu30rytJY7c8rcAuwdEdOzy98F/jMiftLuWPXA5oj4Rp7zeJC+mVk3lWPCtfnA\nUZJqgQ3AJOCKdmV+BtwjaQ9gL+BM4BuS9gVqImKLpEHAh4G8T0HubuBmZtZ9BZN+RGyXdCMwm6QP\n4IGIWCrphmRz3B8RyyT9ClgMbAfuj4hXJI0CfpqtxQ8AZkbE7PJ9O2Zm1pWKmYbBzMzKL/WhlMXc\n+JU2SSMkPSHpZUlLJH0h7Zi6IqlG0gJJj6UdS2ckHSDpR5KWZn+uZ6YdU3uS/lf2ZsPFkmZKGph2\nTG0kPSDpDUmLc9YdJGm2pFcl/Sp7/0ylxfhP2d/5i5J+Imn/NGPMxtQhzpxtfy+pVdLBacTWLpa8\ncUqamv3G0spKAAADg0lEQVSZLpF0R6HjpJr0i7nxq0K8B3wxIk4EzgY+X6FxtrkJeCXtIAr4FvB4\nRBwPnAosTTmeXUgaBkwFxkbEKSTNk5PSjWoXD5L83+S6Ffh1RBwLPAHc1utR7SpfjLOBEyPiNGAF\n6ccI+eNE0gjgAqCl1yPKr0OckjLAJcDJEXEycFehg6Rd0y/mxq/URcR/R8SL2fdbSBJU+3sVKkL2\nD/Vi4Ltpx9KZbO3unIh4ECAi3ouId1IOK589gEGSBgD7AutTjmeH7LDnje1WXwb8e/b9vwMf69Wg\n2skXY0T8OiJas4vzgBG9Hlg7nfwsAf4V+IdeDqdTncT5WeCOiHgvW+bNDju2k3bSL+bGr4oiaSRw\nGvBcupF0qu0PtZI7a0YBb0p6MNsMdb+kfdIOKldErAf+BXgdWAe8HRG/Tjeqgg6LiDcgqagAh6Uc\nTyGfBv4z7SDykXQpsCYilqQdSwHHAOdKmpe9OfaMQjuknfT7FEmDgR8DN2Vr/BVF0keAN7KfSpR9\nVaK2m/nui4ixwB9JmiYqhqQDSWrOtcAwYLCkK9ONqtsq9sIv6SvAtoh4OO1Y2stWQL4M1OeuTimc\nQgYAB0XEWcCXgB8W2iHtpL8OODJneUR2XcXJfsT/MfAfEfGztOPpxATgUkmrgUeA8yQ9lHJM+awl\nqUW9kF3+MclFoJL8BbA6It6KiO3Ao8D4lGMq5A1JhwNIeh/wu5TjyUvSFJImyEq9iNYBI4FFkppJ\n8tJvJVXiJ6c1JH+bRMR8oFXSIV3tkHbS33HjV3ZkxCSgUkecfA94JSK+lXYgnYmIL0fEkRExmuRn\n+URE/G3acbWXbYJYI+mY7KrzqbyO59eBsyTtLUkkMVZUZzMdP809BkzJvv8UyU2TadslRkkTSZof\nL42IP6cWVUc74oyIlyLifRExOiJGkVRSxkREJVxE2//O/w/J9Ddk/5/2jIg/dHWAVJN+tgbVduPX\ny8CsiKi0fywkTQAmAx/KPhdgQfaP13ruC8BMSS+SjN75x5Tj2UVEPE/yCWQhsIjkH+3+VIPKIelh\n4BngGEmvS7oauAO4QNKrJBepgsP3UojxHmAwyTxcCyT97zRjhE7jzBVUQPNOJ3F+DxgtaQnwMFCw\nkuebs8zMqkjazTtmZtaLnPTNzKqIk76ZWRVx0jczqyJO+mZmVcRJ38ysijjpm5lVESd9M7Mq8v8B\nvQBp8jPaPM8AAAAASUVORK5CYII=\n", 962 | "text/plain": [ 963 | "" 964 | ] 965 | }, 966 | "metadata": {}, 967 | "output_type": "display_data" 968 | } 969 | ], 970 | "source": [ 971 | "import matplotlib.pyplot as plt\n", 972 | "%matplotlib inline\n", 973 | "\n", 974 | "kvals = range(1, 16)\n", 975 | "plt.plot(kvals, rss_all,'bo-')" 976 | ] 977 | } 978 | ], 979 | "metadata": { 980 | "kernelspec": { 981 | "display_name": "Python 2", 982 | "language": "python", 983 | "name": "python2" 984 | }, 985 | "language_info": { 986 | "codemirror_mode": { 987 | "name": "ipython", 988 | "version": 2 989 | }, 990 | "file_extension": ".py", 991 | "mimetype": "text/x-python", 992 | "name": "python", 993 | "nbconvert_exporter": "python", 994 | "pygments_lexer": "ipython2", 995 | "version": "2.7.13" 996 | } 997 | }, 998 | "nbformat": 4, 999 | "nbformat_minor": 0 1000 | } 1001 | -------------------------------------------------------------------------------- /ridge-regression/ridge-regression-gradient-descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Ridge regression using gradient descent on house sales data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 
13 | "source": [ 14 | "### Fire up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 91, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "sales = graphlab.SFrame('kc_house_data.gl/')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Function to convert SFrame to Numpy data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 92, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "import numpy as np" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 93, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "def get_numpy_data(data_sframe, features, output):\n", 75 | " \n", 76 | " data_sframe['constant'] = 1 # new constant column in the sframe signifying intercept\n", 77 | " \n", 78 | " features = ['constant'] + features # prepend constant to features list\n", 79 | " \n", 80 | " features_sframe = data_sframe[features] # new sframe selecting columns from data_sframe mentioned in features list\n", 81 | "\n", 82 | " feature_matrix = features_sframe.to_numpy() # convert sframe to numpy matrix\n", 83 | "\n", 84 | " output_sarray = data_sframe[output] # an sarray consisting of the output column\n", 85 | "\n", 86 | " output_array = output_sarray.to_numpy() # converts sarray to a numpy array\n", 87 | "\n", 88 | " return(feature_matrix, output_array)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "source": [ 97 | "### Function to predict output given feature matrix and weight vector" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 94, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "def predict_output(feature_matrix, weights):\n", 109 | " predictions = np.dot(feature_matrix, weights)\n", 110 | " return(predictions)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Computing the Derivative" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "We are now going to move to computing the derivative of the regression cost function. The cost function is the sum over the data points of the squared difference between an observed output and a predicted output, plus the L2 penalty term.\n", 125 | "```\n", 126 | "Cost(w)\n", 127 | "= SUM[ (prediction - output)^2 ]\n", 128 | "+ l2_penalty*(w[0]^2 + w[1]^2 + ... + w[k]^2).\n", 129 | "```\n", 130 | "\n", 131 | "Since the derivative of a sum is the sum of the derivatives, we can take the derivative of the first part (the RSS) and add the derivative of the regularization part. 
The derivative of the RSS with respect to `w[i]` can be written as: \n", 132 | "```\n", 133 | "2*SUM[ error*[feature_i] ].\n", 134 | "```\n", 135 | "The derivative of the regularization term with respect to `w[i]` is:\n", 136 | "```\n", 137 | "2*l2_penalty*w[i].\n", 138 | "```\n", 139 | "Summing both, we get\n", 140 | "```\n", 141 | "2*SUM[ error*[feature_i] ] + 2*l2_penalty*w[i].\n", 142 | "```\n", 143 | "That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself, plus `2*l2_penalty*w[i]`. \n", 144 | "\n", 145 | "**We dont have to regularize the constant.** Thus, in the case of the constant, the derivative is just twice the sum of the errors (without the `2*l2_penalty*w[0]` term).\n", 146 | "\n", 147 | "Twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors, plus `2*l2_penalty*w[i]`.\n", 148 | "\n", 149 | "The following derivative function computes the derivative of the weight given the value of the feature (over all data points) and the errors (over all data points)." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Function to compute derivative of weight" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 95, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def feature_derivative_ridge(errors, feature, weight, l2_penalty, feature_is_constant):\n", 168 | " derivative = 0\n", 169 | " # If feature_is_constant is True, derivative is twice the dot product of errors and feature\n", 170 | " if feature_is_constant:\n", 171 | " total_error = errors.sum()\n", 172 | " derivative = 2 * total_error\n", 173 | " # Otherwise, derivative is twice the dot product plus 2*l2_penalty*weight\n", 174 | " else:\n", 175 | " dot_product = np.dot(errors, feature)\n", 176 | " rss_part = 2 * dot_product\n", 177 | " regularized_part = 2 * l2_penalty * weight\n", 178 | " derivative = rss_part + regularized_part\n", 179 | " return derivative" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "### Gradient Descent" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "Now we will write a function that performs a gradient descent. The basic premise is simple. Given a starting point we update the current weights by moving in the negative gradient direction. Recall that the gradient is the direction of *increase* and therefore the negative gradient is the direction of *decrease* and we're trying to *minimize* a cost function. \n", 194 | "\n", 195 | "The amount by which we move in the negative gradient *direction* is called the 'step size'. We stop when we are 'sufficiently close' to the optimum. \n", 196 | "\n", 197 | "We will set a **maximum number of iterations** and take gradient steps until we reach this maximum number. If no maximum number is supplied, the maximum should be set 100 by default." 
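A quick way to validate the derivative described above is to compare `feature_derivative_ridge` (as defined in the cell above) against a central finite-difference approximation of the cost on a tiny synthetic example. The arrays and `eps` below are made up purely for this check.

```python
import numpy as np

# Tiny synthetic problem: 3 houses, a constant column plus one feature.
feature_matrix = np.array([[1., 2.], [1., 3.], [1., 5.]])
output = np.array([3., 4., 7.])
weights = np.array([1., 1.5])
l2_penalty = 10.0

def ridge_cost(w):
    errors = np.dot(feature_matrix, w) - output
    return np.dot(errors, errors) + l2_penalty * np.sum(w[1:] ** 2)  # w[0] is not regularized

errors = np.dot(feature_matrix, weights) - output
analytic = feature_derivative_ridge(errors, feature_matrix[:, 1], weights[1], l2_penalty, False)

eps = 1e-6
w_plus, w_minus = weights.copy(), weights.copy()
w_plus[1] += eps
w_minus[1] -= eps
numeric = (ridge_cost(w_plus) - ridge_cost(w_minus)) / (2 * eps)

print(abs(analytic - numeric))  # ~0: the cost is quadratic, so the central difference is nearly exact
```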
198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "### Gradient descent algorithm with Ridge regression" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 96, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "def ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations=100):\n", 216 | " print 'Starting gradient descent with l2_penalty = ' + str(l2_penalty)\n", 217 | " \n", 218 | " weights = np.array(initial_weights) # a numpy array\n", 219 | " iteration = 0 # iteration counter\n", 220 | " print_frequency = 1 # for adjusting frequency of debugging output\n", 221 | " \n", 222 | " # while not reached maximum number of iterations \n", 223 | " while iteration < max_iterations: \n", 224 | " iteration += 1 # increment iteration counter\n", 225 | " \n", 226 | " ### === code section for adjusting frequency of debugging output. ===\n", 227 | " if iteration == 10:\n", 228 | " print_frequency = 10\n", 229 | " if iteration == 100:\n", 230 | " print_frequency = 100\n", 231 | " if iteration%print_frequency==0:\n", 232 | " print('Iteration = ' + str(iteration))\n", 233 | " ### === end code section ===\n", 234 | " \n", 235 | " # compute the predictions based on feature_matrix and weights using your predict_output() function\n", 236 | " predictions = predict_output(feature_matrix, weights)\n", 237 | "\n", 238 | " # compute the errors in prediction\n", 239 | " errors = predictions - output\n", 240 | "\n", 241 | " # from time to time, print the value of the cost function\n", 242 | " if iteration%print_frequency==0:\n", 243 | " print 'Cost function = ', str(np.dot(errors,errors) + l2_penalty*(np.dot(weights,weights) - weights[0]**2))\n", 244 | " \n", 245 | " for i in xrange(len(weights)): # loop over each weight\n", 246 | " \n", 247 | " # feature column associated with weights[i]\n", 248 | " feature_column = feature_matrix[:,i]\n", 249 | " \n", 250 | " # computing derivative of weight[i]\n", 251 | " if i == 0: # feature is constant\n", 252 | " derivative = feature_derivative_ridge(errors, feature_column, weights[i], l2_penalty, True)\n", 253 | " else: # feature is not constant\n", 254 | " derivative = feature_derivative_ridge(errors, feature_column, weights[i], l2_penalty, False)\n", 255 | " \n", 256 | " # subtracting the step size times the derivative from the current weight\n", 257 | " weights[i] = weights[i] - (step_size * derivative)\n", 258 | " \n", 259 | " print 'Done with gradient descent at iteration ', iteration\n", 260 | " print 'Learned weights = ', str(weights)\n", 261 | " return weights" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "### Visualizing effect of L2 penalty" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "### Simple model with no L2 penalty (No regularization)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 97, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "simple_features = ['sqft_living']\n", 287 | "my_output = 'price'" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Let us split the dataset into training set and test set. 
Make sure to use `seed=0`:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 98, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "train_data,test_data = sales.random_split(.8,seed=0)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Get numpy versions by using `get_numpy_data` of your data with only this feature, for both the `train_data` and the `test_data`. " 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 99, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)\n", 324 | "(simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "Let's set the parameters for our optimization:" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 100, 337 | "metadata": { 338 | "collapsed": true 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "initial_weights = np.array([0., 0.])\n", 343 | "step_size = 1e-12\n", 344 | "max_iterations=1000" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "#### Learned weights with no regulariztion" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 101, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "Starting gradient descent with l2_penalty = 0.0\n", 366 | "Iteration = 1\n", 367 | "Cost function = 7.43305185103e+15\n", 368 | "Iteration = 2\n", 369 | "Cost function = 5.39426721314e+15\n", 370 | "Iteration = 3\n", 371 | "Cost function = 4.0232377365e+15\n", 372 | "Iteration = 4\n", 373 | "Cost function = 3.10125618392e+15\n", 374 | "Iteration = 5\n", 375 | "Cost function = 2.48124764451e+15\n", 376 | "Iteration = 6\n", 377 | "Cost function = 2.06430807789e+15\n", 378 | "Iteration = 7\n", 379 | "Cost function = 1.78392709737e+15\n", 380 | "Iteration = 8\n", 381 | "Cost function = 1.59537820315e+15\n", 382 | "Iteration = 9\n", 383 | "Cost function = 1.46858399105e+15\n", 384 | "Iteration = 10\n", 385 | "Cost function = 1.38331819148e+15\n", 386 | "Iteration = 20\n", 387 | "Cost function = 1.2115621405e+15\n", 388 | "Iteration = 30\n", 389 | "Cost function = 1.20831376268e+15\n", 390 | "Iteration = 40\n", 391 | "Cost function = 1.20825232625e+15\n", 392 | "Iteration = 50\n", 393 | "Cost function = 1.20825116361e+15\n", 394 | "Iteration = 60\n", 395 | "Cost function = 1.20825114092e+15\n", 396 | "Iteration = 70\n", 397 | "Cost function = 1.20825113978e+15\n", 398 | "Iteration = 80\n", 399 | "Cost function = 1.20825113905e+15\n", 400 | "Iteration = 90\n", 401 | "Cost function = 1.20825113832e+15\n", 402 | "Iteration = 100\n", 403 | "Cost function = 1.2082511376e+15\n", 404 | "Iteration = 200\n", 405 | "Cost function = 1.20825113037e+15\n", 406 | "Iteration = 300\n", 407 | "Cost function = 1.20825112315e+15\n", 408 | "Iteration = 400\n", 409 | "Cost function = 1.20825111592e+15\n", 410 | "Iteration = 500\n", 411 | "Cost function = 1.2082511087e+15\n", 412 | "Iteration = 600\n", 413 | "Cost function = 1.20825110147e+15\n", 414 | "Iteration = 700\n", 415 | "Cost function = 1.20825109424e+15\n", 416 | "Iteration = 800\n", 417 | "Cost 
function = 1.20825108702e+15\n", 418 | "Iteration = 900\n", 419 | "Cost function = 1.20825107979e+15\n", 420 | "Iteration = 1000\n", 421 | "Cost function = 1.20825107257e+15\n", 422 | "Done with gradient descent at iteration 1000\n", 423 | "Learned weights = [ -1.63113501e-01 2.63024369e+02]\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "l2_penalty = 0.0\n", 429 | "simple_weight_0_penalty = ridge_regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, \n", 430 | " l2_penalty, max_iterations)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "#### Learned weights with regulariztion" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 102, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "Starting gradient descent with l2_penalty = 1e+11\n", 452 | "Iteration = 1\n", 453 | "Cost function = 7.43305185103e+15\n", 454 | "Iteration = 2\n", 455 | "Cost function = 5.61830389841e+15\n", 456 | "Iteration = 3\n", 457 | "Cost function = 4.92061327812e+15\n", 458 | "Iteration = 4\n", 459 | "Cost function = 4.65238194261e+15\n", 460 | "Iteration = 5\n", 461 | "Cost function = 4.54925876401e+15\n", 462 | "Iteration = 6\n", 463 | "Cost function = 4.50961239088e+15\n", 464 | "Iteration = 7\n", 465 | "Cost function = 4.49437005028e+15\n", 466 | "Iteration = 8\n", 467 | "Cost function = 4.48850998403e+15\n", 468 | "Iteration = 9\n", 469 | "Cost function = 4.48625698853e+15\n", 470 | "Iteration = 10\n", 471 | "Cost function = 4.48539075267e+15\n", 472 | "Iteration = 20\n", 473 | "Cost function = 4.48484886803e+15\n", 474 | "Iteration = 30\n", 475 | "Cost function = 4.48484788048e+15\n", 476 | "Iteration = 40\n", 477 | "Cost function = 4.48484693108e+15\n", 478 | "Iteration = 50\n", 479 | "Cost function = 4.48484598169e+15\n", 480 | "Iteration = 60\n", 481 | "Cost function = 4.48484503229e+15\n", 482 | "Iteration = 70\n", 483 | "Cost function = 4.4848440829e+15\n", 484 | "Iteration = 80\n", 485 | "Cost function = 4.48484313351e+15\n", 486 | "Iteration = 90\n", 487 | "Cost function = 4.48484218411e+15\n", 488 | "Iteration = 100\n", 489 | "Cost function = 4.48484123472e+15\n", 490 | "Iteration = 200\n", 491 | "Cost function = 4.48483174082e+15\n", 492 | "Iteration = 300\n", 493 | "Cost function = 4.48482224696e+15\n", 494 | "Iteration = 400\n", 495 | "Cost function = 4.48481275314e+15\n", 496 | "Iteration = 500\n", 497 | "Cost function = 4.48480325936e+15\n", 498 | "Iteration = 600\n", 499 | "Cost function = 4.48479376562e+15\n", 500 | "Iteration = 700\n", 501 | "Cost function = 4.48478427191e+15\n", 502 | "Iteration = 800\n", 503 | "Cost function = 4.48477477825e+15\n", 504 | "Iteration = 900\n", 505 | "Cost function = 4.48476528463e+15\n", 506 | "Iteration = 1000\n", 507 | "Cost function = 4.48475579105e+15\n", 508 | "Done with gradient descent at iteration 1000\n", 509 | "Learned weights = [ 9.76730383 124.57217565]\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "l2_penalty = 1e11\n", 515 | "simple_weight_high_penalty = ridge_regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, \n", 516 | " l2_penalty, max_iterations)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "### Plotting the two learned models\n", 524 | "The blue line is for the model with no regularization and the red line is for the one 
with high regularization." 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 103, 530 | "metadata": { 531 | "collapsed": false 532 | }, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "[,\n", 538 | " ,\n", 539 | " ,\n", 540 | " ,\n", 541 | " ,\n", 542 | " ]" 543 | ] 544 | }, 545 | "execution_count": 103, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | }, 549 | { 550 | "data": { 551 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ0AAAEACAYAAABoJ6s/AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJztvXt8VdWZ//9+ck5CRo0IXkCIECIEEmKxWsGxhW+KF7Cd\n0bZW5duL2vHb33iZ1ttMFdsp2su30m8dhbZS1NZqq0SrnUn6GkWSCgMzgChqlYCB1ooYCjqiJK2j\nEvL8/jhrH/Y52efk5ORcc57367Vf2Vl7rb2fvc8567PXWs96lqgqhmEYhpELyvJtgGEYhlE6mOgY\nhmEYOcNExzAMw8gZJjqGYRhGzjDRMQzDMHKGiY5hGIaRM1ISHRG5TkS2iMiLIvKgiFSIyCgRWSUi\nnSLypIiM9OVfKCI7RGSbiJzjSz/FnWO7iNzpS68QkWZXZoOITPAdu9Tl7xSRS3zpNSKy0R1bISLh\noT8OwzAMI5sMKDoiMg74CnCKqn4ICAP/G7gJaFfVqcBTwEKXvwG4CKgHzgXuEhFxp1sGXK6qdUCd\niMxz6ZcD+1R1CnAn8H13rlHAN4HTgFnAIp+4LQZud+d6x53DMAzDKGBS7V4LAYe71sRfAV3A+cD9\n7vj9wKfc/nlAs6r2quqrwA5gpoiMBapU9RmX7wFfGf+5HgXmuv15wCpV3a+q7wCrgPnu2FzgMd/1\nP53ivRiGYRh5YkDRUdXdwO3Aa0TEZr+qtgNjVHWvy7MHOM4VGQ/s8p2iy6WNB173pb/u0mLKqOpB\nYL+IjE50LhE5GnhbVft85xqXyg0bhmEY+SOV7rWjiLREJhKp2A8Xkc8D8fFzMhlPRwbOklIewzAM\no4BIZfD9LOAVVd0HICL/CpwB7BWRMaq613WdveHydwEn+MpXu7RE6f4yu0UkBBypqvtEpAtoiiuz\nWlXfEpGRIlLmWjv+c8UgIhZczjAMIw1UNeMv96mM6bwGnC4ilc4h4ExgK9AKXObyXAq0uP1WYIHz\nSJsETAY2uS64/SIy053nkrgyl7r9C4k4JgA8CZztBGYUcLZLA1jt8sZfvx+qWrTbokWL8m5DKdpu\n9ud/M/vzu2WLAVs6qrpJRB4FngcOuL93A1XAIyLyd8BOIh5rqOpWEXmEiDAdAK7SQ3dwNfBzoBJ4\nXFVXuvSfAr8QkR3AW8ACd663ReTbwLNEuu9u1YhDAUS855rd8efdOQzDMIwCJqW5Lap6K3BrXPI+\nIl1vQfm/B3wvIH0zcFJA+vs40Qo49nMiQhWf/kcibtSGYRhGkWARCQqcpqamfJuQNsVsO5j9+cbs\nH55INvvuCgER0eF+j4ZhGJlGRNA8ORIUPT09Pfk2wTCMEqOnp4cNGzZY/RNHSYjO7Nmz7YM3DCNn\n9PT0MHv2bObMmWP1TxwlITpbt26lo6Mj32YYhlEibNmyhY6ODnp7e63+iaMkRKehoYHp06fn2wzD\nMEqExsZGpk+fTnl5udU/cZSEI0F3dzdVVVX5NsUwjBKip6eHjo4Opk+fXpT1T7YcCUpCdIb7PRqG\nYWQa814zDMMwih4THcMwDCNnmOgYhmEYOcNExzAMw8gZJjqGYRhGzjDRMQzDMHKGiY5hGIaRM0x0\nDMMwjJxhomMYhmHkDBMdwzAMI2eY6BiGYRg5Y0DREZE6EXleRJ5zf/eLyFdFZJSIrBKRThF5UkRG\n+sosFJEdIrJNRM7xpZ8iIi+KyHYRudOXXiEiza7MBhGZ4Dt2qcvfKSKX+NJrRGSjO7ZCRMKZeSSG\nYRhGthhQdFR1u6p+WFVPAU4F/gL8K3AT0K6qU4GngIUAItIAXATUA+cCd4mIFzRuGXC5qtYBdSIy\nz6VfDuxT1SnAncD33blGAd8ETgNmAYt84rYYuN2d6x13jkBsASXDKD5s5c3hyWC7184C/qCqu4Dz\ngftd+v3Ap9z+eUCzqvaq6qvADmCmiIwFqlT1GZfvAV8Z/7keBea6/XnAKlXdr6rvAKuA+e7YXOAx\n3/U/nchoW7nPMIoLW3lz+DJY0bkYeMjtj1HVvQCqugc4zqWPB3b5ynS5tPHA6770111aTBlVPQjs\nF5HRic4lIkcDb6tqn+9c4xIZbSv3GUZxYStvDl9SFh0RKSfSivmVS4pfpCaTi9aksoZDyus82Mp9\nhlFc2Mqbw5fBDL6fC2xW1f92/+8VkTGqutd1nb3h0ruAE3zlql1aonR/md0iEgKOVNV9ItIFNMWV\nWa2qb4nISBEpc60d/7n6G37uudx+++0ANDU10dTUlCirYRgFQFVVFevWrSvqlTeLjTVr1rBmzZqs\nXyfllUNFZAWwUlXvd/8vJjL4v1hEbgRGqepNzpHgQSID/+OBNmCKqqqIbAS+CjwD/DuwVFVXishV\nQKOqXiUiC4BPqeoC50jwLHAKkVbZs8CpqvqOiDwM/FpVHxaRZcDvVPUnAXbbyqGGYRiDJK/LVYvI\nYcBOoFZVe1zaaOARIi2UncBFbrAfEVlIxJvsAHCNqq5y6acCPwcqgcdV9RqXPgL4BfBh4C1ggXNC\nQEQuA75OpPvuO6r6gEufBDQDo4DngS+o6oEA2010DMMwBkleRaeYMdExDMMYPNkSHYtIYBiGYeQM\nEx3DMAwjZ5joGIZhGDnDRMcwDMPIGSY6hmEYRs4w0TEMwzByhomOYRiGkTNMdAzDMIycYaJjGIZh\n5AwTHcMwDCNnmOgYhmEYOcNExzAMw8gZJjqGYRhGzjDRMQzDMHKGiY5hGIaRM0x0DMMwjJxhomMY\nhmHkDBMdwzAMI2eY6BiGYRg5IyXREZGRIvIrEdkmIh0iMktERonIKhHpFJEnRWSkL/9CEdnh8p/j\nSz9FRF4Uke0icqcvvUJEml2ZDSIywXfsUpe/U0Qu8aXXiMhGd2yFiISH/jgMwzCMbJJqS2cJ8Liq\n1gMzgJeBm4B2VZ0KPAUsBBCRBuAioB44F7hLRMSdZxlwuarWAXUiMs+lXw7sU9UpwJ3A9925RgHf\nBE4DZgGLfOK2GLjdnesddw7DMAyjgBlQdETkSGC2qt4H
oKq9qrofOB+432W7H/iU2z8PaHb5XgV2\nADNFZCxQparPuHwP+Mr4z/UoMNftzwNWqep+VX0HWAXMd8fmAo/5rv/plO/aMAzDyAuptHQmAf8t\nIveJyHMicreIHAaMUdW9AKq6BzjO5R8P7PKV73Jp44HXfemvu7SYMqp6ENgvIqMTnUtEjgbeVtU+\n37nGpXLDhmEYRv5IZRwkDJwCXK2qz4rIHUS61jQuX/z/Q0EGzpJSHgBuueWW6H5TUxNNTU2Dt8gw\nDGMYs2bNGtasWZP166QiOq8Du1T1Wff/Y0REZ6+IjFHVva7r7A13vAs4wVe+2qUlSveX2S0iIeBI\nVd0nIl1AU1yZ1ar6lnNuKHOtHf+5+uEXHcMwDKM/8S/kt956a1auM2D3mutC2yUidS7pTKADaAUu\nc2mXAi1uvxVY4DzSJgGTgU2uC26/iMx0jgWXxJW51O1fSMQxAeBJ4GwnMKOAs10awGqXN/76hmEY\nRoEiqgP3ionIDOBeoBx4BfgSEAIeIdJC2Qlc5Ab7EZGFRLzJDgDXqOoql34q8HOgkog33DUufQTw\nC+DDwFvAAueEgIhcBnydSPfdd1T1AZc+CWgGRgHPA19Q1QMBtmsq92gYhmEcQkRQ1ZSHMVI+73Cv\nkE10DMMwBk+2RMciEhiGYRg5w0THMAzDyBkmOoZhGEbOMNExDMMwcoaJjmEYhpEzSkJ0enp68m2C\nYQx7enp62LBhg/3ejKSUhOjMnj3bfgiGkUV6enqYPXs2c+bMsd+bkZSSEJ2tW7fS0dGRbzMMY9iy\nZcsWOjo66O3ttd+bkZSSEJ2GhgamT5+ebzMMY9jS2NjI9OnTKS8vt9+bkZSSiEjQ3d1NVVVVvk0x\njGFNT08PHR0dTJ8+3X5vwwALg5MmFgbHMAxj8FgYHMMwDKPoKQnRMU8ao9Qw92WjUCkJ0TnjjDPs\nx2cUJemIh7kvG4VMSYjOli1b2LRpU77NMIxBka54mPuyUciUhOgYRjGSrniY+7JRyJSE91pjYyPr\n1683N06jqPBaOlu3bqWhoYF169al/B0292VjqJjLdJrYPB2jmDHxMPKFiU6amOgYpUhPTw9btmyh\nsbHRvvtGWuR1no6IvCoivxOR50Vkk0sbJSKrRKRTRJ4UkZG+/AtFZIeIbBORc3zpp4jIiyKyXUTu\n9KVXiEizK7NBRCb4jl3q8neKyCW+9BoR2eiOrRCRcCL7zYPHKCXMe80oZFJ1JOgDmlT1w6o606Xd\nBLSr6lTgKWAhgIg0ABcB9cC5wF0i4qnlMuByVa0D6kRknku/HNinqlOAO4Hvu3ONAr4JnAbMAhb5\nxG0xcLs71zvuHIGYB49RSpj3mlHIpCo6EpD3fOB+t38/8Cm3fx7QrKq9qvoqsAOYKSJjgSpVfcbl\ne8BXxn+uR4G5bn8esEpV96vqO8AqYL47Nhd4zHf9Tycy3jx4jFLCvNeMQiZhl1QcCrSJyEFguare\nC4xR1b0AqrpHRI5zeccDG3xlu1xaL/C6L/11l+6V2eXOdVBE9ovIaH+6/1wicjTwtqr2+c41LpHx\ng/H6MYxip6qqinXr1pkDglGQpCo6H1XVP4nIscAqEekkIkR+MumRkMrgVcoDXLfffnt0v6mpiaam\npjRMMozioaqqitNPPz3fZhhFxJo1a1izZk3Wr5OS6Kjqn9zfN0Xk34CZwF4RGaOqe13X2Rsuexdw\ngq94tUtLlO4vs1tEQsCRqrpPRLqAprgyq1X1LREZKSJlrrXjP1c/brnlllRu0zAMo2SJfyG/9dZb\ns3KdAcd0ROQwETnC7R8OnAO8BLQCl7lslwItbr8VWOA80iYBk4FNqroH2C8iM51jwSVxZS51+xcS\ncUwAeBI42wnMKOBslwaw2uWNv75hGIZRoAw4T8cJx78S6T4LAw+q6m1uzOURIi2UncBFbrAfEVlI\nxJvsAHCNqq5y6acCPwcqgcdV9RqXPgL4BfBh4C1ggXNCQEQuA77urv8dVX3AZ1czMAp4HviCqh4I\nsN/W0zEMwxgkNjk0TUx0DMMwBo8t4mYYhmEUPSY6hmEYRs4w0TEMwzByRkmIjsWeMozCwZbSLm1K\nQnQs6KFhFAYWjNQoCdGxoIeGURhYMFKjJETHgh4aRmFgwUiNkpin09XVxbhxCeOBGkbGsUXUEmOr\noRYHNk9nCMybN8/6jo2sET8wbuMWyfGCkZrglCYlITpbtmxh06ZN+TbDGIYECYyNWxhGYkpCdAwj\nWwQJjI1bGEZiSmJMp7GxkfXr11tz3sg4Xktn69atNDQ0RBcMtHELo9ixgJ9pIiLa3d1tP3wja5jA\nGMMRE500sSjThmEYg8e81wzDMIyipyREx1xWjWLGYpUZw4mSEB2bK2EUKzbnxxhulITo2FwJo1ix\nOT/GcKMkRMfmShjFis35MYYbKYuOiJSJyHMi0ur+HyUiq0SkU0SeFJGRvrwLRWSHiGwTkXN86aeI\nyIsisl1E7vSlV4hIsyuzQUQm+I5d6vJ3isglvvQaEdnojq0QkXAi2725E4ZRbFRVVbFu3TrWrl1r\n32NjWDCYls41wFbf/zcB7ao6FXgKWAggIg3ARUA9cC5wl4h4bnfLgMtVtQ6oE5F5Lv1yYJ+qTgHu\nBL7vzjUK+CZwGjALWOQTt8XA7e5c77hzBGI/VKOYsVhlxnAiJdERkWrgE8C9vuTzgfvd/v3Ap9z+\neUCzqvaq6qvADmCmiIwFqlT1GZfvAV8Z/7keBea6/XnAKlXdr6rvAKuA+e7YXOAx3/U/nch+G3w1\nChXzTDNKjVRbOncA/wT4Z1mOUdW9AKq6BzjOpY8Hdvnydbm08cDrvvTXXVpMGVU9COwXkdGJziUi\nRwNvq2qf71wJ1y4wrx+jEDHPNKMUSTgO4iEinwT2quoLItKUJGsmp/2nMgs25ZmyL730Etdddx3V\n1dU0NTXR1NSUvmWGkSGCPNNOP/30fJtllChr1qxhzZo1Wb/OgKIDfBQ4T0Q+AfwVUCUivwD2iMgY\nVd3rus7ecPm7gBN85atdWqJ0f5ndIhICjlTVfSLSBTTFlVmtqm+JyEgRKXOtHf+5+nHSSSdxxx13\nWJ+4UVB4nmlesFDzTDPySfwL+a233pqV6wzYvaaqN6vqBFWtBRYAT6nqF4HfAJe5bJcCLW6/FVjg\nPNImAZOBTa4Lbr+IzHSOBZfElbnU7V9IxDEB4EngbCcwo4CzXRrAapc3/vr9ePzxx01wjITka1zF\nPNOMUmRQAT9F5H8BN6jqeW7M5REiLZSdwEVusB8RWUjEm+wAcI2qrnLppwI/ByqBx1X1Gpc+AvgF\n8GHgLWCBc0JARC4Dvk6k++47qvqAS58ENAOjgOeBL6jqgQCbdcaMGfajNgLxxlW8KNH2PTGMCBZl\nOk1EREOhEP/5n/9p/eUljreqZ2NjY1RYNmzYwJw5c+jt7aW8vJy1a9fm7HsSZI9hFAoWZXoI9PX1\nMXr06HybYeS
RRJ5i+Zrxb55rRqlSEqKjqqxduzbfZhh5xO8p1tHRwaZNm4D8jatYTDWjVCkJ0RER\n5syZk28zjDzS2NjItGnTAOjt7eXaa6+Nti7yMePfYqoZpUrJiM6+ffvybYYxBII8zAbrdfblL3+Z\nsrLIV76zszOvrQvzXDNKlVTm6RQ9fX19VFRU5NsMI02CPMwAzjjjDF5++WWmTZvG+vXrE1bc/vIV\nFRX09vYWROvCa2EZRilREi0dgF/+8pf5NsFIE//4x5YtW9i0aRNPP/00W7ZsiUlLpfzBgwdZtmyZ\ntS4MI0+UjOh86EMfyrcJRpr4x2MOHjzItddey7vvvjuo8v7xk4svvtgExyg5Dh7MtwURSkZ0Tjjh\nhIEzGQVJVVUVd9xxB+FwpDe4s7OTww8/nMbGRkKhEI2NjcycOTNpeRs/MUqRp56CSZNABMJheO+9\nfFtUIpNDGxsbk/b5G4WPNy7jxSnzxnW8cR77bA0j0ppZvhyuvjo2/fLL4bbb4JhjUj+XRSRIExHR\n7u5uq5SGAT09PSYyhhHH/v3wjW/Aj34Um/7//h9ccw2Ul6d3XhOdNBERHe73aBhGabF9O1x1Ffz2\nt4fSxoyBe++Fv/mbzFzDwuAMAQsxUprYqpzGcOLJJ6G6OjI+M3VqRHCammDrVlCFPXsyJzjZpCRE\nx2JblR4W28wodnp74c47IyIjAvPnQ1cXXHEF7NsXEZrVq6G+Pt+WDo6SEJ2Ojg6LbVUg5Kr1kY3Y\nZkOxPRv3PdA5raVXfOzbB3//9xGRKS+H666LpC9ZAgcORIRm2TIYNSq/dg4JVR3WG5F1eHTlypVq\n5Jfu7m6dMWOGhsNhnTFjhnZ3d2f9WuXl5Rm51lBsz8Z9D3TOXD5rY2h0dKjOmaMakZTIdsIJqvmu\nsiLykIU6ORsnLaTNE5358+en89yNDLJ+/XoNh8MKaHl5uW7YsCHlst3d3bp+/fpBV/YbNmzISIU7\nFNuHUjbdc2bjmkbmaG1VPfbYWKE5+2zVzs58W3YIE50hio61dPKPv/XR2NiobW1tgW/o8eJSCG/t\n3d3d2tjYqOFwWBsbG9Nq6fhbXf77TFdQk7XkMt3SM4bGBx+oLl4cKzKg+tWvqr7zTr6tC8ZEZ4ii\ns2zZsnSeu5Fhuru7tb29PVqB+yvEROJSCG/tnuiEQqFBi45X3mt1+e+zsbEx8FkM9pzpHDeyy5tv\nqn7pS/2F5q67VHt7823dwJjoDFF0zjrrrHSeu5EGA725B4lId3e3Ll++PFBcUnmrH2xLYbBkUvj8\n5wqHwxoKhawbbJjwu9+pnn56rMjU1qo+9VS+LRs8eRMdYATwNPA88BKwyKWPAlYBncCTwEhfmYXA\nDmAbcI4v/RTgRWA7cKcvvQJodmU2ABN8xy51+TuBS3zpNcBGd2wFEE5gvwL64IMPZuFjMeJJ1A0V\n353kF5Guri6dMWOGhkIhHTFiRGAXVqK39lx1vWWyuyq+m7GxsdG6wYqUvj7VRx9VHTkyVmg++UnV\nP/wh39YNjby2dIDD3N+Qq+hnAouBr7n0G4Hb3H6DE6iwE4bfcyjywdPAaW7/cWCe278SuMvtXww0\n6yFh+wMwEjjK23fHHgYudPvLgL9PYLsC+uUvfzkbn4sRR1tbm3rPHND29vZAYfCLiP/NH9CysrKU\nu7By2fWWye6q+O426wYrHt57T/U734kVGVC94QbV4fQRZkt0Upqno6peHPkRTkwUOB+436XfD3zK\n7Z/nRKNXVV8l0nqZKSJjgSpVfcble8BXxn+uR4G5bn8esEpV96vqO0RaVvPdsbnAY77rfzrZPYhk\nPJqDkSJBc2b8C5j95S9/Ydq0adEo0n19fWzbtq3fGjlB8068ZQ9CoRBTp06NWZgt0/NUBrOs9UDX\n9p8rH8tlG4Njzx74whci82cqKyOxziASdubgwYjs/OAHYB/hwKQkOiJSJiLPA3uANiccY1R1L4Cq\n7gGOc9nHA7t8xbtc2njgdV/66y4tpoyqHgT2i8joROcSkaOBt1W1z3euccnuIVnoeyNzNDQ0UFtb\nS1lZWXTJgfj1bDxh8KIGnHvuuQD8+te/pt5Nr/bWzfEq7Z6eHs444wzmzJnDGWec0a8yj3+pyFZE\nglSELJPXtgme+eO55+AjH4kIzfHHw4MPwrRpsG7dofbN5ZdDWUlMsc8gg2kWAUcCvwWmA/vijr3l\n/v4Q+Jwv/V7gM8CpRFotXvrHgFa3/xIwznfs98Bo4AbgZl/6N4DrgaOBHb70auDFBDYroMcee6wu\nWrRIV69ePeRmpxGMvxtt8uTJ2tXVFXMsvgupra2tX9dYS0tL4MB6ULedauLutWx0u6U6fpSpaxeC\nq3gp0denumKF6mGHxXabfepTqjt35tu67LN69WpdtGhRdKNQvNeAf3ZisI1IawdgLLDN7d8E3OjL\nvxKY5c/j0hcAy/x59NC40Ru+PD/xlfkJcLHbfwMoc/unA08ksFcBPeOMMzL7CRlRvHGZIBFJVqax\nsTEqIo2NjdrV1dUvzatoE4lOV1eXTp48ud/cn8G4OHd1deny5ctjRDKIVMUkU04H+XQVz4VHYCHw\nP/+j+s1v9h+fWbhQ9c9/zrd1+SVvogMcw6HB+78C1gKfIOJIcKNLD3IkqAAmEetI4DkhCBFHgvku\n/SoOORIsINiRwNs/yh172CdAy4ArEtivgF5wwQXZ+WRKnKA5J6lUtv4KNRQKaXt7ez9XYk9YvOvE\ni4gnOGVlZVpbW6v19fXRVoEnYANN5uzq6tLKykoFtLKyMqnwDEZMMuEckK8JnsO9hdXVpXrRRbEi\nEw6rPvBApLVjRMin6JwEPAe8QMTd+esufTTQTsSVeZUnBu7YQic28S7TpxLpStsBLPGljwAecekb\ngRrfsctc+nZiXaYnEfGG2+4EqDyB/Qro5MmTs/G5lDzxb+Pt7e0pVbaJZukPNB/H7/E1efLkaMsn\nFArF2HH33Xen1EpYvnx5TAvqnnvuGdDuXHqa5cOzrRAm42aajRtVTzopVmhOOkl1GNxa1sib6BT7\n5lUm3/ve99J57sYADOVtPKhC9SIWBIXI8RPvZl1bWxvTyvLm/gxk12BaOqXCcAih09enev/9kRaM\nX2guukj19dfzbV1xYKIzBNEREatMskgqb+PJxgiCJo4O1LXjrxg9p4V4O1JtJXR1dek999yT8nek\nFMY7inHu0F/+EhmLiR+f+eY3Vd99N9/WFR8mOkMQHUCbm5vTee5GigwkKv5xH38rJl5kBuuMkKhi\nzJYwxNvb1dVVMAJUCmIYz86dEe8yv8gcdljEC83GZ4aGic4QReeiiy5K57kbKTBQ6yQo4oDXOgka\nExpq1042B8LjnR0mT55cEAPuw33w38+6dapTp8YKzamnqj77bL4tG16Y6AxRdP76r/86nedupMBA\nA8/+CtE/aO8JT5BDwYYNG9JuRWRzIDy+W69QBtyH4+C/x8GDqvfeGysy
oPqFL6ju2ZNv64YvJjpD\nFJ077rgjnedupEAqA8+eg0BtbW1UdPwRpoMcCtJ9c/fP3cnGW79fFAtlwH04DP77+dOf+osMRGKe\nvfdevq0rDUx0hig6ra2t6Tz3kiSdsYHBDNoHCUL8NeO7se6+++6U7PEq31AoFBgVIdNjHoU04F5I\ntqTDb38bLDSPPmrjM/nARGeIomOLuKXGYFsYqc7mj7+Gv/vMazHER6H20iorKwPtCRKReLFqbW1N\neI18UYoD/olYtChYaB54IN+WGSY6QxSdj370o+k895IjKCpAd3e3trW19Zs7k2yOy0AVq9fi8Vok\niRZvSzTJM5E4epELvM99xIgRGgqFtLa2NiamW6otp0xTSgP+QXzwQf9Jmt62dWu+rTP8mOgMUXSO\nOeaYdJ57yRFfadfX12t9fX30f39ImUSz+ZNVrJ6A+cd2ysrKtLq6OrDLraWlRWtra/sdW79+fVRE\nwuFwzMB5W1ublpWVxdjm3zwhykSl74nrQE4PXr7BuIQPF3btChaZY4+NzK0xChMTnSGKzsiRI9N5\n7iVJW1tbTIXur8D9FXyilk4iTyr/eEu8EIhIzBhMvPjV1tbGtKSStbL8x0Qkxv5QKBT9f6iVvv9+\nEnUB+vMNNj5dOvYUSrfdv/97sNDYWorFg4nOEEXnsMMOS+e5D2sSVVJ+T6iGhgatqanp19LxynZ2\nduo999yjnZ2dSZejjn/LD9r83mzLly+PEadQKBSz0miyFkO86P3whz/UhoYGDYfD2tDQkLDSH0yl\nHWRjIiFLNz7dYCiEbrt//MdgofnVr3JuipEBTHSGKDof//jH03nuw5ZkYyJed1F7e3s0UnNtba02\nNzdrW1tbv0H5RI4Afrfi+Lf8+vr6aLeZv5Xg5Q+FQjpixIgYsYs/V319feDSBUGi50Worq+v15aW\nluhY1UDPY6BnV1lZGW3pJAtUmm135nzM03nvPdUTTwwWmh07sn55I8uY6AxRdKqqqtJ57sOWoEoq\nWUga/+x7zwHAPyifaqvD/5bvCVNnZ6cuXbpUW1pa+l1z6dKlUYGId3LwnAPiu95UY92H4yMiBAmt\nv9UyUKU6jyQcAAAgAElEQVQdf09e3LZkrZdsuzPnap7OK68Ei8zEiZG1aYzhg4nOEEWnoaEhnec+\nbAmqpOLXuGlubo4RGn9F659rkyiqQFtbm9533306ZswYLSsri7ZW/N1wbW1tMY4K9fX1Sbu/gqIB\neNENEnWVBY0lBQltsnGZgZ5dIZAtYfv1r4OF5pprMnoZI8sMdszPRGeIojNlypSUHnQpETRfJt7d\nuKysTMePH6+bN2/u12UVFNG5q6urn5D4PwP/wmrevj+Pt6BbUOXpiVR7e3vU5drfelmyZElg958n\nPF53YSKh9VotqY7pZKvlUggOAVdeGSw0v/lN3kwyhkA6Y34mOkMUnXA4POBDLkXiv4wtLS2Bg/3H\nHnusrl27VpcsWaIrVqzoFyk6fgJmkKOA35PMv+iaf6uvr49pBfmXn47/0XjCEw6HY8Z/amtrE3b3\nxYtFfOvJ7z2Xj4o/Xw4B776rOmZMsNDs3JkTE4wsks6Yn4nOEEWnvLx8wIdciiSL8lxeXp7Q0yxo\ncN8/1hO0+YXB60aLjz7tjd/4W1wNDQ0JvdW8CaT+63pRrFPt/vJPVE3kGJErcukQ8PLLwSLT0KD6\n/vtZu6yRB9LpEjbRGaLoVFRUDPiQS5GgL6PXFVVXV5dUdLyYaP5K0pvk6blah0IhHTt2rH7729/u\nF+mgq6tLlyxZovX19f26vOInd7a2tgb+aLwJpNXV1THi1tnZmXL3V3xFn+pS19kg2+NFDz0ULDQ3\n35zRyxgFyGC7hE10hig6M2bMSOlBlyLx4zHeFt/9JSIJWzqe67N/rRz/lzzIjdnv/ux3Ye7u7o6J\nWABEj8d3jflbREFOAql0kSWyLV+OApkcL+rrU73kkmChefLJDBhrDFvyJjpANfAU0AG8BHzVpY8C\nVgGdwJPASF+ZhcAOYBtwji/9FOBFYDtwpy+9Amh2ZTYAE3zHLnX5O4FLfOk1wEZ3bAUQTmC/Avrh\nD384G5/LsCGoS8s/p2bp0qW6efNmra2t1bKyshg35fgurkStA68FFS9qQfm7urqiLtHx83A84l2h\nvc2/Tk+qXWRBglasEZv37QsWGVC1VduNVMmn6IwFTnb7R7jKfxqwGPiaS78RuM3tNwDPA2EnDL8H\nxB17GjjN7T8OzHP7VwJ3uf2LgWY9JGx/AEYCR3n77tjDwIVufxnw9wnsV0Dnzp2blQ9muOCPZebv\nAvNXvMninSXrFop3NPDm1sR3q8XT1dWld999d8II1kEtHU8Mh/OiZkGsWBEsMqedpnrgQL6tM4qR\nguleA/4NOAt4GRijh4TpZbd/E3CjL/8TwCyXZ6svfQGwzO2vBGa5/RDwRnwePSQuF7v9N4Eyt386\nsDKBvQrookWLMvqBFDKJupWC0v2C4K/Ag2b5t7S0RJ0BgqJKt7W1aXNzsy5ZsiSmFeR3NPCP1dTU\n1ES7zfx2DGYpAq/11NraGh0n8pdPFiUg327JQ+WII4KFZt68fFtmDAcKQnRcy+VV1+J5O+7YPvf3\nh8DnfOn3Ap8BTgVW+dI/BrS6/ZeAcb5jO4DRwA3Azb70bwDXA0cD233p1cCLCWyOVqLDHa/i97zC\n4gfcE61Z4y0v0NnZqe3t7TGVt9812S8YiZYZ8MZ9KioqdPPmzbpkyZKY1tHYsWNjWibXXXeddnZ2\nRsPUJFqKoL29fUCRiL/HRFEC0nVLzodQ+a/Z1xcsMhBZUdMwMkneRccJzbPA+eoTGd/xtzRzovP7\nFERnhy99QNHxWjurV6/O5OdSMPgr0iBhCOpuivcSq62t7TcnprGxsd+gfnxlnWhsJch92u9l5m0V\nFRX90vxbXV1dwjhrflLtUkun6y0f82e6u7t12rS/SSg0mzYNjxabURisXr1aFy1aFN3yKjpExmdW\nAtf40rYR2722ze3Hd6+t5FD32jZfeqrdaz/xlfkJh7rX3iC2e+2JBLZHK6/hTHzFHx+TzBv/8Ffc\nnZ2d/YShtbU1ZmmDUCgUk6e2tjbaJeatGtrZ2RntykomHqlu8e7Sxx9/fMz/7e3t/e7f38obyOss\nkZt4sso7FaHKlABcfXXiFs0HH/S/j1JdEM7ILvkWnQeAf4lLW+yJC8GOBBXAJGIdCTYCMwEh4kgw\n36VfxSFHggUEOxJ4+0e5Yw/7BGgZcEUC20tCdPwVaXx0Zq9S9USnpqZGV6xYEbNkgbdNmDBBGxoa\nov/7Y6FVV1drZ2enqkYG+b3rVFRUaHNzs7a3t+uPf/zjIQnOhAkTtLm5OaZ1Fd+Kihcdf+VbX18f\nM6YU9Jy8MZ94d+5klfdA82eGKgCJRAY04TVLzVnCyC15Ex3go8BB4AUnJs8B84l0f7UT8WZb5YmB\nK7PQiU28y/SpRLrSdgBLfOk
jgEdc+kagxnfsMpe+nViX6UlEvOG2OwEqT2B/SYiOavDsfG8dmkRd\nYPGbiMQE/Wxtbe03Y7+7u1uXLFnSr3XS2NioK1asGJLoiIg2NjZqZ2dnNKqAt4yBF1k6XlAGiiLt\nfz5BwpBs0Tl/yyXIrdo7HhTZIVmrp7c3sch8/vOxNidy3c72RFKjtMmb6BT7VkqioxqpiIKWl+7q\n6gocT4nf6uvrY1o6jY2NgXNqWlpaErZUvP0RI0ZoOBxOGBqnrKxMjzvuuMBj8ZNB/c4M/hhp3j3H\nR5H2BNM73tbWpkuWLEm6oml8d1uylkt8d6XfW84fzNRfduPGxELzzDPpf97FOp/IKGxMdEx0+pHI\nBdqrDL1WgVeBlpWVxYy71NXVaXNzs37/+9+Pdo95FXS855hXidbU1GhLS4t2dXXp1KlTkwrY8ccf\nr+PGjUurxdPa2hpzb/FOD0FLGbS3t+u0adNiRM/zjPOnBYXSaWtri45nxbdcvDlLftra2hKKpF+k\nRR5OKDS9vTn4khhGmpjomOjEEOQeHP8271WW/kmdZWVlOnbsWC0rK9OpU6fGvJF7btKdnZ1aWVmp\nEJmP09nZ2W+pgtraWv3Zz37Wb9A/2bZgwQKtq6tLGhQU0HHjxum4ceNiuvO6urpivNzKysp06dKl\n/YTnmmuuiTnXTTfd1K8F5F++wP8cPXdtr+WSbN5SkOh4JBKZYfYVNIY5JjomOjHEjyHEuzX7u8v8\nIhLvnuyJhncOr4XkpZeVlemNN97YL+6aV2YwrZdQKKR1dXU6atSopHmCIiMsXrw4MH9DQ0NM95t/\nrlBQSydePBKNdXnhehKF9vFalOFwWBsaTk4oMldcYeGajeLERMdEJwb/OESyJQVCoZB+61vfiopI\nfAXrRYIePXp0oKCIiIZCoUDRycU2btw4nTJlStI8ra2tMQu6eWLpX5snaNKrasQLzxPkeNFJNlD/\n298GiwyodnTk61thGI6331Z95BHVSy9VPeaYQ1/OQawpbqJjotMPLzZZ/Nt8ssmWJ554YkwLZeLE\niSl5tWVzO/rooxMKZiou2EuXLg0U3aDWSbxzQHzMufgJqP6B+rlzEwtNX1++vgVGyfLnP6u2tqpe\ncYXqhAmJv5ygOmOG6tKlgzq9iY6JTozjQNCYjv9tfunSpYGtk6qqqiGLxGDGcVLZQqGQVlRUqIhE\nl8j2vN4SOSJ4xz33aq+1Ul5erhMmTAhsnQQ5B/hbM7W1tdra2hpTJtnv2DCyzvvvq7a1qV57rWpd\nXfIvZF1dJF9bm+p77w350iY6JS468SKTbGmA7u5uXbFiRcYiBGRqi+/C828iotdff712dnbGLKLm\niZLnALFs2TJdunRpzCJtbW1tMULoxZKLn8/jjcN4+fzu5F4XZWPjRxP+pm+6aeg/ZMPox4EDqv/5\nn5GV9GbMSC4sEyZEWjatrZGWThYx0Slx0Um2rHRDQ0N0Fn78PB2vZTJy5MiUWhypiEc64zuVlZW6\ndu3aAVtJU6ZM0cWLF/eLluBFQwjy2PPPK/K2RDP0W1paYpwnNmzYoFdf/WqS3/mhcweF3/EzUBgc\ni5NWwvT1qW7erPqtb6mefnpyYTnmmMhYzCOPRMZm8oSJTomKjj9sS9AExtbW1phwNEFjILW1tXrz\nzTcPKAzl5eVZcxi46667dMWKFSmfPxwOx0wcDYVCunTp0phoCTU1NTEi5kUsSBaq5lBLJ/Fv3iuX\nzC066HMaaDKpxUkb5vT1qW7dqvqDH6h+/OPJheWII1Qvukj15z9X3bMn35YHYqJTgqITX1F1dnbq\n0qVLtaWlJVppLV++PKZijG+tlJWV6bHHHjskwfDHchvKNlhBO/roo2MiHDQ0NGhdXV1CkUq2nIFq\n8jogaGZ/UJDURAwUB83ipA0jXnlF9Uc/Uv3EJ5J/qcrKVP/2b1V/8hPVnTvzbfWgMdEpAdGJ736J\nH/j2z8XxR4rOt/dZNrdjjjkmxt07kWNB/ERRVdVduxLXB0cd9Xi0e86LQpDoM0klzEyqAUEtTlqR\nsHu36k9/qvqZz6hWViYXl7POUv2Xf1F1wXCHCyY6w1x0grpfurq6ohM1vSgC3r143U0NDQ0Z9ybL\n1ea5dofD4RhxjW+thUIhDYfDOn78+ECB9dYBUlW98MLEdcNzz/VoW1tb1MMv0aTRdMdeBhIoi5NW\nYLz1lupDD6l+4Quqo0YlF5aPfUz1//5f1RdeKBn/eBOdYS46QY4C/kpxOGzJAn/68wSJqDdBNb6r\nL+JwkLiuSBTA0+/959ngCULQCquJRMicAwqc7m7VX/9a9f/8H9Vx45ILyymnqP7zP0cis1pgPBOd\ntG+wgEUnaN6N1/0S7wZcTNu4ceP6LUud6lZXV5cwpI8nDhHxSlx3JFol1b9sdnt7e4yXnxcyZ/ny\n5YFegkFu2OYcUCD8z/+orlyp+pWvqNbWJheWhgbVf/xH1aeeisyBMRJiojPMRMcfu8sLMNnS0qJL\nly6Nuj77B9EHsyWbD5OrLdGSBQNtS5cu1c2bN/drFZWXn5qkLvltdCmDIE8/79l6raTKysroeI5/\nhVQv9lxlZWWM+Ptt8Ue3TnUdHiMDfPCB6po1ql/7mmpjY3JhqamJLL/6+OOq776bb8uLFhOdYSI6\n3mTG+ACW/nkp3pIE3/rWtwZdaZeVlelXvvKVvItOOltZWZmuXLnSN6fozYT1yjHHzOpX3osS7fdi\n27x5s1ZXV8cIhycQ8ZNFvS0cDkcjUXd3d8fEdRvqOjxGEg4cUP3xj5MLireNHav6d3+n+uijqvv3\n59vyYYmJzjAQnUSVnFfhxotQui2dYt2OOuqopPVM0GJtQZsnDPHLIQAxyyWoar+xnfjjqhqNWOBv\nOfm7Rf3OAeYaPQB9faoPPjjwwL23ffzjqr/8peqbb+bb8pLDRGcYiE78QmQQmbtSV1enEydOTFiJ\nFuvYTupbsnonkmft2rUJlyGoqanR8vJyraysjJnT9PnPfz4m37hx46Lx1jz8rZWGhoZo997y5cu1\ns7Ozn7h4XXcDTQJN5jqdStdb0XfRPfGE6qRJqQkLqM6apfr00/m22vCRN9EBfgrsBV70pY0CVgGd\nwJPASN+xhcAOYBtwji/9FOBFYDtwpy+9Amh2ZTYAE3zHLnX5O4FLfOk1wEZ3bAUQTmJ/wYhOd3d3\nv/AuY8eO1YaGhuiqnEGtm1TD0xTP9ukk9c9jgWVCoVA0DE55eXm0K62hoUFbWlqibtAbNmyICQDq\nbRUVFdrp5lHEV+je0geNjY1aVlYWncTqLevgF49UWjKJXKNT7Xrz5wuKIVcwbNyoetppqQvLlCmR\nAX+jKMin6HwMODlOdBYDX3P7NwK3uf0G4Hkg7ITh94C4Y08Dp7n9x4F5bv9K4C63fzHQrIeE7Q/A\nSOAob98dexi40O0vA/4+if0FJTr+5ZS9ytRfiV111VUxM/eHTysnWX2UmqfbxRdfrD/7
2c90yZIl\n2tnZGbOMtr/ba8mSJTHlvvjFL0Yr7kQVf6JWVLy4DGWSZ6pdb/G2xC/NnVO2blU9++zUheWYY1Qf\nfrhk5rIMZ/ImOpFrMzFOdF4Gxrj9scDLbv8m4EZfvieAWS7PVl/6AmCZ218JzHL7IeCN+Dx6SFwu\ndvtvAmVu/3RgZRLb8yY6/rhp69ev15aWln4VWk1NjTY0NETf3vMvDrkSmoHLJ2vh1dfXx3iWea2C\ncDis9fX10Wc5YsSImJZCMo8zb7zI39IJapWkO8kzVcGKd14Ih8PZHRt67TXVBQtSFxYR1bvuigz8\nG8OWQhOdfXHH97m/PwQ+50u/F/gMcCqwypf+MaDV7b8EjPMd2wGMBm4AbvalfwO4Hjga2O5Lr/bb\nFmB7XkTHX4l5Yw1B809ERCdNmqTnnXde3lbnzNwmQxYa/5ZsWWtAm5ubY5bh9keP9oKBequH+j+X\nRPHUPDHp7OzUe+65J2b5hEx+L1I5p3/57Yx4wf33f6teeWXqwgKq3/2uuRyXMNkSnTCZQTN0HgDJ\nUJ688vTTT7NlyxYOHjzIwYMHAdi5c2e/fKrKH//4R/74xz/m2sQMcRXw4wTHHgc+mfaZ33777aTH\nX3vtNQ4cOABEnmNtbS07d+5k4sSJvPLKK/T19bFjxw46OjqYPn06W7ZsYeLEidHy7777Lj09PVRV\nVQFQVVXF6aefDkBdXV3adifDf41kjBs3jueeey5qu2djUt5+Gz75SdiwIXWDrr8evvlNGDky9TKG\nMQTSFZ29IjJGVfeKyFjgDZfeBZzgy1ft0hKl+8vsFpEQcKSq7hORLqAprsxqVX1LREaKSJmq9sWd\nKym33HILTU1NNDU1DZg3XXp6enj66af5yle+EhWb8vJy+vr6OO644/jTn/6UtWvnjmTvGKOB5GKR\nLmVlZfT19QEwbdo07rvvvugzPuGEE3j44Yf5r//6L4444giuvPJK3n//fcLhMKNHj2b27Nl0dHQw\nduxYdu/eTV9fH6+88gqnnXYazzzzDOPGjQu8Zk9PD1u2bKGxsTG1ij+DBApUTw8ceeTgTvSlL8F3\nvwvHH58544xhx5o1a1izZk32L5RKc4iIU8BLvv8X48ZuCHYkqAAmEetIsBGYSaSV8jgw36VfxSFH\nggUEOxJ4+0e5Yw9zaHxnGXBFEtuz3r3mTfj82c9+pmPGjOnXDeTFEzvmmGMKoAss3S1z3Wbpbv7o\nAf61dSDicBE0JlZeXh64Eqk/jz9gaPznmouJnv3coz/4YHDdYN72xBNZsc8oTchS91oqgvMQsBt4\nH3gN+JITgXYirsyrPDFw+RcSEZt4l+lTiYzf7ACW+NJHAI+49I1Aje/YZS59O7Eu05OIeMNtJyJA\n5Unsz6roJJvwWdxbRUEIjbdNmDAhOhYzfvx43bx584ATRf1zdqqrq6Pp8WNnoVAocKA+qxM9+/oi\nnl6DFZbvfCdzNhhGEsiX6BT7lm3R8S9/XPzbE0nquyfzatvxxx+va9eujVkOYe3atTFu0/6WTn19\nfXTuzowZM6KeaF6sO/9icIkWaMvIGjjz5w9eWObMCbSlqCeLGkWHiU4Bik5QmJXi25LVf4cXgH2H\ntvjuyXA4HA2O6kULaG9vj0Yd6O7u1uXLl8e8FFRXV0fLxOcNqtRT8jb72tcGLyxHHqna15eSsFk8\nNyMfmOgUoOjET0Isni1ZfZhv2xJvXoQAf9o999wT+Nn4K2r/i0FQN1lKlfq99w5eWED1vfcG/B4N\nJGwWz83IByY6BSQ6XV1dumTJEp00aVLeK+LUtmOKVmj8W21trf74xz+OCo+3REEQ8Ut9V1dXJ2xN\neHkXpCMqEJkDk0VsqWsjH2RLdDzPsmGLGzQGIBP3un37dhobG6PzQwqXZPe6GpibK0NiGDlyJBUV\nFbz55psplwmHw0yYMIERI0awY8cOpkyZwpVXXskFF1yQ0NV59+7dnHjiibz33ntUVlbyu9/9jn37\n9vGh/fs5bP78wRu+fTtMmTL4chmip6dncHN2DGOIiAiqmvE5kSY6g6Cnp4epU6cW8FybZPdXDvTm\nypCMUV9fzw9/+ENUlXPPPZfe3l7Ky8tZu3ZtdA5Lv7k027fD1KmDvtaOb3+bsddcY5W6YZA90clU\nRIKS4Omnn2bPnj35NiOOZEJT8IEbkjJhwgRuu+02Zs6cCcD06dPZunUrU6dO5b09e0Ai91cF/HWq\nJ73hBvjBDwIP5a8dYxilg7V0BsB7iz766KOZO3cuXV0pBT/IIvXA1iTHi1towuEwvb2RFlkZcDCd\nk4wcCe+8Q09PD5s2beLaa6/l5ZdfZvr06axbt85aMoaRAtlq6ZRl+oTDiZ6eHmbPns2cOXOYPn16\nHgXHP54eLzi/JCI03lZ8+O/uQG9vdH8gwakoL+e37e2cPGMGFeXlnDxjBj3d3fDOO0AkjMxhhx3G\ntm3b6O3tpaOjg46Ojqzei2EYyTHRScLq1at56aWX6O3tjb595w5/VRyPX2S+mEuj0iaZW1oyykMh\nTp4xg/a2NsrD4Zg7nzp1KjNnzmTdunWsXbs2sBUzceJEysvLgUMOCYZh5A8TnQTs3r2bCy+8MBpg\nMjekKjSFSbrCUkns3VWOGEF5OMxJjY2sfPJJ1q1bx6xZs6ipqYmWKSsr48477wRgy5YtCb26du7c\nGX1hOHjwIK+99trQbtIwjCFhopOAX/7yl3zwwQdZvkoTxSY06QrLZGIFZfKJJ0bv7H1fvlAoxK9+\n9SvWrVvH+vXrOfPMM6mqqqKqqor/+I//YPLkyYTDYU466STq6+uj3Z+zZ8+mp6en33UbGxuZPn06\n5eXlNDQ0MH369Aw8BcMw0iYbk38KacNXLw6EFwpl7dq1WZzkmGye4TdyPuEyaHsozUmSfxt3njlz\n5sT8f/XVVydcEhpIKcyLf/Z+qjP1013p0zBKGVdnZr5OzsZJC2lLVXSyGy06WV2dH2G5Pk1heXQQ\n17jsssuiIWgqKiq0s7MzGm4mPu8JJ5wQjYOWKjZT3zCyR7ZEx7rXHN5Kn5nDX6fGk5tus4+QWBFu\nH6CsJNg+m+K1RYR58+ZFF1nr6+tj3759rFu3jpUrV1JfXx/NO2HCBDZu3BjtSkuVqqqqpE4EhmEU\nHiU/T8db6XPNmjV897vfHcKVLgaak1kyhHMnpgroTrNspi0KhULccsst7Nu3jyuuuILjjz+e2bNn\ns3XrVhoaGmKEwZtDAzBz5kwTDMMoMCwMTpokE52enh5mzpzJyy+/nObZkz27i4msTZcZ0v2UstWW\n+vKXv8wVV1zBBRdcwKuvvgpEBu3Xr18fIyAWM8wwihMTnTRJJjqtra2cf/75gzxjsuc1tM+n0IQl\nEXV1dTz77LNUVVVZi8UwhikWey0LeG/oA5M5oUlXWEYA2XTgvv7669m9ezeXXXYZ//AP/8Crr77K\n2LFjeeihh+js7GTOnDns2rULiBWXqqoqzjz
zzCxaZhjGcKJkWzo9PT2sXr2az372swHLFNQAf0xw\nxm1AQ9JrpvtE64F0O/oGQ1lZxH/kxBNP5Ktf/Sqf+cxnYpYIsC4xwzCsey0AEZkP3ElkkutPVXVx\nQJ5+ouPFVOvo6KCuro4TTzyR3/zmaOC+BFc6Dohd/2UTcFoaNv9/wD1plBuIRYsWcdddd/Hmm28y\nevRoPvvZz3LDDTcAcN9993HhhRfy9ttvA5HlAl577TUTFcMwEmKiE4eIlAHbgTOB3cAzwAJVfTku\nXz/R2bBhA3PmzKG3tx54MdEVOCnJ0WT8B5FYA5nirLPOYtasWYwfP54XXniBkSNHUlVVxYQJE+jp\n6Ym2VAqthbJmzRqamprybUbamP35xezPLzam05+ZwA5V3QkgIs3A+aTQQ9XY2MiIERvo7f0IFbzP\na0xgDG8M6uLrgY+mYbTHlClT6Orq4t1330VEmDVrFocffjjV1dWcc845/OhHP6KqqoqamhqWL1+e\n0jmrqqqiC5sVAsX+ozP784vZPzwpZtEZD+zy/f86ESEakKqqKvbMv4MjHnsoab4bge+nbR4cfvjh\nfOMb3+Avf/kL27dvZ//+/XzkIx/hqquuGrBl8rnPfQ6AW265ZQgWGIZhFBbFLDqDYu3atTH/H/H1\nf4S/PYc/f+xjfOyCC6IrUt5www2sX7+eo446igceeAD27u13rqqqKmbOnMkpp5xCQ0MDu3bt4oMP\nPqCiooKTTz6ZU089NaUxk0JrmRiGYWSbYh7TOR24RVXnu/9vIhIraHFcvuK8QcMwjDxjjgQ+RCQE\ndBJxJPgTEYey/62q2/JqmGEYhpGQou1eU9WDIvIPwCoOuUyb4BiGYRQwRdvSMQzDMIqPYbu0gYjM\nF5GXRWS7iNyYb3s8RKRaRJ4SkQ4ReUlEvurSR4nIKhHpFJEnRWSkr8xCEdkhIttE5Bxf+iki8qK7\nxztzeA9lIvKciLQWoe0jReRXzp4OEZlVZPZfJyJb3LUfFJGKQrZfRH4qIntF5EVfWsbsdfff7Mps\nEJEJObD/+86+F0TkMRE5spjs9x27QUT6RGR0Tu3PxiI9+d6IiOnvgYlAOfACMC3fdjnbxgInu/0j\niIxLTQMWA19z6TcCt7n9BuB5Il2hNe6+vBbq08Bpbv9xYF6O7uE64JdAq/u/mGz/OfAltx8GRhaL\n/cA44BWgwv3/MHBpIdsPfAw4GXjRl5Yxe4Ergbvc/sVAcw7sPwsoc/u3Ad8rJvtdejWwkki8r9Eu\nrT4X9mf9R56PDTgdeML3/03Ajfm2K4Gt/+a+xC8DY1zaWODlINuBJ4BZLs9WX/oCYFkO7K0G2ogE\nXfBEp1hsPxL4Q0B6sdg/DtgJjHIVQ2sxfHeIvPz5K+2M2Uuk4pzl9kPAm9m2P+7Yp4BfFJv9wK+A\nk4gVnZzYP1y714Imjo7Pky0JEZEaIm8hG4n8CPcCqOoeIgHfoP+9dLm08UTuyyNX93gH8E/ExjUt\nFtsnAf8tIve57sG7ReQwisR+Vd1NZNHX15wt+1W1nSKx38dxGbQ3WkZVDwLv+LuLcsDfEXnzj7HF\nUZD2i8h5wC5VfSnuUE7sH66iU/CIyBHAo8A1qvpn+genLjgPDxH5JLBXVV8g+ZoOBWe7IwycAvxY\nVVTp2ZMAAAJHSURBVE8B/kLk7a7gnz2AiBxFJNTTRCKtnsNF5PMUif1JyKS9OVteSkS+DhxQ1RWZ\nPG0Gz9X/5CJ/BdwMLMrWJQbKMFxFpwvwD2hVu7SCQETCRATnF6ra4pL3isgYd3wsRIPBdQEn+Ip7\n95IoPZt8FDhPRF4BVgBzReQXwJ4isB0ib2i7VPVZ9/9jRESoGJ49RLrSXlHVfe6t8l+BMyge+z0y\naW/0mETm7h2pqvuyZ3oEEbkM+ATwOV9yMdh/IpHxmt+JyB+dLc+JyHEkrjczav9wFZ1ngMkiMlFE\nKoj0Qbbm2SY/PyPSR7rEl9YKXOb2LwVafOkLnJfIJGAysMl1S+wXkZkiIsAlvjJZQVVvVtUJqlpL\n5Jk+papfBH5T6LY7+/cCu0SkziWdCXRQBM/e8RpwuohUuuueCWwtAvuF2DfgTNrb6s4BcCHwVLbt\nl8iSKv8EnKeq7/vyFbz9qrpFVceqaq2qTiLyIvZhVX3D2XJx1u3P9KBVoWzAfCKeYTuAm/Jtj8+u\njwIHiXjUPQ8852wdDbQ7m1cBR/nKLCTiSbINOMeXfirwkrvHJTm+j//FIUeCorEdmEHkpeQF4NdE\nvNeKyf5FzpYXgfuJeGcWrP3AQ0SWHnmfiGh+iYgjREbsJbKo7iMufSNQkwP7dxBx6HjObXcVk/1x\nx1/BORLkyn6bHGoYhmHkjOHavWYYhmEUICY6hmEYRs4w0TEMwzByhomOYRiGkTNMdAzDMIycYaJj\nGIZh5AwTHcMwDCNnmOgYhmEYOeP/B6B1gHehiEZaAAAAAElFTkSuQmCC\n", 552 | "text/plain": [ 553 | "" 554 | ] 555 | }, 556 | "metadata": {}, 557 | "output_type": "display_data" 558 | } 559 | ], 560 | "source": [ 561 | "import matplotlib.pyplot as plt\n", 562 | "%matplotlib inline\n", 563 | "plt.plot(simple_feature_matrix,output,'k.',\n", 564 | " simple_feature_matrix,predict_output(simple_feature_matrix, simple_weight_0_penalty),'b-',\n", 565 | " simple_feature_matrix,predict_output(simple_feature_matrix, simple_weight_high_penalty),'r-')" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "### RSS function" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 104, 578 | "metadata": { 579 | "collapsed": false 580 | }, 581 | "outputs": [], 582 | "source": [ 583 | "def RSS (predicted_output, true_output):\n", 584 | " residuals = predicted_output - true_output\n", 585 | " residuals_squared = residuals * residuals\n", 586 | " residuals_sum_of_squares = residuals_squared.sum()\n", 587 | " return 
residuals_sum_of_squares" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "### RSS on the TEST data: (Simple model)\n", 595 | "1. The initial weights (all zeros)\n", 596 | "2. The weights learned with no regularization\n", 597 | "3. The weights learned with high regularization" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 105, 603 | "metadata": { 604 | "collapsed": false 605 | }, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "For simple model and initial weights:\n", 612 | "Weight (Coefficients): [ 0. 0.]\n", 613 | "RSS: 1.78427328252e+15\n" 614 | ] 615 | } 616 | ], 617 | "source": [ 618 | "predictions = predict_output(simple_test_feature_matrix, initial_weights)\n", 619 | "print \"For simple model and initial weights:\" \n", 620 | "print \"Weight (Coefficients): \" + str(initial_weights)\n", 621 | "print \"RSS: \" + str(RSS(predictions, test_output))" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 106, 627 | "metadata": { 628 | "collapsed": false 629 | }, 630 | "outputs": [ 631 | { 632 | "name": "stdout", 633 | "output_type": "stream", 634 | "text": [ 635 | "For simple model and weights with no regularization:\n", 636 | "Weight (Coefficient): [ -1.63113501e-01 2.63024369e+02]\n", 637 | "RSS: 2.75723634598e+14\n" 638 | ] 639 | } 640 | ], 641 | "source": [ 642 | "predictions = predict_output(simple_test_feature_matrix, simple_weight_0_penalty)\n", 643 | "print \"For simple model and weights with no regularization:\"\n", 644 | "print \"Weight (Coefficient): \" + str(simple_weight_0_penalty)\n", 645 | "print \"RSS: \" + str(RSS(predictions, test_output))" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 107, 651 | "metadata": { 652 | "collapsed": false 653 | }, 654 | "outputs": [ 655 | { 656 | "name": "stdout", 657 | "output_type": "stream", 658 | "text": [ 659 | "For simple model and weights with regularization:\n", 660 | "Weight (Coefficient): [ 9.76730383 124.57217565]\n", 661 | "RSS: 6.94642100914e+14\n" 662 | ] 663 | } 664 | ], 665 | "source": [ 666 | "predictions = predict_output(simple_test_feature_matrix, simple_weight_high_penalty)\n", 667 | "print \"For simple model and weights with regularization:\"\n", 668 | "print \"Weight (Coefficient): \" + str(simple_weight_high_penalty)\n", 669 | "print \"RSS: \" + str(RSS(predictions, test_output))" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "metadata": {}, 675 | "source": [ 676 | "### Which weights perform the best?\n", 677 | "#### The weights with no regularization seem to perform the best for now!" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "### Multiple regression with L2 penalty (Regularization)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": {}, 690 | "source": [ 691 | "Let us now consider a model with 2 features: `['sqft_living', 'sqft_living15']`." 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": {}, 697 | "source": [ 698 | "First, create Numpy versions of your training and test data with these two features. " 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 108, 704 | "metadata": { 705 | "collapsed": true 706 | }, 707 | "outputs": [], 708 | "source": [ 709 | "model_features = ['sqft_living', 'sqft_living15'] # sqft_living15 is the average squarefeet for the nearest 15 neighbors. 
\n", 710 | "my_output = 'price'\n", 711 | "(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)\n", 712 | "(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "We need to re-inialize the weights, since we have one extra parameter. Let us also set the step size and maximum number of iterations." 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 109, 725 | "metadata": { 726 | "collapsed": true 727 | }, 728 | "outputs": [], 729 | "source": [ 730 | "initial_weights = np.array([0.0,0.0,0.0])\n", 731 | "step_size = 1e-12\n", 732 | "max_iterations = 1000" 733 | ] 734 | }, 735 | { 736 | "cell_type": "markdown", 737 | "metadata": {}, 738 | "source": [ 739 | "#### Learned weights with no regulariztion" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 110, 745 | "metadata": { 746 | "collapsed": false 747 | }, 748 | "outputs": [ 749 | { 750 | "name": "stdout", 751 | "output_type": "stream", 752 | "text": [ 753 | "Starting gradient descent with l2_penalty = 0.0\n", 754 | "Iteration = 1\n", 755 | "Cost function = 7.43305185103e+15\n", 756 | "Iteration = 2\n", 757 | "Cost function = 4.0567523315e+15\n", 758 | "Iteration = 3\n", 759 | "Cost function = 2.52956511433e+15\n", 760 | "Iteration = 4\n", 761 | "Cost function = 1.83855669428e+15\n", 762 | "Iteration = 5\n", 763 | "Cost function = 1.52567557521e+15\n", 764 | "Iteration = 6\n", 765 | "Cost function = 1.38378949867e+15\n", 766 | "Iteration = 7\n", 767 | "Cost function = 1.31923260628e+15\n", 768 | "Iteration = 8\n", 769 | "Cost function = 1.28964887203e+15\n", 770 | "Iteration = 9\n", 771 | "Cost function = 1.27588472408e+15\n", 772 | "Iteration = 10\n", 773 | "Cost function = 1.26927880758e+15\n", 774 | "Iteration = 20\n", 775 | "Cost function = 1.25781238632e+15\n", 776 | "Iteration = 30\n", 777 | "Cost function = 1.25195457127e+15\n", 778 | "Iteration = 40\n", 779 | "Cost function = 1.24675542316e+15\n", 780 | "Iteration = 50\n", 781 | "Cost function = 1.24213950875e+15\n", 782 | "Iteration = 60\n", 783 | "Cost function = 1.23804140114e+15\n", 784 | "Iteration = 70\n", 785 | "Cost function = 1.23440301346e+15\n", 786 | "Iteration = 80\n", 787 | "Cost function = 1.23117277498e+15\n", 788 | "Iteration = 90\n", 789 | "Cost function = 1.22830490006e+15\n", 790 | "Iteration = 100\n", 791 | "Cost function = 1.22575873926e+15\n", 792 | "Iteration = 200\n", 793 | "Cost function = 1.21173888142e+15\n", 794 | "Iteration = 300\n", 795 | "Cost function = 1.20747308096e+15\n", 796 | "Iteration = 400\n", 797 | "Cost function = 1.20617512577e+15\n", 798 | "Iteration = 500\n", 799 | "Cost function = 1.20578019023e+15\n", 800 | "Iteration = 600\n", 801 | "Cost function = 1.20566001447e+15\n", 802 | "Iteration = 700\n", 803 | "Cost function = 1.20562343925e+15\n", 804 | "Iteration = 800\n", 805 | "Cost function = 1.20561230098e+15\n", 806 | "Iteration = 900\n", 807 | "Cost function = 1.20560890236e+15\n", 808 | "Iteration = 1000\n", 809 | "Cost function = 1.20560785866e+15\n", 810 | "Done with gradient descent at iteration 1000\n", 811 | "Learned weights = [ -0.35743482 243.0541689 22.41481594]\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "l2_penalty = 0.0\n", 817 | "multiple_weights_0_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, \n", 818 | " l2_penalty, max_iterations)" 819 | ] 820 | }, 
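Both the one-feature and two-feature runs build their matrices with `get_numpy_data`, another helper defined earlier in the notebook and not shown in this section. Judging from how it is called, and from the fact that `weights[0]` is treated as an intercept attached to a constant feature, it is assumed to look roughly like the sketch below (the exact body may differ):

```python
def get_numpy_data(data_sframe, features, output):
    # Assumed reconstruction: prepend a column of 1s so weights[0] acts as the intercept.
    data_sframe['constant'] = 1
    features = ['constant'] + list(features)
    feature_matrix = data_sframe[features].to_numpy()   # shape (n_rows, n_features + 1)
    output_array = data_sframe[output].to_numpy()       # shape (n_rows,)
    return (feature_matrix, output_array)
```

The returned matrix has one more column than the feature list passed in, which is why `initial_weights` needs two entries for the one-feature model and three for the two-feature model.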
821 | { 822 | "cell_type": "markdown", 823 | "metadata": {}, 824 | "source": [ 825 | "#### Learned weights with regulariztion" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 111, 831 | "metadata": { 832 | "collapsed": false 833 | }, 834 | "outputs": [ 835 | { 836 | "name": "stdout", 837 | "output_type": "stream", 838 | "text": [ 839 | "Starting gradient descent with l2_penalty = 1e+11\n", 840 | "Iteration = 1\n", 841 | "Cost function = 7.43305185103e+15\n", 842 | "Iteration = 2\n", 843 | "Cost function = 4.46048979029e+15\n", 844 | "Iteration = 3\n", 845 | "Cost function = 3.79667446884e+15\n", 846 | "Iteration = 4\n", 847 | "Cost function = 3.64831953044e+15\n", 848 | "Iteration = 5\n", 849 | "Cost function = 3.61509110322e+15\n", 850 | "Iteration = 6\n", 851 | "Cost function = 3.60760274251e+15\n", 852 | "Iteration = 7\n", 853 | "Cost function = 3.60588632216e+15\n", 854 | "Iteration = 8\n", 855 | "Cost function = 3.60547487453e+15\n", 856 | "Iteration = 9\n", 857 | "Cost function = 3.60536516777e+15\n", 858 | "Iteration = 10\n", 859 | "Cost function = 3.60532940218e+15\n", 860 | "Iteration = 20\n", 861 | "Cost function = 3.60529428102e+15\n", 862 | "Iteration = 30\n", 863 | "Cost function = 3.60529353727e+15\n", 864 | "Iteration = 40\n", 865 | "Cost function = 3.60529308275e+15\n", 866 | "Iteration = 50\n", 867 | "Cost function = 3.60529263111e+15\n", 868 | "Iteration = 60\n", 869 | "Cost function = 3.60529217949e+15\n", 870 | "Iteration = 70\n", 871 | "Cost function = 3.60529172788e+15\n", 872 | "Iteration = 80\n", 873 | "Cost function = 3.60529127626e+15\n", 874 | "Iteration = 90\n", 875 | "Cost function = 3.60529082465e+15\n", 876 | "Iteration = 100\n", 877 | "Cost function = 3.60529037303e+15\n", 878 | "Iteration = 200\n", 879 | "Cost function = 3.6052858569e+15\n", 880 | "Iteration = 300\n", 881 | "Cost function = 3.60528134078e+15\n", 882 | "Iteration = 400\n", 883 | "Cost function = 3.60527682468e+15\n", 884 | "Iteration = 500\n", 885 | "Cost function = 3.60527230859e+15\n", 886 | "Iteration = 600\n", 887 | "Cost function = 3.60526779252e+15\n", 888 | "Iteration = 700\n", 889 | "Cost function = 3.60526327646e+15\n", 890 | "Iteration = 800\n", 891 | "Cost function = 3.60525876041e+15\n", 892 | "Iteration = 900\n", 893 | "Cost function = 3.60525424438e+15\n", 894 | "Iteration = 1000\n", 895 | "Cost function = 3.60524972836e+15\n", 896 | "Done with gradient descent at iteration 1000\n", 897 | "Learned weights = [ 6.7429658 91.48927361 78.43658768]\n" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "l2_penalty = 1e11\n", 903 | "multiple_weights_high_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, \n", 904 | " l2_penalty, max_iterations)" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "### RSS on the TEST data: (Multiple model)\n", 912 | "1. The initial weights (all zeros)\n", 913 | "2. The weights learned with no regularization\n", 914 | "3. The weights learned with high regularization" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": 112, 920 | "metadata": { 921 | "collapsed": false 922 | }, 923 | "outputs": [ 924 | { 925 | "name": "stdout", 926 | "output_type": "stream", 927 | "text": [ 928 | "For multiple model and initial weights:\n", 929 | "Weight (Coefficients): [ 0. 0. 
0.]\n", 930 | "RSS: 1.78427328252e+15\n" 931 | ] 932 | } 933 | ], 934 | "source": [ 935 | "predictions = predict_output(test_feature_matrix, initial_weights)\n", 936 | "print \"For multiple model and initial weights:\" \n", 937 | "print \"Weight (Coefficients): \" + str(initial_weights)\n", 938 | "print \"RSS: \" + str(RSS(predictions, test_output))" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 113, 944 | "metadata": { 945 | "collapsed": false 946 | }, 947 | "outputs": [ 948 | { 949 | "name": "stdout", 950 | "output_type": "stream", 951 | "text": [ 952 | "For multiple model and weights with no regularization:\n", 953 | "Weight (Coefficients): [ -0.35743482 243.0541689 22.41481594]\n", 954 | "RSS: 2.74067618287e+14\n" 955 | ] 956 | } 957 | ], 958 | "source": [ 959 | "predictions = predict_output(test_feature_matrix, multiple_weights_0_penalty)\n", 960 | "print \"For multiple model and weights with no regularization:\" \n", 961 | "print \"Weight (Coefficients): \" + str(multiple_weights_0_penalty)\n", 962 | "print \"RSS: \" + str(RSS(predictions, test_output))" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 114, 968 | "metadata": { 969 | "collapsed": false 970 | }, 971 | "outputs": [ 972 | { 973 | "name": "stdout", 974 | "output_type": "stream", 975 | "text": [ 976 | "For multiple model and weights with regularization:\n", 977 | "Weight (Coefficients): [ 6.7429658 91.48927361 78.43658768]\n", 978 | "RSS: 5.0040480058e+14\n" 979 | ] 980 | } 981 | ], 982 | "source": [ 983 | "predictions = predict_output(test_feature_matrix, multiple_weights_high_penalty)\n", 984 | "print \"For multiple model and weights with regularization:\" \n", 985 | "print \"Weight (Coefficients): \" + str(multiple_weights_high_penalty)\n", 986 | "print \"RSS: \" + str(RSS(predictions, test_output))" 987 | ] 988 | }, 989 | { 990 | "cell_type": "markdown", 991 | "metadata": {}, 992 | "source": [ 993 | "### Which weights perform best?\n", 994 | "\n", 995 | "#### The weights with no regularization seem to perform the best in the multiple model too same as the simple model result!" 
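Note that the comparison above only tries the two extremes `l2_penalty = 0.0` and `l2_penalty = 1e11`, so "no regularization performs best" should be read as "best among these two settings". A natural follow-up, sketched below with an arbitrary, illustrative grid of penalties and reusing only functions and variables already defined in this notebook, is to sweep intermediate values and keep whichever gives the lowest RSS:

```python
# Hypothetical penalty sweep; the grid values are illustrative, not tuned.
best_penalty, best_rss = None, float('inf')
for penalty in [0.0, 1e3, 1e5, 1e7, 1e9, 1e11]:
    weights = ridge_regression_gradient_descent(feature_matrix, output,
                                                initial_weights, step_size,
                                                penalty, max_iterations)
    rss = RSS(predict_output(test_feature_matrix, weights), test_output)
    if rss < best_rss:
        best_penalty, best_rss = penalty, rss
print('Best l2_penalty = ' + str(best_penalty) + ' with test RSS = ' + str(best_rss))
```

Strictly speaking the penalty should be selected on a separate validation split rather than the test set; test-set RSS is used here only to mirror the comparison in the cells above.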
996 | ] 997 | }, 998 | { 999 | "cell_type": "markdown", 1000 | "metadata": {}, 1001 | "source": [ 1002 | "## House 1 price prediction from test dataset" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": 118, 1008 | "metadata": { 1009 | "collapsed": false 1010 | }, 1011 | "outputs": [ 1012 | { 1013 | "data": { 1014 | "text/plain": [ 1015 | "array([ 1.00000000e+00, 1.43000000e+03, 1.78000000e+03])" 1016 | ] 1017 | }, 1018 | "execution_count": 118, 1019 | "metadata": {}, 1020 | "output_type": "execute_result" 1021 | } 1022 | ], 1023 | "source": [ 1024 | "#House no.1 in the test dataset\n", 1025 | "test_feature_matrix[0]" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "execution_count": 77, 1031 | "metadata": { 1032 | "collapsed": false 1033 | }, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "text/plain": [ 1038 | "['sqft_living', 'sqft_living15']" 1039 | ] 1040 | }, 1041 | "execution_count": 77, 1042 | "metadata": {}, 1043 | "output_type": "execute_result" 1044 | } 1045 | ], 1046 | "source": [ 1047 | "#Features in consideration\n", 1048 | "model_features" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": 80, 1054 | "metadata": { 1055 | "collapsed": false 1056 | }, 1057 | "outputs": [ 1058 | { 1059 | "data": { 1060 | "text/plain": [ 1061 | "'price'" 1062 | ] 1063 | }, 1064 | "execution_count": 80, 1065 | "metadata": {}, 1066 | "output_type": "execute_result" 1067 | } 1068 | ], 1069 | "source": [ 1070 | "#Output feature to predict\n", 1071 | "my_output" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "code", 1076 | "execution_count": 119, 1077 | "metadata": { 1078 | "collapsed": false 1079 | }, 1080 | "outputs": [ 1081 | { 1082 | "data": { 1083 | "text/plain": [ 1084 | "array([ -0.35743482, 243.0541689 , 22.41481594])" 1085 | ] 1086 | }, 1087 | "execution_count": 119, 1088 | "metadata": {}, 1089 | "output_type": "execute_result" 1090 | } 1091 | ], 1092 | "source": [ 1093 | "#Weights with no regularization learned\n", 1094 | "multiple_weights_0_penalty" 1095 | ] 1096 | }, 1097 | { 1098 | "cell_type": "code", 1099 | "execution_count": 120, 1100 | "metadata": { 1101 | "collapsed": false 1102 | }, 1103 | "outputs": [ 1104 | { 1105 | "data": { 1106 | "text/plain": [ 1107 | "array([ 6.7429658 , 91.48927361, 78.43658768])" 1108 | ] 1109 | }, 1110 | "execution_count": 120, 1111 | "metadata": {}, 1112 | "output_type": "execute_result" 1113 | } 1114 | ], 1115 | "source": [ 1116 | "#Weights with regularization learned\n", 1117 | "multiple_weights_high_penalty" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "execution_count": 122, 1123 | "metadata": { 1124 | "collapsed": false 1125 | }, 1126 | "outputs": [ 1127 | { 1128 | "name": "stdout", 1129 | "output_type": "stream", 1130 | "text": [ 1131 | "The predicted house 1 price (weights with no regularization): \n" 1132 | ] 1133 | }, 1134 | { 1135 | "data": { 1136 | "text/plain": [ 1137 | "387465.47646474396" 1138 | ] 1139 | }, 1140 | "execution_count": 122, 1141 | "metadata": {}, 1142 | "output_type": "execute_result" 1143 | } 1144 | ], 1145 | "source": [ 1146 | "#Predicting the output using weights with no regularization\n", 1147 | "predicted_output = predict_output(test_feature_matrix[0:1], multiple_weights_0_penalty)\n", 1148 | "print \"The predicted house 1 price (weights with no regularization): \"\n", 1149 | "predicted_output[0]" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": 123, 1155 | "metadata": { 1156 | 
"collapsed": false 1157 | }, 1158 | "outputs": [ 1159 | { 1160 | "name": "stdout", 1161 | "output_type": "stream", 1162 | "text": [ 1163 | "The predicted house 1 price (weights with regularization): \n" 1164 | ] 1165 | }, 1166 | { 1167 | "data": { 1168 | "text/plain": [ 1169 | "270453.53030485858" 1170 | ] 1171 | }, 1172 | "execution_count": 123, 1173 | "metadata": {}, 1174 | "output_type": "execute_result" 1175 | } 1176 | ], 1177 | "source": [ 1178 | "#Predicting the output using weights with regularization\n", 1179 | "predicted_output = predict_output(test_feature_matrix[0:1], multiple_weights_high_penalty)\n", 1180 | "print \"The predicted house 1 price (weights with regularization): \"\n", 1181 | "predicted_output[0]" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": 125, 1187 | "metadata": { 1188 | "collapsed": false 1189 | }, 1190 | "outputs": [ 1191 | { 1192 | "name": "stdout", 1193 | "output_type": "stream", 1194 | "text": [ 1195 | "The actual house 1 price is: \n" 1196 | ] 1197 | }, 1198 | { 1199 | "data": { 1200 | "text/plain": [ 1201 | "310000.0" 1202 | ] 1203 | }, 1204 | "execution_count": 125, 1205 | "metadata": {}, 1206 | "output_type": "execute_result" 1207 | } 1208 | ], 1209 | "source": [ 1210 | "#Actual house 1 price\n", 1211 | "print \"The actual house 1 price is: \"\n", 1212 | "test_output[0]" 1213 | ] 1214 | } 1215 | ], 1216 | "metadata": { 1217 | "kernelspec": { 1218 | "display_name": "Python 2", 1219 | "language": "python", 1220 | "name": "python2" 1221 | }, 1222 | "language_info": { 1223 | "codemirror_mode": { 1224 | "name": "ipython", 1225 | "version": 2 1226 | }, 1227 | "file_extension": ".py", 1228 | "mimetype": "text/x-python", 1229 | "name": "python", 1230 | "nbconvert_exporter": "python", 1231 | "pygments_lexer": "ipython2", 1232 | "version": "2.7.13" 1233 | } 1234 | }, 1235 | "nbformat": 4, 1236 | "nbformat_minor": 0 1237 | } 1238 | -------------------------------------------------------------------------------- /simple-linear-regression/simple-linear-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple Linear Regression on House Sales data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Fire up Graphlab Create" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import graphlab" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Load in house sales data\n", 33 | "\n", 34 | "Dataset is from house sales in King County, the region where the city of Seattle, WA is located." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 17, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "sales = graphlab.SFrame('kc_house_data.gl/')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Explore house sales data" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 18, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/html": [ 65 | "
[HTML table rendering of sales[0:1] lost its markup during extraction; the same 1-row, 21-column preview follows in the text/plain output below]
" 127 | ], 128 | "text/plain": [ 129 | "Columns:\n", 130 | "\tid\tstr\n", 131 | "\tdate\tdatetime\n", 132 | "\tprice\tfloat\n", 133 | "\tbedrooms\tfloat\n", 134 | "\tbathrooms\tfloat\n", 135 | "\tsqft_living\tfloat\n", 136 | "\tsqft_lot\tint\n", 137 | "\tfloors\tstr\n", 138 | "\twaterfront\tint\n", 139 | "\tview\tint\n", 140 | "\tcondition\tint\n", 141 | "\tgrade\tint\n", 142 | "\tsqft_above\tint\n", 143 | "\tsqft_basement\tint\n", 144 | "\tyr_built\tint\n", 145 | "\tyr_renovated\tint\n", 146 | "\tzipcode\tstr\n", 147 | "\tlat\tfloat\n", 148 | "\tlong\tfloat\n", 149 | "\tsqft_living15\tfloat\n", 150 | "\tsqft_lot15\tfloat\n", 151 | "\n", 152 | "Rows: 1\n", 153 | "\n", 154 | "Data:\n", 155 | "+------------+---------------------------+----------+----------+-----------+\n", 156 | "| id | date | price | bedrooms | bathrooms |\n", 157 | "+------------+---------------------------+----------+----------+-----------+\n", 158 | "| 7129300520 | 2014-10-13 00:00:00+00:00 | 221900.0 | 3.0 | 1.0 |\n", 159 | "+------------+---------------------------+----------+----------+-----------+\n", 160 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 161 | "| sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above |\n", 162 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 163 | "| 1180.0 | 5650 | 1 | 0 | 0 | 3 | 7 | 1180 |\n", 164 | "+-------------+----------+--------+------------+------+-----------+-------+------------+\n", 165 | "+---------------+----------+--------------+---------+-------------+\n", 166 | "| sqft_basement | yr_built | yr_renovated | zipcode | lat |\n", 167 | "+---------------+----------+--------------+---------+-------------+\n", 168 | "| 0 | 1955 | 0 | 98178 | 47.51123398 |\n", 169 | "+---------------+----------+--------------+---------+-------------+\n", 170 | "+---------------+---------------+-----+\n", 171 | "| long | sqft_living15 | ... |\n", 172 | "+---------------+---------------+-----+\n", 173 | "| -122.25677536 | 1340.0 | ... 
|\n", 174 | "+---------------+---------------+-----+\n", 175 | "[1 rows x 21 columns]" 176 | ] 177 | }, 178 | "execution_count": 18, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "sales[0:1]" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Splitting the data" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 15, 197 | "metadata": { 198 | "collapsed": false 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "train_data, test_data = sales.random_split(.8,seed=0)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Simple linear regression algorithm - Part 1\n", 210 | "To calculate the intercept and the slope of the regression line" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 19, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# x is the input and y is the output\n", 222 | "def simple_linear_regression(x, y):\n", 223 | " \n", 224 | " # compute the sum of input and output\n", 225 | " sum = x + y\n", 226 | " \n", 227 | " # compute the product of the output and the input and its sum\n", 228 | " product = x * y\n", 229 | " sum_of_product = product.sum()\n", 230 | " \n", 231 | " # compute the squared value of the input and its sum\n", 232 | " x_squared = x * x\n", 233 | " sum_x_squared = x_squared.sum()\n", 234 | " \n", 235 | " # use the formula for the slope\n", 236 | " numerator = sum_of_product - ((x.sum() * y.sum()) / x.size())\n", 237 | " denominator = sum_x_squared - ((x.sum() * x.sum()) / x.size())\n", 238 | " slope = numerator / denominator\n", 239 | " \n", 240 | " # use the formula for the intercept\n", 241 | " intercept = y.mean() - (slope * x.mean())\n", 242 | " \n", 243 | " return (intercept, slope)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "We can test that our function works by passing it something where we know the answer. 
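For reference, the closed-form least-squares estimates that `simple_linear_regression` implements are

$$\hat{w}_1 = \frac{\sum_i x_i y_i - \tfrac{1}{N}\left(\sum_i x_i\right)\left(\sum_i y_i\right)}{\sum_i x_i^2 - \tfrac{1}{N}\left(\sum_i x_i\right)^2}, \qquad \hat{w}_0 = \bar{y} - \hat{w}_1\,\bar{x},$$

where $\hat{w}_0$ is the intercept and $\hat{w}_1$ the slope (the `sum = x + y` computed at the top of the function is never used afterwards and does not affect either estimate). With these formulas in mind, the synthetic test described next should recover an intercept and slope of exactly 1.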
In particular we can generate a feature and then put the output exactly on a line: output = 1 + 1\\*input_feature then we know both our slope and intercept should be 1" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 20, 256 | "metadata": { 257 | "collapsed": false, 258 | "scrolled": true 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "[0L, 1L, 2L, 3L, 4L]\n", 266 | "[1L, 2L, 3L, 4L, 5L]\n", 267 | "Intercept: 1.0\n", 268 | "Slope: 1\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "test_feature = graphlab.SArray(range(5))\n", 274 | "test_output = graphlab.SArray(1 + 1*test_feature)\n", 275 | "(test_intercept, test_slope) = simple_linear_regression(test_feature, test_output)\n", 276 | "print test_feature\n", 277 | "print test_output\n", 278 | "print \"Intercept: \" + str(test_intercept)\n", 279 | "print \"Slope: \" + str(test_slope)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "So now it works let's build a regression model for predicting price based on sqft_living" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 21, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "Intercept: -47116.0765749\n", 301 | "Slope: 281.958838568\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "sqft_intercept, sqft_slope = simple_linear_regression(train_data['sqft_living'], train_data['price'])\n", 307 | "\n", 308 | "print \"Intercept: \" + str(sqft_intercept)\n", 309 | "print \"Slope: \" + str(sqft_slope)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## Simple linear regression algorithm - Part 2\n", 317 | "To calculate the predicted output" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 22, 323 | "metadata": { 324 | "collapsed": false 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "def get_regression_predictions(input_feature, intercept, slope):\n", 329 | " # calculate the predicted values:\n", 330 | " predicted_values = intercept + (slope * input_feature)\n", 331 | " return predicted_values" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "**What is the predicted price for a house with 2650 sqft?**" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 23, 344 | "metadata": { 345 | "collapsed": false 346 | }, 347 | "outputs": [ 348 | { 349 | "name": "stdout", 350 | "output_type": "stream", 351 | "text": [ 352 | "The estimated price for a house with 2650 squarefeet is $700074.85\n" 353 | ] 354 | } 355 | ], 356 | "source": [ 357 | "my_house_sqft = 2650\n", 358 | "estimated_price = get_regression_predictions(my_house_sqft, sqft_intercept, sqft_slope)\n", 359 | "print \"The estimated price for a house with %d squarefeet is $%.2f\" % (my_house_sqft, estimated_price)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## Residual Sum of Squares" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": {}, 372 | "source": [ 373 | "RSS is the sum of the squares of the residuals which is the difference between the predicted output and the true output." 
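In symbols, for the simple model above,

$$\mathrm{RSS}(w_0, w_1) = \sum_{i=1}^{N}\left(y_i - \hat{y}_i\right)^2 = \sum_{i=1}^{N}\left(y_i - (w_0 + w_1 x_i)\right)^2,$$

which is exactly what `get_residual_sum_of_squares` in the next cell computes from an input feature column, the true prices, and a fitted intercept and slope.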
374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 24, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "def get_residual_sum_of_squares(input_feature, actual_output, intercept, slope):\n", 385 | " # First get the predictions\n", 386 | " predicted_output = intercept + (slope * input_feature)\n", 387 | "\n", 388 | " # then compute the residuals (since we are squaring it doesn't matter which order you subtract)\n", 389 | " residuals = actual_output - predicted_output\n", 390 | "\n", 391 | " # square the residuals and add them up\n", 392 | " residuals_squared = residuals * residuals\n", 393 | " residual_sum_squares = residuals_squared.sum()\n", 394 | "\n", 395 | " return(residual_sum_squares)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 25, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "The RSS of predicting Prices based on Square Feet is : 1.20191835632e+15\n" 410 | ] 411 | } 412 | ], 413 | "source": [ 414 | "rss_prices_on_sqft = get_residual_sum_of_squares(train_data['sqft_living'], train_data['price'], sqft_intercept, sqft_slope)\n", 415 | "print 'The RSS of predicting Prices based on Square Feet is : ' + str(rss_prices_on_sqft)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "### Function to predict the squarefeet of a house from a given price" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 26, 428 | "metadata": { 429 | "collapsed": true 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "def inverse_regression_predictions(output, intercept, slope):\n", 434 | " # solve output = intercept + slope*input_feature for input_feature. Use this equation to compute the inverse predictions:\n", 435 | " estimated_feature = (output - intercept) / slope\n", 436 | " return estimated_feature" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "**What is the estimated square-feet for a house costing $800,000?**" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 27, 449 | "metadata": { 450 | "collapsed": false 451 | }, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "The estimated squarefeet for a house worth $800000.00 is 3004\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "my_house_price = 800000\n", 463 | "estimated_squarefeet = inverse_regression_predictions(my_house_price, sqft_intercept, sqft_slope)\n", 464 | "print \"The estimated squarefeet for a house worth $%.2f is %d\" % (my_house_price, estimated_squarefeet)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Estimate house price from no. 
of bedrooms" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 28, 477 | "metadata": { 478 | "collapsed": false 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Intercept: 109473.180469\n", 486 | "Slope: 127588.952175\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "# Estimate the slope and intercept for predicting 'price' based on 'bedrooms'\n", 492 | "bedrm_intercept, bedrm_slope = simple_linear_regression(train_data['bedrooms'], train_data['price'])\n", 493 | "print \"Intercept: \" + str(bedrm_intercept)\n", 494 | "print \"Slope: \" + str(bedrm_slope)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "### Test Linear Regression Algorithm for Square feet and Bedrooms Model" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "**Which model (square feet or bedrooms) has lowest RSS on TEST data?**" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 29, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "275402936247141.3" 522 | ] 523 | }, 524 | "execution_count": 29, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "# Compute RSS when using bedrooms on TEST data:\n", 531 | "get_residual_sum_of_squares(test_data['sqft_living'], test_data['price'], sqft_intercept, sqft_slope)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 30, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "493364582868287.94" 545 | ] 546 | }, 547 | "execution_count": 30, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "# Compute RSS when using squarefeet on TEST data:\n", 554 | "get_residual_sum_of_squares(test_data['bedrooms'], test_data['price'], bedrm_intercept, bedrm_slope)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "# So the Square feet model has a lower RSS than the Bedrooms model." 562 | ] 563 | } 564 | ], 565 | "metadata": { 566 | "kernelspec": { 567 | "display_name": "Python 2", 568 | "language": "python", 569 | "name": "python2" 570 | }, 571 | "language_info": { 572 | "codemirror_mode": { 573 | "name": "ipython", 574 | "version": 2 575 | }, 576 | "file_extension": ".py", 577 | "mimetype": "text/x-python", 578 | "name": "python", 579 | "nbconvert_exporter": "python", 580 | "pygments_lexer": "ipython2", 581 | "version": "2.7.13" 582 | } 583 | }, 584 | "nbformat": 4, 585 | "nbformat_minor": 0 586 | } 587 | --------------------------------------------------------------------------------