├── README.md
├── learn-ml-2.ipynb
└── learn-ml-1.ipynb


/README.md:
--------------------------------------------------------------------------------
 1 | # Learn Machine Learning (Under Development)
 2 | 
 3 | ## Notebooks
 4 | 
 5 | * **learn-ml-1:** Loading data, selecting and filtering, DecisionTreeRegressor, RandomForestRegressor, validation
 6 | * **learn-ml-2:** get_dummies (categorical variables), missing values (fillna, dropna, Imputer)
 7 | 
 8 | ## Commit History
 9 | 
10 | * Commit 1: Added end-to-end pipeline for building a RandomForestRegressor model and testing it
11 | * Commit 2: Added categorical and missing-value handling to the existing pipeline


--------------------------------------------------------------------------------
/learn-ml-2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {
  5 |         "_uuid": "0707c903a31649998ec18de096c550e756deb570",
  6 |         "_cell_guid": "6db0c996-22b1-4911-bf2e-a63baa629a0f"
  7 |       },
  8 |       "cell_type": "markdown",
  9 |       "source": "# Preprocessing + Pipeline building\n\nThis notebook builds on where we left off in the last notebook"
 10 |     },
 11 |     {
 12 |       "metadata": {
 13 |         "_uuid": "617248ad982879b4a1708d5933b2420db18c9759",
 14 |         "collapsed": true,
 15 |         "_cell_guid": "e036d162-eb8f-4825-a029-73069a9a8f6d",
 16 |         "trusted": true
 17 |       },
 18 |       "cell_type": "code",
 19 |       "source": "import pandas as pd\n\ntrain_path = \"../input/train.csv\"\n\ndf = pd.read_csv(train_path)",
 20 |       "execution_count": 1,
 21 |       "outputs": []
 22 |     },
 23 |     {
 24 |       "metadata": {
 25 |         "_uuid": "341625ed14ba9372d7e7116a464972d388fe448f",
 26 |         "_cell_guid": "a5cea2e0-e283-4b6e-8cc3-f44c98cfb2a3"
 27 |       },
 28 |       "cell_type": "markdown",
 29 |       "source": "### Handling missing values\n\n* Drop columns with missing values\n* Imputation: Fills in missing values"
 30 |     },
 31 |     {
 32 |       "metadata": {
 33 |         "_uuid": "bd33256d0e7856b3a80ab5c36cdb201ffcceacb9",
 34 |         "_cell_guid": "b46d6a86-d089-41c1-979c-2e3e1a0f393f"
 35 |       },
 36 |       "cell_type": "markdown",
 37 |       "source": "#### Drop columns with missing values"
 38 |     },
 39 |     {
 40 |       "metadata": {
 41 |         "_uuid": "e28b9de9695f728a7ad9ec7f0c2f495e19590ce5",
 42 |         "_cell_guid": "7bbefc4a-fc97-4f3b-a15b-b08bac23b4a7",
 43 |         "trusted": true,
 44 |         "collapsed": true
 45 |       },
 46 |       "cell_type": "code",
 47 |       "source": "cols_with_missing_data = [col for col in df.columns if df[col].isnull().any()]\nreduced_original_data = df.drop(cols_with_missing_data, axis=1)",
 48 |       "execution_count": 2,
 49 |       "outputs": []
 50 |     },
 51 |     {
 52 |       "metadata": {
 53 |         "_uuid": "438c1a85fc01ca343c286caffbf340bce9c2564b",
 54 |         "_cell_guid": "79fafe25-820c-48f2-878d-5160d651fda4"
 55 |       },
 56 |       "cell_type": "markdown",
 57 |       "source": "Testing our model on this data"
 58 |     },
 59 |     {
 60 |       "metadata": {
 61 |         "_uuid": "fe6083fb45647d52120d20eeb3177cf6336d55fb",
 62 |         "_cell_guid": "82833593-ea32-4340-ad36-8d310908e6d7",
 63 |         "trusted": true
 64 |       },
 65 |       "cell_type": "code",
 66 |       "source": "from sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.model_selection import train_test_split\n\n# Target: the column we want to predict\ny = df[\"SalePrice\"]\n# Predictors: every column except the target\nX = df.drop(\"SalePrice\", axis=1)\n# For the sake of keeping the example simple, we'll use only numeric predictors.\n# Select them from X (not df): df still contains SalePrice, and keeping the target\n# among the predictors leaks the answer to the model and makes the MAE meaninglessly low.\nnumeric_predictors = X.select_dtypes(exclude=['object'])",
 67 |       "execution_count": 12,
 68 |       "outputs": []
 69 |     },
 70 |     {
 71 |       "metadata": {
 72 |         "_uuid": "889b55d9a9f9fea5a0b27da36498ecc6a35d2e39",
 73 |         "collapsed": true,
 74 |         "_cell_guid": "3d00f2cc-5b93-4a2e-9f97-7a8848980f54",
 75 |         "trusted": true
 76 |       },
 77 |       "cell_type": "code",
 78 |       "source": "def get_mae(train_X, train_y, test_X, test_y):\n    regressor = RandomForestRegressor(random_state=0)\n    regressor.fit(train_X, train_y)\n    predictions = regressor.predict(test_X)\n    error = mean_absolute_error(test_y, predictions)\n    return error",
 79 |       "execution_count": 4,
 80 |       "outputs": []
 81 |     },
 82 |     {
 83 |       "metadata": {
 84 |         "_uuid": "436af1157d7442e66708bef0e440a63b91fdfdfe",
 85 |         "collapsed": true,
 86 |         "_cell_guid": "874f8f5b-b81b-4d67-afa2-5fcfb17ee9aa",
 87 |         "trusted": true
 88 |       },
 89 |       "cell_type": "code",
 90 |       "source": "train_X, test_X, train_y, test_y = train_test_split(numeric_predictors, y, test_size=0.30, random_state=0)",
 91 |       "execution_count": 15,
 92 |       "outputs": []
 93 |     },
 94 |     {
 95 |       "metadata": {
 96 |         "_uuid": "e58373caed2a27626c8b0465ab28cb2fd57ac06a",
 97 |         "_cell_guid": "8cb9d92e-2898-41f6-a7fb-07b2ad734c68",
 98 |         "trusted": true
 99 |       },
100 |       "cell_type": "code",
101 |       "source": "cols_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()]\nreduced_train_X = train_X.drop(cols_with_missing, axis=1)\nreduced_test_X = test_X.drop(cols_with_missing, axis=1)\n\nprint(get_mae(reduced_train_X, train_y, reduced_test_X, test_y))",
102 |       "execution_count": 16,
103 |       "outputs": [
104 |         {
105 |           "output_type": "stream",
106 |           "text": "923.912785388\n",
107 |           "name": "stdout"
108 |         }
109 |       ]
110 |     },
111 |     {
112 |       "metadata": {
113 |         "_uuid": "34404c9732e45d25f1ce8fa54203f45b821d22cb",
114 |         "collapsed": true,
115 |         "_cell_guid": "0fc41176-4981-4d52-8b74-0a77c58a0cb0",
116 |         "trusted": false
117 |       },
118 |       "cell_type": "markdown",
119 |       "source": "### Imputation"
120 |     },
121 |     {
122 |       "metadata": {
123 |         "trusted": true,
124 |         "collapsed": true,
125 |         "_uuid": "1cccb3414d0161e20a84f7917fa34202a79e9a6a"
126 |       },
127 |       "cell_type": "code",
128 |       "source": "# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; SimpleImputer is its\n# replacement. Prefer it when available and fall back to the old name on older installs.\ntry:\n    from sklearn.impute import SimpleImputer as Imputer\nexcept ImportError:\n    from sklearn.preprocessing import Imputer\n\n# The default strategy replaces each missing value with the mean of its column\nimputer = Imputer()\n# Fit on the training split only, then reuse the fitted imputer on the test split\n# so that no information from the test rows is used for fitting.\n# Note: both results are NumPy arrays, not DataFrames.\nimputed_train_X = imputer.fit_transform(train_X)\nimputed_test_X = imputer.transform(test_X)",
129 |       "execution_count": 9,
130 |       "outputs": []
131 |     },
132 |     {
133 |       "metadata": {
134 |         "trusted": true,
135 |         "_uuid": "c33faf12095a64a45a9fe3fedffe76a35f7e8f20"
136 |       },
137 |       "cell_type": "code",
138 |       "source": "print(get_mae(imputed_train_X, train_y, imputed_test_X, test_y))",
139 |       "execution_count": 10,
140 |       "outputs": [
141 |         {
142 |           "output_type": "stream",
143 |           "text": "1003.7890411\n",
144 |           "name": "stdout"
145 |         }
146 |       ]
147 |     },
148 |     {
149 |       "metadata": {
150 |         "_uuid": "b99b2dbe26861fdf2aced8c8e29b8aaff938cf34"
151 |       },
152 |       "cell_type": "markdown",
153 |       "source": "### Handling categorical data"
154 |     },
155 |     {
156 |       "metadata": {
157 |         "trusted": true,
158 |         "_uuid": "c72b4f3440a3cf5658883fe9bc86dc95c9e43025"
159 |       },
160 |       "cell_type": "code",
161 |       "source": "from sklearn.model_selection import cross_val_score\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.pipeline import make_pipeline\n\n# NOTE: this redefines get_mae with a cross-validation variant taking (X, y);\n# re-run the earlier helper cell before re-running the hold-out cells above.\ndef get_mae(X, y):\n    \"\"\"Return the mean cross-validated MAE of a random forest predicting y from X.\n\n    Missing values are imputed inside the pipeline, so the imputer is fitted on\n    the training folds only.\n    \"\"\"\n    model = make_pipeline(Imputer(), RandomForestRegressor(n_estimators=50, random_state=0))\n    # The scorer yields negated MAE (higher is better), so flip the sign back\n    return -1 * cross_val_score(model, X, y, scoring = 'neg_mean_absolute_error').mean()\n\n# Start from the predictor DataFrame X rather than imputed_train_X: the imputed data is a\n# NumPy array (no select_dtypes) and holds only the training split, so it does not line up\n# with the full-length target y.\npredictors_without_categoricals = X.select_dtypes(exclude=[\"object\"])\n\n# One-hot encode the object (categorical) columns\none_hot_encoded_training_predictors = pd.get_dummies(X)\n\nprint(\"MAE without categoricals:\", get_mae(predictors_without_categoricals, y))\nprint(\"MAE with one-hot encoding:\", get_mae(one_hot_encoded_training_predictors, y))",
162 |       "execution_count": null,
163 |       "outputs": []
176 |     },
177 |     {
178 |       "metadata": {
179 |         "trusted": true,
180 |         "collapsed": true,
181 |         "_uuid": "7a999f2021bb9984bd7a673bbc8385b5b0c0aec0"
182 |       },
183 |       "cell_type": "code",
184 |       "source": "",
185 |       "execution_count": null,
186 |       "outputs": []
187 |     }
188 |   ],
189 |   "metadata": {
190 |     "language_info": {
191 |       "name": "python",
192 |       "version": "3.6.4",
193 |       "mimetype": "text/x-python",
194 |       "codemirror_mode": {
195 |         "name": "ipython",
196 |         "version": 3
197 |       },
198 |       "pygments_lexer": "ipython3",
199 |       "nbconvert_exporter": "python",
200 |       "file_extension": ".py"
201 |     },
202 |     "kernelspec": {
203 |       "display_name": "Python 3",
204 |       "language": "python",
205 |       "name": "python3"
206 |     }
207 |   },
208 |   "nbformat": 4,
209 |   "nbformat_minor": 1
210 | }


--------------------------------------------------------------------------------
/learn-ml-1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "metadata": {
  5 |         "_uuid": "8576099666feb23d9edd44381f2948bac364bde2"
  6 |       },
  7 |       "cell_type": "markdown",
  8 |       "source": "# Learn Machine Learning\n\nThis notebook takes snippets from the [kaggle.com/learn](http://kaggle.com/learn) ML program and builds on them\n\n### Dataset: House Prices: Advanced Regression Techniques"
  9 |     },
 10 |     {
 11 |       "metadata": {
 12 |         "_uuid": "46febf817a4163b3612cc6effbde443f487119c7",
 13 |         "collapsed": true,
 14 |         "_cell_guid": "55c1bfee-ff80-49be-9363-22b8b36b31c8",
 15 |         "trusted": true
 16 |       },
 17 |       "cell_type": "code",
 18 |       "source": "import pandas as pd\n\ntrain_path = \"../input/train.csv\"\n\ndf = pd.read_csv(train_path)",
 19 |       "execution_count": 1,
 20 |       "outputs": []
 21 |     },
 22 |     {
 23 |       "metadata": {
 24 |         "trusted": true,
 25 |         "_uuid": "737487395024a94cd74768128113ff7ca5e97398"
 26 |       },
 27 |       "cell_type": "code",
 28 |       "source": "df.head()",
 29 |       "execution_count": 2,
 30 |       "outputs": [
 31 |         {
 32 |           "output_type": "execute_result",
 33 |           "execution_count": 2,
 34 |           "data": {
 35 |             "text/plain": "   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n0   1          60       RL         65.0     8450   Pave   NaN      Reg   \n1   2          20       RL         80.0     9600   Pave   NaN      Reg   \n2   3          60       RL         68.0    11250   Pave   NaN      IR1   \n3   4          70       RL         60.0     9550   Pave   NaN      IR1   \n4   5          60       RL         84.0    14260   Pave   NaN      IR1   \n\n  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \\\n0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n\n  MoSold YrSold  SaleType  SaleCondition  SalePrice  \n0      2   2008        WD         Normal     208500  \n1      5   2007        WD         Normal     181500  \n2      9   2008        WD         Normal     223500  \n3      2   2006        WD        Abnorml     140000  \n4     12   2008        WD         Normal     250000  \n\n[5 rows x 81 columns]",
 36 |             "text/html": "<div>\n<style>\n    .dataframe thead tr:only-child th {\n        text-align: right;\n    }\n\n    .dataframe thead th {\n        text-align: left;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Id</th>\n      <th>MSSubClass</th>\n      <th>MSZoning</th>\n      <th>LotFrontage</th>\n      <th>LotArea</th>\n      <th>Street</th>\n      <th>Alley</th>\n      <th>LotShape</th>\n      <th>LandContour</th>\n      <th>Utilities</th>\n      <th>...</th>\n      <th>PoolArea</th>\n      <th>PoolQC</th>\n      <th>Fence</th>\n      <th>MiscFeature</th>\n      <th>MiscVal</th>\n      <th>MoSold</th>\n      <th>YrSold</th>\n      <th>SaleType</th>\n      <th>SaleCondition</th>\n      <th>SalePrice</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>1</td>\n      <td>60</td>\n      <td>RL</td>\n      <td>65.0</td>\n      <td>8450</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>208500</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>2</td>\n      <td>20</td>\n      <td>RL</td>\n      <td>80.0</td>\n      <td>9600</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>Reg</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>5</td>\n      <td>2007</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>181500</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3</td>\n      <td>60</td>\n      <td>RL</td>\n      <td>68.0</td>\n      <td>11250</td>\n      <td>Pave</td>\n     
 <td>NaN</td>\n      <td>IR1</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>9</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>223500</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>4</td>\n      <td>70</td>\n      <td>RL</td>\n      <td>60.0</td>\n      <td>9550</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>IR1</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2006</td>\n      <td>WD</td>\n      <td>Abnorml</td>\n      <td>140000</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>5</td>\n      <td>60</td>\n      <td>RL</td>\n      <td>84.0</td>\n      <td>14260</td>\n      <td>Pave</td>\n      <td>NaN</td>\n      <td>IR1</td>\n      <td>Lvl</td>\n      <td>AllPub</td>\n      <td>...</td>\n      <td>0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>0</td>\n      <td>12</td>\n      <td>2008</td>\n      <td>WD</td>\n      <td>Normal</td>\n      <td>250000</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 81 columns</p>\n</div>"
 37 |           },
 38 |           "metadata": {}
 39 |         }
 40 |       ]
 41 |     },
 42 |     {
 43 |       "metadata": {
 44 |         "trusted": true,
 45 |         "_uuid": "8f692b94e16a8b9ce3929a1293f8d920372e9892"
 46 |       },
 47 |       "cell_type": "code",
 48 |       "source": "df.describe()",
 49 |       "execution_count": 3,
 50 |       "outputs": [
 51 |         {
 52 |           "output_type": "execute_result",
 53 |           "execution_count": 3,
 54 |           "data": {
 55 |             "text/plain": "                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \\\ncount  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   \nmean    730.500000    56.897260    70.049958   10516.828082     6.099315   \nstd     421.610009    42.300571    24.284752    9981.264932     1.382997   \nmin       1.000000    20.000000    21.000000    1300.000000     1.000000   \n25%     365.750000    20.000000    59.000000    7553.500000     5.000000   \n50%     730.500000    50.000000    69.000000    9478.500000     6.000000   \n75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   \nmax    1460.000000   190.000000   313.000000  215245.000000    10.000000   \n\n       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \\\ncount  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   \nmean      5.575342  1971.267808   1984.865753   103.685262   443.639726   \nstd       1.112799    30.202904     20.645407   181.066207   456.098091   \nmin       1.000000  1872.000000   1950.000000     0.000000     0.000000   \n25%       5.000000  1954.000000   1967.000000     0.000000     0.000000   \n50%       5.000000  1973.000000   1994.000000     0.000000   383.500000   \n75%       6.000000  2000.000000   2004.000000   166.000000   712.250000   \nmax       9.000000  2010.000000   2010.000000  1600.000000  5644.000000   \n\n           ...         WoodDeckSF  OpenPorchSF  EnclosedPorch    3SsnPorch  \\\ncount      ...        1460.000000  1460.000000    1460.000000  1460.000000   \nmean       ...          94.244521    46.660274      21.954110     3.409589   \nstd        ...         125.338794    66.256028      61.119149    29.317331   \nmin        ...           0.000000     0.000000       0.000000     0.000000   \n25%        ...           0.000000     0.000000       0.000000     0.000000   \n50%        ...           0.000000    25.000000       0.000000     0.000000   \n75%        ...         
168.000000    68.000000       0.000000     0.000000   \nmax        ...         857.000000   547.000000     552.000000   508.000000   \n\n       ScreenPorch     PoolArea       MiscVal       MoSold       YrSold  \\\ncount  1460.000000  1460.000000   1460.000000  1460.000000  1460.000000   \nmean     15.060959     2.758904     43.489041     6.321918  2007.815753   \nstd      55.757415    40.177307    496.123024     2.703626     1.328095   \nmin       0.000000     0.000000      0.000000     1.000000  2006.000000   \n25%       0.000000     0.000000      0.000000     5.000000  2007.000000   \n50%       0.000000     0.000000      0.000000     6.000000  2008.000000   \n75%       0.000000     0.000000      0.000000     8.000000  2009.000000   \nmax     480.000000   738.000000  15500.000000    12.000000  2010.000000   \n\n           SalePrice  \ncount    1460.000000  \nmean   180921.195890  \nstd     79442.502883  \nmin     34900.000000  \n25%    129975.000000  \n50%    163000.000000  \n75%    214000.000000  \nmax    755000.000000  \n\n[8 rows x 38 columns]",
 56 |             "text/html": "<div>\n<style>\n    .dataframe thead tr:only-child th {\n        text-align: right;\n    }\n\n    .dataframe thead th {\n        text-align: left;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Id</th>\n      <th>MSSubClass</th>\n      <th>LotFrontage</th>\n      <th>LotArea</th>\n      <th>OverallQual</th>\n      <th>OverallCond</th>\n      <th>YearBuilt</th>\n      <th>YearRemodAdd</th>\n      <th>MasVnrArea</th>\n      <th>BsmtFinSF1</th>\n      <th>...</th>\n      <th>WoodDeckSF</th>\n      <th>OpenPorchSF</th>\n      <th>EnclosedPorch</th>\n      <th>3SsnPorch</th>\n      <th>ScreenPorch</th>\n      <th>PoolArea</th>\n      <th>MiscVal</th>\n      <th>MoSold</th>\n      <th>YrSold</th>\n      <th>SalePrice</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1201.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1452.000000</td>\n      <td>1460.000000</td>\n      <td>...</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n      <td>1460.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>730.500000</td>\n      <td>56.897260</td>\n      <td>70.049958</td>\n      <td>10516.828082</td>\n      <td>6.099315</td>\n      <td>5.575342</td>\n      <td>1971.267808</td>\n      <td>1984.865753</td>\n      <td>103.685262</td>\n      <td>443.639726</td>\n      <td>...</td>\n      <td>94.244521</td>\n      <td>46.660274</td>\n      <td>21.954110</td>\n      <td>3.409589</td>\n   
   <td>15.060959</td>\n      <td>2.758904</td>\n      <td>43.489041</td>\n      <td>6.321918</td>\n      <td>2007.815753</td>\n      <td>180921.195890</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>421.610009</td>\n      <td>42.300571</td>\n      <td>24.284752</td>\n      <td>9981.264932</td>\n      <td>1.382997</td>\n      <td>1.112799</td>\n      <td>30.202904</td>\n      <td>20.645407</td>\n      <td>181.066207</td>\n      <td>456.098091</td>\n      <td>...</td>\n      <td>125.338794</td>\n      <td>66.256028</td>\n      <td>61.119149</td>\n      <td>29.317331</td>\n      <td>55.757415</td>\n      <td>40.177307</td>\n      <td>496.123024</td>\n      <td>2.703626</td>\n      <td>1.328095</td>\n      <td>79442.502883</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>1.000000</td>\n      <td>20.000000</td>\n      <td>21.000000</td>\n      <td>1300.000000</td>\n      <td>1.000000</td>\n      <td>1.000000</td>\n      <td>1872.000000</td>\n      <td>1950.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>...</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>1.000000</td>\n      <td>2006.000000</td>\n      <td>34900.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>365.750000</td>\n      <td>20.000000</td>\n      <td>59.000000</td>\n      <td>7553.500000</td>\n      <td>5.000000</td>\n      <td>5.000000</td>\n      <td>1954.000000</td>\n      <td>1967.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>...</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>5.000000</td>\n      <td>2007.000000</td>\n      <td>129975.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>730.500000</td>\n      <td>50.000000</td>\n 
     <td>69.000000</td>\n      <td>9478.500000</td>\n      <td>6.000000</td>\n      <td>5.000000</td>\n      <td>1973.000000</td>\n      <td>1994.000000</td>\n      <td>0.000000</td>\n      <td>383.500000</td>\n      <td>...</td>\n      <td>0.000000</td>\n      <td>25.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>6.000000</td>\n      <td>2008.000000</td>\n      <td>163000.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>1095.250000</td>\n      <td>70.000000</td>\n      <td>80.000000</td>\n      <td>11601.500000</td>\n      <td>7.000000</td>\n      <td>6.000000</td>\n      <td>2000.000000</td>\n      <td>2004.000000</td>\n      <td>166.000000</td>\n      <td>712.250000</td>\n      <td>...</td>\n      <td>168.000000</td>\n      <td>68.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>0.000000</td>\n      <td>8.000000</td>\n      <td>2009.000000</td>\n      <td>214000.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>1460.000000</td>\n      <td>190.000000</td>\n      <td>313.000000</td>\n      <td>215245.000000</td>\n      <td>10.000000</td>\n      <td>9.000000</td>\n      <td>2010.000000</td>\n      <td>2010.000000</td>\n      <td>1600.000000</td>\n      <td>5644.000000</td>\n      <td>...</td>\n      <td>857.000000</td>\n      <td>547.000000</td>\n      <td>552.000000</td>\n      <td>508.000000</td>\n      <td>480.000000</td>\n      <td>738.000000</td>\n      <td>15500.000000</td>\n      <td>12.000000</td>\n      <td>2010.000000</td>\n      <td>755000.000000</td>\n    </tr>\n  </tbody>\n</table>\n<p>8 rows × 38 columns</p>\n</div>"
 57 |           },
 58 |           "metadata": {}
 59 |         }
 60 |       ]
 61 |     },
 62 |     {
 63 |       "metadata": {
 64 |         "_uuid": "54d361163e3de409a12cd9fabe86263d1d9582aa"
 65 |       },
 66 |       "cell_type": "markdown",
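 67 |       "source": "* **count**: Shows how many rows have non-missing values\n* **mean**: Average\n* **std**: Standard deviation, which tells how numerically spread out the values are\n* **min, 25%, 50%, 75%, max**: min is the lowest value, 25% is the value below which a quarter of the values fall (the 25th percentile), and so on for 50% (the median) and 75%; max is the highest value"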
 68 |     },
 69 |     {
 70 |       "metadata": {
 71 |         "_uuid": "81e44e18c121cfe69bc4813ebd337fafdbd9a243"
 72 |       },
 73 |       "cell_type": "markdown",
 74 |       "source": "### Selecting and Filtering Data"
 75 |     },
 76 |     {
 77 |       "metadata": {
 78 |         "_uuid": "7a6ef3e3e34e68f8384699da6cf4ba1e31eea284"
 79 |       },
 80 |       "cell_type": "markdown",
 81 |       "source": "**df.columns**: Gives the labels of all the columns in a pandas DataFrame (as an `Index`)"
 82 |     },
 83 |     {
 84 |       "metadata": {
 85 |         "trusted": true,
 86 |         "_uuid": "8656f9ed79b3713102f9f0a4cd86658e47004506"
 87 |       },
 88 |       "cell_type": "code",
 89 |       "source": "columns = df.columns\nprint(columns)",
 90 |       "execution_count": 4,
 91 |       "outputs": [
 92 |         {
 93 |           "output_type": "stream",
 94 |           "text": "Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',\n       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',\n       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',\n       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',\n       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',\n       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',\n       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',\n       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',\n       'SaleCondition', 'SalePrice'],\n      dtype='object')\n",
 95 |           "name": "stdout"
 96 |         }
 97 |       ]
 98 |     },
 99 |     {
100 |       "metadata": {
101 |         "_uuid": "004a58b8cad2f90c4040ffc83204a5211597e649"
102 |       },
103 |       "cell_type": "markdown",
104 |       "source": "To select a single column out of the dataset, use **df[\"COLUMN_NAME\"]**"
105 |     },
106 |     {
107 |       "metadata": {
108 |         "trusted": true,
109 |         "_uuid": "d1297ff4e1124491c59798d13f67cb577aa43976"
110 |       },
111 |       "cell_type": "code",
112 |       "source": "df[\"SalePrice\"].head(5)",
113 |       "execution_count": 5,
114 |       "outputs": [
115 |         {
116 |           "output_type": "execute_result",
117 |           "execution_count": 5,
118 |           "data": {
119 |             "text/plain": "0    208500\n1    181500\n2    223500\n3    140000\n4    250000\nName: SalePrice, dtype: int64"
120 |           },
121 |           "metadata": {}
122 |         }
123 |       ]
124 |     },
125 |     {
126 |       "metadata": {
127 |         "_uuid": "d6c1f3187d8a1b5a921f9b4a0ded5a000e8e7a5c"
128 |       },
129 |       "cell_type": "markdown",
130 |       "source": "More often than not we need to subset our data for ad-hoc analysis, for testing a UDF, or for a billion other reasons. Selecting a list of column names is one of the many ways to subset your data by columns"
131 |     },
132 |     {
133 |       "metadata": {
134 |         "trusted": true,
135 |         "_uuid": "39d1b79e98a4a30c29567524701cf0da34aba75b"
136 |       },
137 |       "cell_type": "code",
138 |       "source": "columns_of_interest = [\"SaleCondition\",\"SalePrice\"]\ndf[columns_of_interest].head()",
139 |       "execution_count": 6,
140 |       "outputs": [
141 |         {
142 |           "output_type": "execute_result",
143 |           "execution_count": 6,
144 |           "data": {
145 |             "text/plain": "  SaleCondition  SalePrice\n0        Normal     208500\n1        Normal     181500\n2        Normal     223500\n3       Abnorml     140000\n4        Normal     250000",
146 |             "text/html": "<div>\n<style>\n    .dataframe thead tr:only-child th {\n        text-align: right;\n    }\n\n    .dataframe thead th {\n        text-align: left;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>SaleCondition</th>\n      <th>SalePrice</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Normal</td>\n      <td>208500</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Normal</td>\n      <td>181500</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Normal</td>\n      <td>223500</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Abnorml</td>\n      <td>140000</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Normal</td>\n      <td>250000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
147 |           },
148 |           "metadata": {}
149 |         }
150 |       ]
151 |     },
152 |     {
153 |       "metadata": {
154 |         "trusted": true,
155 |         "_uuid": "c470ffcd2d6f5d4f8856daedc93626ba2742916a"
156 |       },
157 |       "cell_type": "code",
158 |       "source": "df[columns_of_interest].describe()",
159 |       "execution_count": 7,
160 |       "outputs": [
161 |         {
162 |           "output_type": "execute_result",
163 |           "execution_count": 7,
164 |           "data": {
165 |             "text/plain": "           SalePrice\ncount    1460.000000\nmean   180921.195890\nstd     79442.502883\nmin     34900.000000\n25%    129975.000000\n50%    163000.000000\n75%    214000.000000\nmax    755000.000000",
166 |             "text/html": "<div>\n<style>\n    .dataframe thead tr:only-child th {\n        text-align: right;\n    }\n\n    .dataframe thead th {\n        text-align: left;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>SalePrice</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>count</th>\n      <td>1460.000000</td>\n    </tr>\n    <tr>\n      <th>mean</th>\n      <td>180921.195890</td>\n    </tr>\n    <tr>\n      <th>std</th>\n      <td>79442.502883</td>\n    </tr>\n    <tr>\n      <th>min</th>\n      <td>34900.000000</td>\n    </tr>\n    <tr>\n      <th>25%</th>\n      <td>129975.000000</td>\n    </tr>\n    <tr>\n      <th>50%</th>\n      <td>163000.000000</td>\n    </tr>\n    <tr>\n      <th>75%</th>\n      <td>214000.000000</td>\n    </tr>\n    <tr>\n      <th>max</th>\n      <td>755000.000000</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
167 |           },
168 |           "metadata": {}
169 |         }
170 |       ]
171 |     },
172 |     {
173 |       "metadata": {
174 |         "_uuid": "b0c4448d9f38c4197464ce3b00280c53fae85814"
175 |       },
176 |       "cell_type": "markdown",
177 |       "source": "### Building your first Scikit Learn model\n\nTarget Variable: **SalePrice**\n\nFor now, we are using the Decision Tree Regressor: [Documentation](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)"
178 |     },
179 |     {
180 |       "metadata": {
181 |         "trusted": true,
182 |         "collapsed": true,
183 |         "_uuid": "acf981357059e1f2e2bbf5c660510cfe491b6717"
184 |       },
185 |       "cell_type": "code",
186 |       "source": "from sklearn.tree import DecisionTreeRegressor",
187 |       "execution_count": 9,
188 |       "outputs": []
189 |     },
190 |     {
191 |       "metadata": {
192 |         "trusted": true,
193 |         "_uuid": "187e4fc9dd79b2aad8e61ee0b9c48752a288c9b1"
194 |       },
195 |       "cell_type": "code",
196 |       "source": "y = df[\"SalePrice\"]\npredictor_variables = [\"LotArea\", \"YearBuilt\", \"1stFlrSF\", \"2ndFlrSF\", \"FullBath\", \"BedroomAbvGr\", \"TotRmsAbvGrd\"]\nX = df[predictor_variables]",
197 |       "execution_count": 8,
198 |       "outputs": []
199 |     },
200 |     {
201 |       "metadata": {
202 |         "_uuid": "25f91bc32e4dc17c5878461110583e832293fc3a"
203 |       },
204 |       "cell_type": "markdown",
205 |       "source": "Fit the model using the **.fit()** function, which takes the training data and the labels:\n\n* Training Data: **X**\n* Labels: **y**"
206 |     },
207 |     {
208 |       "metadata": {
209 |         "trusted": true,
210 |         "_uuid": "bf3a9638e380e8a8cfe40b943de5034c2d4d36a8"
211 |       },
212 |       "cell_type": "code",
213 |       "source": "regressor = DecisionTreeRegressor()\nregressor.fit(X, y)",
214 |       "execution_count": 10,
215 |       "outputs": [
216 |         {
217 |           "output_type": "execute_result",
218 |           "execution_count": 10,
219 |           "data": {
220 |             "text/plain": "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n           max_leaf_nodes=None, min_impurity_decrease=0.0,\n           min_impurity_split=None, min_samples_leaf=1,\n           min_samples_split=2, min_weight_fraction_leaf=0.0,\n           presort=False, random_state=None, splitter='best')"
221 |           },
222 |           "metadata": {}
223 |         }
224 |       ]
225 |     },
226 |     {
227 |       "metadata": {
228 |         "_uuid": "9c8a86e8a939f135c8d8d7dead6557aea8ad1cda"
229 |       },
230 |       "cell_type": "markdown",
231 |       "source": "Predict labels using the **.predict()** function. It takes the rows to predict on, which must have exactly the same columns, in the same format, as the training data passed to **.fit()** above.\n\nSide note: here the prediction data is just the first 5 rows of the training set, which is why the predictions match the actual sale prices exactly. This would never be the case in real-life problems."
232 |     },
233 |     {
234 |       "metadata": {
235 |         "trusted": true,
236 |         "_uuid": "6a731b12ce7e103ba50d3c8e39c46d7d214aef24"
237 |       },
238 |       "cell_type": "code",
239 |       "source": "regressor.predict(X.head(5))",
240 |       "execution_count": 11,
241 |       "outputs": [
242 |         {
243 |           "output_type": "execute_result",
244 |           "execution_count": 11,
245 |           "data": {
246 |             "text/plain": "array([ 208500.,  181500.,  223500.,  140000.,  250000.])"
247 |           },
248 |           "metadata": {}
249 |         }
250 |       ]
251 |     },
252 |     {
253 |       "metadata": {
254 |         "_uuid": "d73f5edd5b05223df968f4db5dcf4ac6123110a4"
255 |       },
256 |       "cell_type": "markdown",
257 |       "source": "### Model Validation\n\nTo estimate how our model will perform on real-world (unseen) data, we divide our overall labelled dataset into two sets: a training set and a testing set.\n\nThe model is trained on the training set, and predictions are then made for the testing set.\n\n**Mean Absolute Error** is then measured by comparing the testing set's original labels with the predicted values.\n\nA **60:40** train/test split is used here, but the right ratio differs from case to case."
258 |     },
259 |     {
260 |       "metadata": {
261 |         "trusted": true,
262 |         "collapsed": true,
263 |         "_uuid": "474407accf07043c0100aae8bf745f3a8745018a"
264 |       },
265 |       "cell_type": "code",
266 |       "source": "from sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeRegressor\n\ntrain_X, test_X, train_y, test_y = train_test_split(X, y, random_state=6, test_size=0.40)",
267 |       "execution_count": 13,
268 |       "outputs": []
269 |     },
270 |     {
271 |       "metadata": {
272 |         "trusted": true,
273 |         "_uuid": "46af8267d1ddb8216fbade73eaaf202e602b872d"
274 |       },
275 |       "cell_type": "code",
276 |       "source": "regressor = DecisionTreeRegressor()\nregressor.fit(train_X, train_y)",
277 |       "execution_count": 14,
278 |       "outputs": [
279 |         {
280 |           "output_type": "execute_result",
281 |           "execution_count": 14,
282 |           "data": {
283 |             "text/plain": "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n           max_leaf_nodes=None, min_impurity_decrease=0.0,\n           min_impurity_split=None, min_samples_leaf=1,\n           min_samples_split=2, min_weight_fraction_leaf=0.0,\n           presort=False, random_state=None, splitter='best')"
284 |           },
285 |           "metadata": {}
286 |         }
287 |       ]
288 |     },
289 |     {
290 |       "metadata": {
291 |         "trusted": true,
292 |         "collapsed": true,
293 |         "_uuid": "54bef6336a252d24599cc26dbd5b28a5433be79e"
294 |       },
295 |       "cell_type": "code",
296 |       "source": "predictions = regressor.predict(test_X)",
297 |       "execution_count": 15,
298 |       "outputs": []
299 |     },
300 |     {
301 |       "metadata": {
302 |         "_uuid": "ead765ac27a0579d0a68301ed6a0ca6f59fc538e"
303 |       },
304 |       "cell_type": "markdown",
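305 |       "source": "**Mean Absolute Error** (MAE) is the average of the absolute values of the individual prediction errors, where each error is the actual value minus the predicted value. Lower is better."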
306 |     },
307 |     {
308 |       "metadata": {
309 |         "trusted": true,
310 |         "_uuid": "3f05a6ae0653488e5363424a8a5b52321f845e10"
311 |       },
312 |       "cell_type": "code",
313 |       "source": "from sklearn.metrics import mean_absolute_error\n\nmean_absolute_error(test_y, predictions)",
314 |       "execution_count": 16,
315 |       "outputs": [
316 |         {
317 |           "output_type": "execute_result",
318 |           "execution_count": 16,
319 |           "data": {
320 |             "text/plain": "30421.69006849315"
321 |           },
322 |           "metadata": {}
323 |         }
324 |       ]
325 |     },
326 |     {
327 |       "metadata": {
328 |         "_uuid": "b5079cbf6d1538e34cf33189c14c757ad7c0d3b6"
329 |       },
330 |       "cell_type": "markdown",
331 |       "source": "Tuning the maximum number of leaf nodes (**max_leaf_nodes**) to get a lower MAE on the testing set"
332 |     },
333 |     {
334 |       "metadata": {
335 |         "trusted": true,
336 |         "collapsed": true,
337 |         "_uuid": "513650b4c4d3c0abe1bd593921082c28f96e264d"
338 |       },
339 |       "cell_type": "code",
340 |       "source": "def get_mae(leaf_nodes, train_X, train_y, test_X, test_y):\n    regressor = DecisionTreeRegressor(random_state=0, max_leaf_nodes=leaf_nodes)\n    regressor.fit(train_X, train_y)\n    predictions = regressor.predict(test_X)\n    error = mean_absolute_error(test_y, predictions)\n    print(\"Leaf Nodes: \" + str(leaf_nodes) + \" Error: \" + str(error))",
341 |       "execution_count": 17,
342 |       "outputs": []
343 |     },
344 |     {
345 |       "metadata": {
346 |         "trusted": true,
347 |         "_uuid": "6f3838ea60d4f55a2573ce145eb0cdadf3a3fa3f"
348 |       },
349 |       "cell_type": "code",
350 |       "source": "nodes = [50, 100, 500, 1000, 2000]\n\nfor node in nodes:\n    get_mae(node, train_X, train_y, test_X, test_y)",
351 |       "execution_count": 18,
352 |       "outputs": [
353 |         {
354 |           "output_type": "stream",
355 |           "text": "Leaf Nodes: 50 Error: 29514.142466\nLeaf Nodes: 100 Error: 29022.8035487\nLeaf Nodes: 500 Error: 31430.6623002\nLeaf Nodes: 1000 Error: 31493.3236301\nLeaf Nodes: 2000 Error: 31493.3236301\n",
356 |           "name": "stdout"
357 |         }
358 |       ]
359 |     },
360 |     {
361 |       "metadata": {
362 |         "trusted": true,
363 |         "_uuid": "d0246fec085b440bc7773814f773e16c7931e75b"
364 |       },
365 |       "cell_type": "code",
366 |       "source": "nodes = [5, 10, 50, 100, 500]\n\nfor node in nodes:\n    get_mae(node, train_X, train_y, test_X, test_y)",
367 |       "execution_count": 19,
368 |       "outputs": [
369 |         {
370 |           "output_type": "stream",
371 |           "text": "Leaf Nodes: 5 Error: 36994.303582\nLeaf Nodes: 10 Error: 33207.8198356\nLeaf Nodes: 50 Error: 29514.142466\nLeaf Nodes: 100 Error: 29022.8035487\nLeaf Nodes: 500 Error: 31430.6623002\n",
372 |           "name": "stdout"
373 |         }
374 |       ]
375 |     },
376 |     {
377 |       "metadata": {
378 |         "trusted": true,
379 |         "_uuid": "8627cb1317010404791f4217f4854e40476427b6"
380 |       },
381 |       "cell_type": "code",
382 |       "source": "nodes = [100, 200, 300, 400, 500]\n\nfor node in nodes:\n    get_mae(node, train_X, train_y, test_X, test_y)",
383 |       "execution_count": 20,
384 |       "outputs": [
385 |         {
386 |           "output_type": "stream",
387 |           "text": "Leaf Nodes: 100 Error: 29022.8035487\nLeaf Nodes: 200 Error: 29522.5853728\nLeaf Nodes: 300 Error: 30430.6891636\nLeaf Nodes: 400 Error: 31074.9160041\nLeaf Nodes: 500 Error: 31430.6623002\n",
388 |           "name": "stdout"
389 |         }
390 |       ]
391 |     },
392 |     {
393 |       "metadata": {
394 |         "_uuid": "f6bc9e2377b99031f10f674f799ee98113459c67"
395 |       },
396 |       "cell_type": "markdown",
397 |       "source": "### Training a RandomForestRegressor\n\n[Documentation](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)"
398 |     },
399 |     {
400 |       "metadata": {
401 |         "trusted": true,
402 |         "_uuid": "52e3ba87f777226332f908423dc119167750c89a"
403 |       },
404 |       "cell_type": "code",
405 |       "source": "from sklearn.ensemble import RandomForestRegressor\n\nregressor = RandomForestRegressor(random_state=0)\nregressor.fit(train_X, train_y)\npredictions = regressor.predict(test_X)\nmean_absolute_error(test_y, predictions)",
406 |       "execution_count": 24,
407 |       "outputs": [
408 |         {
409 |           "output_type": "execute_result",
410 |           "execution_count": 24,
411 |           "data": {
412 |             "text/plain": "25041.4450913242"
413 |           },
414 |           "metadata": {}
415 |         }
416 |       ]
417 |     },
418 |     {
419 |       "metadata": {
420 |         "trusted": true,
421 |         "collapsed": true,
422 |         "_uuid": "9eb09e194ff9e88cc86775f82dd146d821b4790d"
423 |       },
424 |       "cell_type": "code",
425 |       "source": "",
426 |       "execution_count": null,
427 |       "outputs": []
428 |     }
429 |   ],
430 |   "metadata": {
431 |     "kernelspec": {
432 |       "display_name": "Python 3",
433 |       "language": "python",
434 |       "name": "python3"
435 |     },
436 |     "language_info": {
437 |       "name": "python",
438 |       "version": "3.6.4",
439 |       "mimetype": "text/x-python",
440 |       "codemirror_mode": {
441 |         "name": "ipython",
442 |         "version": 3
443 |       },
444 |       "pygments_lexer": "ipython3",
445 |       "nbconvert_exporter": "python",
446 |       "file_extension": ".py"
447 |     }
448 |   },
449 |   "nbformat": 4,
450 |   "nbformat_minor": 1
451 | }


--------------------------------------------------------------------------------