├── README.md ├── learn-ml-2.ipynb └── learn-ml-1.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Learn Machine Learning (Under Development) 2 | 3 | ## Notebooks 4 | 5 | * **learn-ml-1:** Loading data, selecting and filtering, DecisionTreeRegressor, RandomForestRegressor, validation 6 | * **learn-ml-2:** get_dummies (categorical variables), missing values (fillna, dropna, Imputer) 7 | 8 | ## Commit History 9 | 10 | * Commit 1: Added End-to-End pipeline for building a RandomForestRegressor model and testing it 11 | * Commit 2: Added categorical and missing-value handling to the existing pipeline -------------------------------------------------------------------------------- /learn-ml-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "metadata": { 5 | "_uuid": "0707c903a31649998ec18de096c550e756deb570", 6 | "_cell_guid": "6db0c996-22b1-4911-bf2e-a63baa629a0f" 7 | }, 8 | "cell_type": "markdown", 9 | "source": "# Preprocessing + Pipeline building\n\nThis notebook builds on where we left off in the last notebook" 10 | }, 11 | { 12 | "metadata": { 13 | "_uuid": "617248ad982879b4a1708d5933b2420db18c9759", 14 | "collapsed": true, 15 | "_cell_guid": "e036d162-eb8f-4825-a029-73069a9a8f6d", 16 | "trusted": true 17 | }, 18 | "cell_type": "code", 19 | "source": "import pandas as pd\n\ntrain_path = \"../input/train.csv\"\n\ndf = pd.read_csv(train_path)", 20 | "execution_count": 1, 21 | "outputs": [] 22 | }, 23 | { 24 | "metadata": { 25 | "_uuid": "341625ed14ba9372d7e7116a464972d388fe448f", 26 | "_cell_guid": "a5cea2e0-e283-4b6e-8cc3-f44c98cfb2a3" 27 | }, 28 | "cell_type": "markdown", 29 | "source": "### Handling missing values\n\n* Drop columns with missing values\n* Imputation: Fills in missing values" 30 | }, 31 | { 32 | "metadata": { 33 | "_uuid": "bd33256d0e7856b3a80ab5c36cdb201ffcceacb9", 34 | "_cell_guid": "b46d6a86-d089-41c1-979c-2e3e1a0f393f" 35 | }, 36 | 
"cell_type": "markdown", 37 | "source": "#### Drop columns with missing values" 38 | }, 39 | { 40 | "metadata": { 41 | "_uuid": "e28b9de9695f728a7ad9ec7f0c2f495e19590ce5", 42 | "_cell_guid": "7bbefc4a-fc97-4f3b-a15b-b08bac23b4a7", 43 | "trusted": true, 44 | "collapsed": true 45 | }, 46 | "cell_type": "code", 47 | "source": "cols_with_missing_data = [col for col in df.columns if df[col].isnull().any()]\nreduced_original_data = df.drop(cols_with_missing_data, axis=1)", 48 | "execution_count": 2, 49 | "outputs": [] 50 | }, 51 | { 52 | "metadata": { 53 | "_uuid": "438c1a85fc01ca343c286caffbf340bce9c2564b", 54 | "_cell_guid": "79fafe25-820c-48f2-878d-5160d651fda4" 55 | }, 56 | "cell_type": "markdown", 57 | "source": "Testing our model on this data" 58 | }, 59 | { 60 | "metadata": { 61 | "_uuid": "fe6083fb45647d52120d20eeb3177cf6336d55fb", 62 | "_cell_guid": "82833593-ea32-4340-ad36-8d310908e6d7", 63 | "trusted": true 64 | }, 65 | "cell_type": "code", 66 | "source": "from sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.model_selection import train_test_split\n\ny = df[\"SalePrice\"]\nX = df.drop(\"SalePrice\", axis=1)\n# For the sake of keeping the example simple, we'll use only numeric predictors\nnumeric_predictors = df.select_dtypes(exclude=['object'])", 67 | "execution_count": 12, 68 | "outputs": [] 69 | }, 70 | { 71 | "metadata": { 72 | "_uuid": "889b55d9a9f9fea5a0b27da36498ecc6a35d2e39", 73 | "collapsed": true, 74 | "_cell_guid": "3d00f2cc-5b93-4a2e-9f97-7a8848980f54", 75 | "trusted": true 76 | }, 77 | "cell_type": "code", 78 | "source": "def get_mae(train_X, train_y, test_X, test_y):\n regressor = RandomForestRegressor(random_state=0)\n regressor.fit(train_X, train_y)\n predictions = regressor.predict(test_X)\n error = mean_absolute_error(test_y, predictions)\n return error", 79 | "execution_count": 4, 80 | "outputs": [] 81 | }, 82 | { 83 | "metadata": { 84 | "_uuid": 
"436af1157d7442e66708bef0e440a63b91fdfdfe", 85 | "collapsed": true, 86 | "_cell_guid": "874f8f5b-b81b-4d67-afa2-5fcfb17ee9aa", 87 | "trusted": true 88 | }, 89 | "cell_type": "code", 90 | "source": "train_X, test_X, train_y, test_y = train_test_split(numeric_predictors, y, test_size=0.30, random_state=0)", 91 | "execution_count": 15, 92 | "outputs": [] 93 | }, 94 | { 95 | "metadata": { 96 | "_uuid": "e58373caed2a27626c8b0465ab28cb2fd57ac06a", 97 | "_cell_guid": "8cb9d92e-2898-41f6-a7fb-07b2ad734c68", 98 | "trusted": true 99 | }, 100 | "cell_type": "code", 101 | "source": "cols_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()]\nreduced_train_X = train_X.drop(cols_with_missing, axis=1)\nreduced_test_X = test_X.drop(cols_with_missing, axis=1)\n\nprint(get_mae(reduced_train_X, train_y, reduced_test_X, test_y))", 102 | "execution_count": 16, 103 | "outputs": [ 104 | { 105 | "output_type": "stream", 106 | "text": "923.912785388\n", 107 | "name": "stdout" 108 | } 109 | ] 110 | }, 111 | { 112 | "metadata": { 113 | "_uuid": "34404c9732e45d25f1ce8fa54203f45b821d22cb", 114 | "collapsed": true, 115 | "_cell_guid": "0fc41176-4981-4d52-8b74-0a77c58a0cb0", 116 | "trusted": false 117 | }, 118 | "cell_type": "markdown", 119 | "source": "### Imputation" 120 | }, 121 | { 122 | "metadata": { 123 | "trusted": true, 124 | "collapsed": true, 125 | "_uuid": "1cccb3414d0161e20a84f7917fa34202a79e9a6a" 126 | }, 127 | "cell_type": "code", 128 | "source": "from sklearn.preprocessing import Imputer\n\nimputer = Imputer()\nimputed_train_X = imputer.fit_transform(train_X)\nimputed_test_X = imputer.transform(test_X)", 129 | "execution_count": 9, 130 | "outputs": [] 131 | }, 132 | { 133 | "metadata": { 134 | "trusted": true, 135 | "_uuid": "c33faf12095a64a45a9fe3fedffe76a35f7e8f20" 136 | }, 137 | "cell_type": "code", 138 | "source": "print(get_mae(imputed_train_X, train_y, imputed_test_X, test_y))", 139 | "execution_count": 10, 140 | "outputs": [ 141 | { 142 | 
"output_type": "stream", 143 | "text": "1003.7890411\n", 144 | "name": "stdout" 145 | } 146 | ] 147 | }, 148 | { 149 | "metadata": { 150 | "_uuid": "b99b2dbe26861fdf2aced8c8e29b8aaff938cf34" 151 | }, 152 | "cell_type": "markdown", 153 | "source": "### Handling categorical data" 154 | }, 155 | { 156 | "metadata": { 157 | "trusted": true, 158 | "_uuid": "c72b4f3440a3cf5658883fe9bc86dc95c9e43025" 159 | }, 160 | "cell_type": "code", 161 | "source": "from sklearn.model_selection import cross_val_score\nfrom sklearn.ensemble import RandomForestRegressor\n\ndef get_mae(X, y): \n return -1 * cross_val_score(RandomForestRegressor(50), X, y, scoring = 'neg_mean_absolute_error').mean()\n\npredictors_without_categoricals = imputed_train_X.select_dtypes(exclude=[\"object\"])\n\none_hot_encoded_training_predictors = pd.get_dummies(imputed_train_X)\n\nget_mae(predictors_without_categoricals, y)", 162 | "execution_count": 18, 163 | "outputs": [ 164 | { 165 | "output_type": "error", 166 | "ename": "AttributeError", 167 | "evalue": "'numpy.ndarray' object has no attribute 'select_dtypes'", 168 | "traceback": [ 169 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 170 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 171 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mcross_val_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mRandomForestRegressor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m'neg_mean_absolute_error'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mpredictors_without_categoricals\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mimputed_train_X\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect_dtypes\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexclude\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"object\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mget_mae\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpredictors_without_categoricals\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 172 | "\u001b[0;31mAttributeError\u001b[0m: 'numpy.ndarray' object has no attribute 'select_dtypes'" 173 | ] 174 | } 175 | ] 176 | }, 177 | { 178 | "metadata": { 179 | "trusted": true, 180 | "collapsed": true, 181 | "_uuid": "7a999f2021bb9984bd7a673bbc8385b5b0c0aec0" 182 | }, 183 | "cell_type": "code", 184 | "source": "", 185 | "execution_count": null, 186 | "outputs": [] 187 | } 188 | ], 189 | "metadata": { 190 | "language_info": { 191 | "name": "python", 192 | "version": "3.6.4", 193 | "mimetype": "text/x-python", 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "pygments_lexer": "ipython3", 199 | "nbconvert_exporter": "python", 200 | "file_extension": ".py" 201 | }, 202 | "kernelspec": { 203 | "display_name": "Python 3", 204 | "language": "python", 205 | "name": "python3" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 1 210 | } -------------------------------------------------------------------------------- /learn-ml-1.ipynb: -------------------------------------------------------------------------------- 1 | { 
2 | "cells": [ 3 | { 4 | "metadata": { 5 | "_uuid": "8576099666feb23d9edd44381f2948bac364bde2" 6 | }, 7 | "cell_type": "markdown", 8 | "source": "# Learn Machine Learning\n\nThe notebook takes in snippets from [kaggle.com/learn](http://kaggle.com/learn) ML program and builds on that\n\n### Dataset: House Prices: Advanced Regression Techniques" 9 | }, 10 | { 11 | "metadata": { 12 | "_uuid": "46febf817a4163b3612cc6effbde443f487119c7", 13 | "collapsed": true, 14 | "_cell_guid": "55c1bfee-ff80-49be-9363-22b8b36b31c8", 15 | "trusted": true 16 | }, 17 | "cell_type": "code", 18 | "source": "import pandas as pd\n\ntrain_path = \"../input/train.csv\"\n\ndf = pd.read_csv(train_path)", 19 | "execution_count": 1, 20 | "outputs": [] 21 | }, 22 | { 23 | "metadata": { 24 | "trusted": true, 25 | "_uuid": "737487395024a94cd74768128113ff7ca5e97398" 26 | }, 27 | "cell_type": "code", 28 | "source": "df.head()", 29 | "execution_count": 2, 30 | "outputs": [ 31 | { 32 | "output_type": "execute_result", 33 | "execution_count": 2, 34 | "data": { 35 | "text/plain": " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n0 1 60 RL 65.0 8450 Pave NaN Reg \n1 2 20 RL 80.0 9600 Pave NaN Reg \n2 3 60 RL 68.0 11250 Pave NaN IR1 \n3 4 70 RL 60.0 9550 Pave NaN IR1 \n4 5 60 RL 84.0 14260 Pave NaN IR1 \n\n LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal \\\n0 Lvl AllPub ... 0 NaN NaN NaN 0 \n1 Lvl AllPub ... 0 NaN NaN NaN 0 \n2 Lvl AllPub ... 0 NaN NaN NaN 0 \n3 Lvl AllPub ... 0 NaN NaN NaN 0 \n4 Lvl AllPub ... 0 NaN NaN NaN 0 \n\n MoSold YrSold SaleType SaleCondition SalePrice \n0 2 2008 WD Normal 208500 \n1 5 2007 WD Normal 181500 \n2 9 2008 WD Normal 223500 \n3 2 2006 WD Abnorml 140000 \n4 12 2008 WD Normal 250000 \n\n[5 rows x 81 columns]", 36 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilities...PoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPub...0NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPub...0NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPub...0NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPub...0NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPub...0NaNNaNNaN0122008WDNormal250000
\n

5 rows × 81 columns

\n
" 37 | }, 38 | "metadata": {} 39 | } 40 | ] 41 | }, 42 | { 43 | "metadata": { 44 | "trusted": true, 45 | "_uuid": "8f692b94e16a8b9ce3929a1293f8d920372e9892" 46 | }, 47 | "cell_type": "code", 48 | "source": "df.describe()", 49 | "execution_count": 3, 50 | "outputs": [ 51 | { 52 | "output_type": "execute_result", 53 | "execution_count": 3, 54 | "data": { 55 | "text/plain": " Id MSSubClass LotFrontage LotArea OverallQual \\\ncount 1460.000000 1460.000000 1201.000000 1460.000000 1460.000000 \nmean 730.500000 56.897260 70.049958 10516.828082 6.099315 \nstd 421.610009 42.300571 24.284752 9981.264932 1.382997 \nmin 1.000000 20.000000 21.000000 1300.000000 1.000000 \n25% 365.750000 20.000000 59.000000 7553.500000 5.000000 \n50% 730.500000 50.000000 69.000000 9478.500000 6.000000 \n75% 1095.250000 70.000000 80.000000 11601.500000 7.000000 \nmax 1460.000000 190.000000 313.000000 215245.000000 10.000000 \n\n OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 \\\ncount 1460.000000 1460.000000 1460.000000 1452.000000 1460.000000 \nmean 5.575342 1971.267808 1984.865753 103.685262 443.639726 \nstd 1.112799 30.202904 20.645407 181.066207 456.098091 \nmin 1.000000 1872.000000 1950.000000 0.000000 0.000000 \n25% 5.000000 1954.000000 1967.000000 0.000000 0.000000 \n50% 5.000000 1973.000000 1994.000000 0.000000 383.500000 \n75% 6.000000 2000.000000 2004.000000 166.000000 712.250000 \nmax 9.000000 2010.000000 2010.000000 1600.000000 5644.000000 \n\n ... WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\ncount ... 1460.000000 1460.000000 1460.000000 1460.000000 \nmean ... 94.244521 46.660274 21.954110 3.409589 \nstd ... 125.338794 66.256028 61.119149 29.317331 \nmin ... 0.000000 0.000000 0.000000 0.000000 \n25% ... 0.000000 0.000000 0.000000 0.000000 \n50% ... 0.000000 25.000000 0.000000 0.000000 \n75% ... 168.000000 68.000000 0.000000 0.000000 \nmax ... 
857.000000 547.000000 552.000000 508.000000 \n\n ScreenPorch PoolArea MiscVal MoSold YrSold \\\ncount 1460.000000 1460.000000 1460.000000 1460.000000 1460.000000 \nmean 15.060959 2.758904 43.489041 6.321918 2007.815753 \nstd 55.757415 40.177307 496.123024 2.703626 1.328095 \nmin 0.000000 0.000000 0.000000 1.000000 2006.000000 \n25% 0.000000 0.000000 0.000000 5.000000 2007.000000 \n50% 0.000000 0.000000 0.000000 6.000000 2008.000000 \n75% 0.000000 0.000000 0.000000 8.000000 2009.000000 \nmax 480.000000 738.000000 15500.000000 12.000000 2010.000000 \n\n SalePrice \ncount 1460.000000 \nmean 180921.195890 \nstd 79442.502883 \nmin 34900.000000 \n25% 129975.000000 \n50% 163000.000000 \n75% 214000.000000 \nmax 755000.000000 \n\n[8 rows x 38 columns]", 56 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
IdMSSubClassLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1...WoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldYrSoldSalePrice
count1460.0000001460.0000001201.0000001460.0000001460.0000001460.0000001460.0000001460.0000001452.0000001460.000000...1460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.0000001460.000000
mean730.50000056.89726070.04995810516.8280826.0993155.5753421971.2678081984.865753103.685262443.639726...94.24452146.66027421.9541103.40958915.0609592.75890443.4890416.3219182007.815753180921.195890
std421.61000942.30057124.2847529981.2649321.3829971.11279930.20290420.645407181.066207456.098091...125.33879466.25602861.11914929.31733155.75741540.177307496.1230242.7036261.32809579442.502883
min1.00000020.00000021.0000001300.0000001.0000001.0000001872.0000001950.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000002006.00000034900.000000
25%365.75000020.00000059.0000007553.5000005.0000005.0000001954.0000001967.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000005.0000002007.000000129975.000000
50%730.50000050.00000069.0000009478.5000006.0000005.0000001973.0000001994.0000000.000000383.500000...0.00000025.0000000.0000000.0000000.0000000.0000000.0000006.0000002008.000000163000.000000
75%1095.25000070.00000080.00000011601.5000007.0000006.0000002000.0000002004.000000166.000000712.250000...168.00000068.0000000.0000000.0000000.0000000.0000000.0000008.0000002009.000000214000.000000
max1460.000000190.000000313.000000215245.00000010.0000009.0000002010.0000002010.0000001600.0000005644.000000...857.000000547.000000552.000000508.000000480.000000738.00000015500.00000012.0000002010.000000755000.000000
\n

8 rows × 38 columns

\n
" 57 | }, 58 | "metadata": {} 59 | } 60 | ] 61 | }, 62 | { 63 | "metadata": { 64 | "_uuid": "54d361163e3de409a12cd9fabe86263d1d9582aa" 65 | }, 66 | "cell_type": "markdown", 67 | "source": "* **count**: Shows how many rows have non-missing values\n* **mean**: Average\n* **std**: Tells how numerically spread out the values are\n* **min, 25%, 50%, 75%**: lowest values is min, quarter 25% and so on" 68 | }, 69 | { 70 | "metadata": { 71 | "_uuid": "81e44e18c121cfe69bc4813ebd337fafdbd9a243" 72 | }, 73 | "cell_type": "markdown", 74 | "source": "### Selecting and Filtering Data" 75 | }, 76 | { 77 | "metadata": { 78 | "_uuid": "7a6ef3e3e34e68f8384699da6cf4ba1e31eea284" 79 | }, 80 | "cell_type": "markdown", 81 | "source": "**df.columns**: Gives a list of all the columns in a Pandas DataFrame" 82 | }, 83 | { 84 | "metadata": { 85 | "trusted": true, 86 | "_uuid": "8656f9ed79b3713102f9f0a4cd86658e47004506" 87 | }, 88 | "cell_type": "code", 89 | "source": "columns = df.columns\nprint(columns)", 90 | "execution_count": 4, 91 | "outputs": [ 92 | { 93 | "output_type": "stream", 94 | "text": "Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',\n 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',\n 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',\n 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',\n 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 
'GarageQual',\n 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',\n 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',\n 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',\n 'SaleCondition', 'SalePrice'],\n dtype='object')\n", 95 | "name": "stdout" 96 | } 97 | ] 98 | }, 99 | { 100 | "metadata": { 101 | "_uuid": "004a58b8cad2f90c4040ffc83204a5211597e649" 102 | }, 103 | "cell_type": "markdown", 104 | "source": "To select a single column out of the dataset, use **df[\"COLUMN_NAME\"]**" 105 | }, 106 | { 107 | "metadata": { 108 | "trusted": true, 109 | "_uuid": "d1297ff4e1124491c59798d13f67cb577aa43976" 110 | }, 111 | "cell_type": "code", 112 | "source": "df[\"SalePrice\"].head(5)", 113 | "execution_count": 5, 114 | "outputs": [ 115 | { 116 | "output_type": "execute_result", 117 | "execution_count": 5, 118 | "data": { 119 | "text/plain": "0 208500\n1 181500\n2 223500\n3 140000\n4 250000\nName: SalePrice, dtype: int64" 120 | }, 121 | "metadata": {} 122 | } 123 | ] 124 | }, 125 | { 126 | "metadata": { 127 | "_uuid": "d6c1f3187d8a1b5a921f9b4a0ded5a000e8e7a5c" 128 | }, 129 | "cell_type": "markdown", 130 | "source": "More often than not we need to subset our data for ad-hoc analysis or testing a UDF and for billion other reasons, this is one of the many ways to subset your data according to columns" 131 | }, 132 | { 133 | "metadata": { 134 | "trusted": true, 135 | "_uuid": "39d1b79e98a4a30c29567524701cf0da34aba75b" 136 | }, 137 | "cell_type": "code", 138 | "source": "columns_of_interest = [\"SaleCondition\",\"SalePrice\"]\ndf[columns_of_interest].head()", 139 | "execution_count": 6, 140 | "outputs": [ 141 | { 142 | "output_type": "execute_result", 143 | "execution_count": 6, 144 | "data": { 145 | "text/plain": " SaleCondition SalePrice\n0 Normal 208500\n1 Normal 181500\n2 Normal 223500\n3 Abnorml 140000\n4 Normal 250000", 146 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
SaleConditionSalePrice
0Normal208500
1Normal181500
2Normal223500
3Abnorml140000
4Normal250000
\n
" 147 | }, 148 | "metadata": {} 149 | } 150 | ] 151 | }, 152 | { 153 | "metadata": { 154 | "trusted": true, 155 | "_uuid": "c470ffcd2d6f5d4f8856daedc93626ba2742916a" 156 | }, 157 | "cell_type": "code", 158 | "source": "df[columns_of_interest].describe()", 159 | "execution_count": 7, 160 | "outputs": [ 161 | { 162 | "output_type": "execute_result", 163 | "execution_count": 7, 164 | "data": { 165 | "text/plain": " SalePrice\ncount 1460.000000\nmean 180921.195890\nstd 79442.502883\nmin 34900.000000\n25% 129975.000000\n50% 163000.000000\n75% 214000.000000\nmax 755000.000000", 166 | "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
SalePrice
count1460.000000
mean180921.195890
std79442.502883
min34900.000000
25%129975.000000
50%163000.000000
75%214000.000000
max755000.000000
\n
" 167 | }, 168 | "metadata": {} 169 | } 170 | ] 171 | }, 172 | { 173 | "metadata": { 174 | "_uuid": "b0c4448d9f38c4197464ce3b00280c53fae85814" 175 | }, 176 | "cell_type": "markdown", 177 | "source": "### Building your first Scikit Learn model\n\nTarget Variable: **SalePrice**\n\nFor now, we are using the Decision Tree Regressor: [Documentation](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)" 178 | }, 179 | { 180 | "metadata": { 181 | "trusted": true, 182 | "collapsed": true, 183 | "_uuid": "acf981357059e1f2e2bbf5c660510cfe491b6717" 184 | }, 185 | "cell_type": "code", 186 | "source": "from sklearn.tree import DecisionTreeRegressor", 187 | "execution_count": 9, 188 | "outputs": [] 189 | }, 190 | { 191 | "metadata": { 192 | "trusted": true, 193 | "_uuid": "187e4fc9dd79b2aad8e61ee0b9c48752a288c9b1" 194 | }, 195 | "cell_type": "code", 196 | "source": "y = df[\"SalePrice\"]\npredictor_variables = [\"LotArea\", \"YearBuilt\", \"1stFlrSF\", \"2ndFlrSF\", \"FullBath\", \"BedroomAbvGr\", \"TotRmsAbvGrd\"]\nX = df[predictor_variables]", 197 | "execution_count": 8, 198 | "outputs": [] 199 | }, 200 | { 201 | "metadata": { 202 | "_uuid": "25f91bc32e4dc17c5878461110583e832293fc3a" 203 | }, 204 | "cell_type": "markdown", 205 | "source": "Fitting the model using **.fit()** function, takes in Training Data and Labels\n\n* Training Data: **X**\n* Labels: **y**" 206 | }, 207 | { 208 | "metadata": { 209 | "trusted": true, 210 | "_uuid": "bf3a9638e380e8a8cfe40b943de5034c2d4d36a8" 211 | }, 212 | "cell_type": "code", 213 | "source": "regressor = DecisionTreeRegressor()\nregressor.fit(X, y)", 214 | "execution_count": 10, 215 | "outputs": [ 216 | { 217 | "output_type": "execute_result", 218 | "execution_count": 10, 219 | "data": { 220 | "text/plain": "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, 
min_weight_fraction_leaf=0.0,\n presort=False, random_state=None, splitter='best')" 221 | }, 222 | "metadata": {} 223 | } 224 | ] 225 | }, 226 | { 227 | "metadata": { 228 | "_uuid": "9c8a86e8a939f135c8d8d7dead6557aea8ad1cda" 229 | }, 230 | "cell_type": "markdown", 231 | "source": "Predicting labels for the testing set is done with the **.predict()** function. It takes in the rows to be predicted, which should be in the exact format as provided above\n\nSide note: Here the prediction data is the first 5 rows of the training set; this would never be the case in real-life problems" 232 | }, 233 | { 234 | "metadata": { 235 | "trusted": true, 236 | "_uuid": "6a731b12ce7e103ba50d3c8e39c46d7d214aef24" 237 | }, 238 | "cell_type": "code", 239 | "source": "regressor.predict(X.head(5))", 240 | "execution_count": 11, 241 | "outputs": [ 242 | { 243 | "output_type": "execute_result", 244 | "execution_count": 11, 245 | "data": { 246 | "text/plain": "array([ 208500., 181500., 223500., 140000., 250000.])" 247 | }, 248 | "metadata": {} 249 | } 250 | ] 251 | }, 252 | { 253 | "metadata": { 254 | "_uuid": "d73f5edd5b05223df968f4db5dcf4ac6123110a4" 255 | }, 256 | "cell_type": "markdown", 257 | "source": "### Model Validation\n\nTo test how our model is performing on real-world data, we divide our overall training set into 2 sets: a training set and a testing set\n\nThe model is trained on the training set and predictions are derived from the test set\n\n**Mean Absolute Error** is then measured using the original labels and the predicted values for the testing set\n\nIdeal train test split is **60:40**, but it may differ on a case-by-case basis" 258 | }, 259 | { 260 | "metadata": { 261 | "trusted": true, 262 | "collapsed": true, 263 | "_uuid": "474407accf07043c0100aae8bf745f3a8745018a" 264 | }, 265 | "cell_type": "code", 266 | "source": "from sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeRegressor\n\ntrain_X, test_X, train_y, test_y = train_test_split(X, y, random_state=6, test_size=0.40)", 267 
| "execution_count": 13, 268 | "outputs": [] 269 | }, 270 | { 271 | "metadata": { 272 | "trusted": true, 273 | "_uuid": "46af8267d1ddb8216fbade73eaaf202e602b872d" 274 | }, 275 | "cell_type": "code", 276 | "source": "regressor = DecisionTreeRegressor()\nregressor.fit(train_X, train_y)", 277 | "execution_count": 14, 278 | "outputs": [ 279 | { 280 | "output_type": "execute_result", 281 | "execution_count": 14, 282 | "data": { 283 | "text/plain": "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n presort=False, random_state=None, splitter='best')" 284 | }, 285 | "metadata": {} 286 | } 287 | ] 288 | }, 289 | { 290 | "metadata": { 291 | "trusted": true, 292 | "collapsed": true, 293 | "_uuid": "54bef6336a252d24599cc26dbd5b28a5433be79e" 294 | }, 295 | "cell_type": "code", 296 | "source": "predictions = regressor.predict(test_X)", 297 | "execution_count": 15, 298 | "outputs": [] 299 | }, 300 | { 301 | "metadata": { 302 | "_uuid": "ead765ac27a0579d0a68301ed6a0ca6f59fc538e" 303 | }, 304 | "cell_type": "markdown", 305 | "source": "**Mean Absolute Error** is the average of the absolute values of the individual prediction errors (|actual - predicted|)" 306 | }, 307 | { 308 | "metadata": { 309 | "trusted": true, 310 | "_uuid": "3f05a6ae0653488e5363424a8a5b52321f845e10" 311 | }, 312 | "cell_type": "code", 313 | "source": "from sklearn.metrics import mean_absolute_error\n\nmean_absolute_error(test_y, predictions)", 314 | "execution_count": 16, 315 | "outputs": [ 316 | { 317 | "output_type": "execute_result", 318 | "execution_count": 16, 319 | "data": { 320 | "text/plain": "30421.69006849315" 321 | }, 322 | "metadata": {} 323 | } 324 | ] 325 | }, 326 | { 327 | "metadata": { 328 | "_uuid": "b5079cbf6d1538e34cf33189c14c757ad7c0d3b6" 329 | }, 330 | "cell_type": "markdown", 331 | "source": "Optimizing the number of leaf nodes to get better MAE results" 332 | }, 333 | { 334 
| "metadata": { 335 | "trusted": true, 336 | "collapsed": true, 337 | "_uuid": "513650b4c4d3c0abe1bd593921082c28f96e264d" 338 | }, 339 | "cell_type": "code", 340 | "source": "def get_mae(leaf_nodes, train_X, train_y, test_X, test_y):\n regressor = DecisionTreeRegressor(random_state=0, max_leaf_nodes=leaf_nodes)\n regressor.fit(train_X, train_y)\n predictions = regressor.predict(test_X)\n error = mean_absolute_error(test_y, predictions)\n print(\"Leaf Nodes: \" + str(leaf_nodes) + \" Error: \" + str(error))", 341 | "execution_count": 17, 342 | "outputs": [] 343 | }, 344 | { 345 | "metadata": { 346 | "trusted": true, 347 | "_uuid": "6f3838ea60d4f55a2573ce145eb0cdadf3a3fa3f" 348 | }, 349 | "cell_type": "code", 350 | "source": "nodes = [50, 100, 500, 1000, 2000]\n\nfor node in nodes:\n get_mae(node, train_X, train_y, test_X, test_y)", 351 | "execution_count": 18, 352 | "outputs": [ 353 | { 354 | "output_type": "stream", 355 | "text": "Leaf Nodes: 50 Error: 29514.142466\nLeaf Nodes: 100 Error: 29022.8035487\nLeaf Nodes: 500 Error: 31430.6623002\nLeaf Nodes: 1000 Error: 31493.3236301\nLeaf Nodes: 2000 Error: 31493.3236301\n", 356 | "name": "stdout" 357 | } 358 | ] 359 | }, 360 | { 361 | "metadata": { 362 | "trusted": true, 363 | "_uuid": "d0246fec085b440bc7773814f773e16c7931e75b" 364 | }, 365 | "cell_type": "code", 366 | "source": "nodes = [5, 10, 50, 100, 500]\n\nfor node in nodes:\n get_mae(node, train_X, train_y, test_X, test_y)", 367 | "execution_count": 19, 368 | "outputs": [ 369 | { 370 | "output_type": "stream", 371 | "text": "Leaf Nodes: 5 Error: 36994.303582\nLeaf Nodes: 10 Error: 33207.8198356\nLeaf Nodes: 50 Error: 29514.142466\nLeaf Nodes: 100 Error: 29022.8035487\nLeaf Nodes: 500 Error: 31430.6623002\n", 372 | "name": "stdout" 373 | } 374 | ] 375 | }, 376 | { 377 | "metadata": { 378 | "trusted": true, 379 | "_uuid": "8627cb1317010404791f4217f4854e40476427b6" 380 | }, 381 | "cell_type": "code", 382 | "source": "nodes = [100, 200, 300, 400, 500]\n\nfor node in 
nodes:\n get_mae(node, train_X, train_y, test_X, test_y)", 383 | "execution_count": 20, 384 | "outputs": [ 385 | { 386 | "output_type": "stream", 387 | "text": "Leaf Nodes: 100 Error: 29022.8035487\nLeaf Nodes: 200 Error: 29522.5853728\nLeaf Nodes: 300 Error: 30430.6891636\nLeaf Nodes: 400 Error: 31074.9160041\nLeaf Nodes: 500 Error: 31430.6623002\n", 388 | "name": "stdout" 389 | } 390 | ] 391 | }, 392 | { 393 | "metadata": { 394 | "_uuid": "f6bc9e2377b99031f10f674f799ee98113459c67" 395 | }, 396 | "cell_type": "markdown", 397 | "source": "### Training a RandomForestRegressor\n\n[Documentation](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)" 398 | }, 399 | { 400 | "metadata": { 401 | "trusted": true, 402 | "_uuid": "52e3ba87f777226332f908423dc119167750c89a" 403 | }, 404 | "cell_type": "code", 405 | "source": "from sklearn.ensemble import RandomForestRegressor\n\nregressor = RandomForestRegressor(random_state=0)\nregressor.fit(train_X, train_y)\npredictions = regressor.predict(test_X)\nmean_absolute_error(test_y, predictions)", 406 | "execution_count": 24, 407 | "outputs": [ 408 | { 409 | "output_type": "execute_result", 410 | "execution_count": 24, 411 | "data": { 412 | "text/plain": "25041.4450913242" 413 | }, 414 | "metadata": {} 415 | } 416 | ] 417 | }, 418 | { 419 | "metadata": { 420 | "trusted": true, 421 | "collapsed": true, 422 | "_uuid": "9eb09e194ff9e88cc86775f82dd146d821b4790d" 423 | }, 424 | "cell_type": "code", 425 | "source": "", 426 | "execution_count": null, 427 | "outputs": [] 428 | } 429 | ], 430 | "metadata": { 431 | "kernelspec": { 432 | "display_name": "Python 3", 433 | "language": "python", 434 | "name": "python3" 435 | }, 436 | "language_info": { 437 | "name": "python", 438 | "version": "3.6.4", 439 | "mimetype": "text/x-python", 440 | "codemirror_mode": { 441 | "name": "ipython", 442 | "version": 3 443 | }, 444 | "pygments_lexer": "ipython3", 445 | "nbconvert_exporter": "python", 446 | 
"file_extension": ".py" 447 | } 448 | }, 449 | "nbformat": 4, 450 | "nbformat_minor": 1 451 | } --------------------------------------------------------------------------------