├── .gitignore ├── Module 1 Quiz.pdf ├── Module 1.ipynb ├── Module 2 Quiz.pdf ├── Module 2.ipynb ├── Module 3 Quiz.pdf ├── Module 3.ipynb ├── Module 4 Quiz.pdf ├── Module 4.ipynb ├── fraud_data.csv ├── mushrooms.csv ├── polynomialreg1.png ├── week1.pdf ├── week1_Assignment.ipynb ├── week2_Assignment.ipynb ├── week3_Assignment.ipynb └── week4_Assignment.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /Module 1 Quiz.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianhuaiyuan/Applied-machine-learning-in-python/4287c946be45dd05fe7e0d1f3f447ff86879060f/Module 1 Quiz.pdf -------------------------------------------------------------------------------- /Module 1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.0** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
10 | "\n",
11 | "---"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Applied Machine Learning, Module 1: A simple classification task"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Import required modules and load data file"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "%matplotlib notebook\n",
37 | "import numpy as np\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "import pandas as pd\n",
40 | "from sklearn.model_selection import train_test_split\n",
41 | "\n",
42 | "fruits = pd.read_table('fruit_data_with_colors.txt')"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {
49 | "collapsed": false
50 | },
51 | "outputs": [],
52 | "source": [
53 | "fruits.head()"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {
60 | "collapsed": false
61 | },
62 | "outputs": [],
63 | "source": [
64 | "# create a mapping from fruit label value to fruit name to make results easier to interpret\n",
65 | "lookup_fruit_name = dict(zip(fruits.fruit_label.unique(), fruits.fruit_name.unique())) \n",
66 | "lookup_fruit_name"
67 | ]
68 | },
69 | {
70 | "cell_type": "markdown",
71 | "metadata": {},
72 | "source": [
73 | "The file contains the mass, height, and width of a selection of oranges, lemons and apples. The heights were measured along the core of the fruit. The widths were measured at the widest point perpendicular to the height."
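
Note that `fruit_data_with_colors.txt` is not among the files listed in the tree at the top of this repository, so the load cell above will fail as checked out. A minimal stand-in DataFrame with the same columns lets the cells run; the rows below are invented for illustration only (label 1 = apple, 2 = mandarin, 3 = orange, 4 = lemon, matching `target_names_fruits` used in Module 2):

```python
# Hypothetical stand-in for the missing fruit_data_with_colors.txt.
# Column names match the ones the notebook cells use; the values are made up.
import pandas as pd

fruits = pd.DataFrame({
    'fruit_label': [1, 1, 2, 2, 3, 3, 4, 4],
    'fruit_name': ['apple', 'apple', 'mandarin', 'mandarin',
                   'orange', 'orange', 'lemon', 'lemon'],
    'mass': [192, 180, 86, 80, 154, 164, 118, 116],
    'width': [8.4, 8.0, 6.2, 5.9, 7.2, 7.3, 6.1, 6.3],
    'height': [7.3, 6.8, 4.7, 4.6, 7.1, 7.7, 8.1, 7.7],
    'color_score': [0.55, 0.59, 0.80, 0.81, 0.69, 0.70, 0.72, 0.71],
})
```
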
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {},
79 | "source": [
80 | "### Examining the data"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {
87 | "collapsed": false
88 | },
89 | "outputs": [],
90 | "source": [
91 | "# plotting a scatter matrix\n",
92 | "from matplotlib import cm\n",
93 | "\n",
94 | "X = fruits[['height', 'width', 'mass', 'color_score']]\n",
95 | "y = fruits['fruit_label']\n",
96 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
97 | "\n",
98 | "cmap = cm.get_cmap('gnuplot')\n",
99 | "scatter = pd.plotting.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9), cmap=cmap)  # pd.scatter_matrix was removed in pandas 1.0; pd.plotting.scatter_matrix works in old and new versions"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": false
107 | },
108 | "outputs": [],
109 | "source": [
110 | "# plotting a 3D scatter plot\n",
111 | "from mpl_toolkits.mplot3d import Axes3D\n",
112 | "\n",
113 | "fig = plt.figure()\n",
114 | "ax = fig.add_subplot(111, projection = '3d')\n",
115 | "ax.scatter(X_train['width'], X_train['height'], X_train['color_score'], c = y_train, marker = 'o', s=100)\n",
116 | "ax.set_xlabel('width')\n",
117 | "ax.set_ylabel('height')\n",
118 | "ax.set_zlabel('color_score')\n",
119 | "plt.show()"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "### Create train-test split"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "# For this example, we use the mass, width, and height features of each fruit instance\n",
138 | "X = fruits[['mass', 'width', 'height']]\n",
139 | "y = fruits['fruit_label']\n",
140 | "\n",
141 | "# default is 75% / 25% train-test split\n",
142 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "### Create classifier object"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "collapsed": true
157 | },
158 | "outputs": [],
159 | "source": [
160 | "from sklearn.neighbors import KNeighborsClassifier\n",
161 | "\n",
162 | "knn = KNeighborsClassifier(n_neighbors = 5)"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "### Train the classifier (fit the estimator) using the training data"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [],
179 | "source": [
180 | "knn.fit(X_train, y_train)"
181 | ]
182 | },
183 | {
184 | "cell_type": "markdown",
185 | "metadata": {},
186 | "source": [
187 | "### Estimate the accuracy of the classifier on future data, using the test data"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "collapsed": false
195 | },
196 | "outputs": [],
197 | "source": [
198 | "knn.score(X_test, y_test)"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {},
204 | "source": [
205 | "### Use the trained k-NN classifier model to classify new, previously unseen objects"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {
212 | "collapsed": false
213 | },
214 | "outputs": [],
215 | "source": 
[ 216 | "# first example: a small fruit with mass 20g, width 4.3 cm, height 5.5 cm\n", 217 | "fruit_prediction = knn.predict([[20, 4.3, 5.5]])\n", 218 | "lookup_fruit_name[fruit_prediction[0]]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "# second example: a larger, elongated fruit with mass 100g, width 6.3 cm, height 8.5 cm\n", 230 | "fruit_prediction = knn.predict([[100, 6.3, 8.5]])\n", 231 | "lookup_fruit_name[fruit_prediction[0]]" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### Plot the decision boundaries of the k-NN classifier" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "from adspy_shared_utilities import plot_fruit_knn\n", 250 | "\n", 251 | "plot_fruit_knn(X_train, y_train, 5, 'uniform') # we choose 5 nearest neighbors" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "k_range = range(1,20)\n", 270 | "scores = []\n", 271 | "\n", 272 | "for k in k_range:\n", 273 | " knn = KNeighborsClassifier(n_neighbors = k)\n", 274 | " knn.fit(X_train, y_train)\n", 275 | " scores.append(knn.score(X_test, y_test))\n", 276 | "\n", 277 | "plt.figure()\n", 278 | "plt.xlabel('k')\n", 279 | "plt.ylabel('accuracy')\n", 280 | "plt.scatter(k_range, scores)\n", 281 | "plt.xticks([0,5,10,15,20]);" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "### How sensitive is k-NN classification accuracy to the train/test split proportion?" 
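
The decision-boundary cell above imports `plot_fruit_knn` from `adspy_shared_utilities`, a course helper module that is not included in this repository. A rough stand-in using only numpy, matplotlib and scikit-learn is sketched below; the choice of the width/height feature pair and the styling are assumptions, and the real course helper is more elaborate:

```python
# Rough stand-in for adspy_shared_utilities.plot_fruit_knn (not in this repo).
# Fits k-NN on the width/height columns so decision regions can be drawn in 2-D.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

def plot_fruit_knn(X_train, y_train, n_neighbors, weights):
    X2 = X_train[['width', 'height']].values
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    clf.fit(X2, y_train)

    # predict the class over a grid covering the training points, then shade by class
    x_min, x_max = X2[:, 0].min() - 1, X2[:, 0].max() + 1
    y_min, y_max = X2[:, 1].min() - 1, X2[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                         np.linspace(y_min, y_max, 200))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.figure()
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X2[:, 0], X2[:, 1], c=y_train, edgecolor='black')
    plt.xlabel('width')
    plt.ylabel('height')
    plt.title('k-NN decision regions (k={}, weights={})'.format(n_neighbors, weights))
    plt.show()
```
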
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {
295 | "collapsed": false
296 | },
297 | "outputs": [],
298 | "source": [
299 | "t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]\n",
300 | "\n",
301 | "knn = KNeighborsClassifier(n_neighbors = 5)\n",
302 | "\n",
303 | "plt.figure()\n",
304 | "\n",
305 | "for s in t:\n",
306 | "\n",
307 | "    scores = []\n",
308 | "    for i in range(1,1000):\n",
309 | "        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-s)\n",
310 | "        knn.fit(X_train, y_train)\n",
311 | "        scores.append(knn.score(X_test, y_test))\n",
312 | "    plt.plot(s, np.mean(scores), 'bo')\n",
313 | "\n",
314 | "plt.xlabel('Training set proportion')\n",
315 | "plt.ylabel('accuracy');"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "collapsed": true
323 | },
324 | "outputs": [],
325 | "source": []
326 | }
327 | ],
328 | "metadata": {
329 | "anaconda-cloud": {},
330 | "kernelspec": {
331 | "display_name": "Python 3",
332 | "language": "python",
333 | "name": "python3"
334 | },
335 | "language_info": {
336 | "codemirror_mode": {
337 | "name": "ipython",
338 | "version": 3
339 | },
340 | "file_extension": ".py",
341 | "mimetype": "text/x-python",
342 | "name": "python",
343 | "nbconvert_exporter": "python",
344 | "pygments_lexer": "ipython3",
345 | "version": "3.5.2"
346 | }
347 | },
348 | "nbformat": 4,
349 | "nbformat_minor": 1
350 | }
351 | 
--------------------------------------------------------------------------------
/Module 2 Quiz.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/tianhuaiyuan/Applied-machine-learning-in-python/4287c946be45dd05fe7e0d1f3f447ff86879060f/Module 2 Quiz.pdf
--------------------------------------------------------------------------------
/Module 2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "\n",
9 | "_You are currently looking at **version 1.0** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
10 | "\n",
11 | "---"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Applied Machine Learning: Module 2 (Supervised Learning, Part I)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Preamble and Review"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "%matplotlib notebook\n",
37 | "import numpy as np\n",
38 | "import pandas as pd\n",
39 | "import seaborn as sn\n",
40 | "import matplotlib.pyplot as plt\n",
41 | "\n",
42 | "from sklearn.model_selection import train_test_split\n",
43 | "from sklearn.neighbors import KNeighborsClassifier\n",
44 | "\n",
45 | "np.set_printoptions(precision=2)\n",
46 | "\n",
47 | "\n",
48 | "fruits = pd.read_table('fruit_data_with_colors.txt')\n",
49 | "\n",
50 | "feature_names_fruits = ['height', 'width', 'mass', 'color_score']\n",
51 | "X_fruits = fruits[feature_names_fruits]\n",
52 | "y_fruits = fruits['fruit_label']\n",
53 | "target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']\n",
54 | "\n",
55 | "X_fruits_2d = fruits[['height', 'width']]\n",
56 | "y_fruits_2d = fruits['fruit_label']\n",
57 | "\n",
58 | "X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)\n",
59 | "\n",
60 | "from sklearn.preprocessing import MinMaxScaler\n",
61 | "scaler = MinMaxScaler()\n",
62 | "X_train_scaled = scaler.fit_transform(X_train)\n",
63 | "# we must apply the same scaling, fitted on the training set, to the test set\n",
64 | "X_test_scaled = scaler.transform(X_test)\n",
65 | "\n",
66 | "knn = KNeighborsClassifier(n_neighbors = 5)\n",
67 | "knn.fit(X_train_scaled, y_train)\n",
68 | "print('Accuracy of K-NN classifier on training set: {:.2f}'\n",
69 | "     .format(knn.score(X_train_scaled, y_train)))\n",
70 | "print('Accuracy of K-NN classifier on test set: {:.2f}'\n",
71 | "     .format(knn.score(X_test_scaled, y_test)))\n",
72 | "\n",
73 | "example_fruit = [[5.5, 2.2, 10, 0.70]]\n",
74 | "print('Predicted fruit type for ', example_fruit, ' is ', \n",
75 | "     target_names_fruits[knn.predict(scaler.transform(example_fruit))[0]-1])  # the model was trained on scaled features, so the example must be scaled too"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Datasets"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": false,
90 | "scrolled": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "from sklearn.datasets import make_classification, make_blobs\n",
95 | "from matplotlib.colors import ListedColormap\n",
96 | "from sklearn.datasets import load_breast_cancer\n",
97 | "from adspy_shared_utilities import load_crime_dataset\n",
98 | "\n",
99 | "cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])\n",
100 | "\n",
101 | "\n",
102 | "# synthetic dataset for simple regression\n",
103 | "from sklearn.datasets import make_regression\n",
104 | "plt.figure()\n",
105 | "plt.title('Sample regression problem with one input variable')\n",
106 | "X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,\n",
107 | "                            n_informative=1, bias = 150.0,\n",
108 | "                            noise = 30, random_state=0)\n",
109 | "plt.scatter(X_R1, y_R1, marker= 'o', s=50)\n",
110 | 
"plt.show()\n", 111 | "\n", 112 | "\n", 113 | "# synthetic dataset for more complex regression\n", 114 | "from sklearn.datasets import make_friedman1\n", 115 | "plt.figure()\n", 116 | "plt.title('Complex regression problem with one input variable')\n", 117 | "X_F1, y_F1 = make_friedman1(n_samples = 100,\n", 118 | " n_features = 7, random_state=0)\n", 119 | "\n", 120 | "plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)\n", 121 | "plt.show()\n", 122 | "\n", 123 | "# synthetic dataset for classification (binary) \n", 124 | "plt.figure()\n", 125 | "plt.title('Sample binary classification problem with two informative features')\n", 126 | "X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,\n", 127 | " n_redundant=0, n_informative=2,\n", 128 | " n_clusters_per_class=1, flip_y = 0.1,\n", 129 | " class_sep = 0.5, random_state=0)\n", 130 | "plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2,\n", 131 | " marker= 'o', s=50, cmap=cmap_bold)\n", 132 | "plt.show()\n", 133 | "\n", 134 | "\n", 135 | "# more difficult synthetic dataset for classification (binary) \n", 136 | "# with classes that are not linearly separable\n", 137 | "X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,\n", 138 | " cluster_std = 1.3, random_state = 4)\n", 139 | "y_D2 = y_D2 % 2\n", 140 | "plt.figure()\n", 141 | "plt.title('Sample binary classification problem with non-linearly separable classes')\n", 142 | "plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,\n", 143 | " marker= 'o', s=50, cmap=cmap_bold)\n", 144 | "plt.show()\n", 145 | "\n", 146 | "\n", 147 | "# Breast cancer dataset for classification\n", 148 | "cancer = load_breast_cancer()\n", 149 | "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n", 150 | "\n", 151 | "\n", 152 | "# Communities and Crime dataset\n", 153 | "(X_crime, y_crime) = load_crime_dataset()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## K-Nearest Neighbors" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### Classification" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false, 175 | "scrolled": false 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "from adspy_shared_utilities import plot_two_class_knn\n", 180 | "\n", 181 | "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,\n", 182 | " random_state=0)\n", 183 | "\n", 184 | "plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)\n", 185 | "plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)\n", 186 | "plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Regression" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "from sklearn.neighbors import KNeighborsRegressor\n", 205 | "\n", 206 | "X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state = 0)\n", 207 | "\n", 208 | "knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)\n", 209 | "\n", 210 | "print(knnreg.predict(X_test))\n", 211 | "print('R-squared test score: {:.3f}'\n", 212 | " .format(knnreg.score(X_test, y_test)))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 
| "outputs": [], 222 | "source": [ 223 | "fig, subaxes = plt.subplots(1, 2, figsize=(8,4))\n", 224 | "X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)\n", 225 | "X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)\n", 226 | "\n", 227 | "for thisaxis, K in zip(subaxes, [1, 3]):\n", 228 | " knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)\n", 229 | " y_predict_output = knnreg.predict(X_predict_input)\n", 230 | " thisaxis.set_xlim([-2.5, 0.75])\n", 231 | " thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10,\n", 232 | " label='Predicted', alpha=0.8)\n", 233 | " thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)\n", 234 | " thisaxis.set_xlabel('Input feature')\n", 235 | " thisaxis.set_ylabel('Target value')\n", 236 | " thisaxis.set_title('KNN regression (K={})'.format(K))\n", 237 | " thisaxis.legend()\n", 238 | "plt.tight_layout()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Regression model complexity as a function of K" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": false, 253 | "scrolled": false 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "# plot k-NN regression on sample dataset for different values of K\n", 258 | "fig, subaxes = plt.subplots(5, 1, figsize=(5,20))\n", 259 | "X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)\n", 260 | "X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,\n", 261 | " random_state = 0)\n", 262 | "\n", 263 | "for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):\n", 264 | " knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)\n", 265 | " y_predict_output = knnreg.predict(X_predict_input)\n", 266 | " train_score = knnreg.score(X_train, y_train)\n", 267 | " test_score = knnreg.score(X_test, y_test)\n", 268 | " thisaxis.plot(X_predict_input, y_predict_output)\n", 269 | " thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')\n", 270 | " thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')\n", 271 | " thisaxis.set_xlabel('Input feature')\n", 272 | " thisaxis.set_ylabel('Target value')\n", 273 | " thisaxis.set_title('KNN Regression (K={})\\n\\\n", 274 | "Train $R^2 = {:.3f}$, Test $R^2 = {:.3f}$'\n", 275 | " .format(K, train_score, test_score))\n", 276 | " thisaxis.legend()\n", 277 | " plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)\n" 278 | ] 279 | }, 280 | { 281 | "cell_type": "markdown", 282 | "metadata": {}, 283 | "source": [ 284 | "## Linear models for regression" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### Linear regression" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "from sklearn.linear_model import LinearRegression\n", 303 | "\n", 304 | "X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,\n", 305 | " random_state = 0)\n", 306 | "linreg = LinearRegression().fit(X_train, y_train)\n", 307 | "\n", 308 | "print('linear model coeff (w): {}'\n", 309 | " .format(linreg.coef_))\n", 310 | "print('linear model intercept (b): {:.3f}'\n", 311 | " .format(linreg.intercept_))\n", 312 | "print('R-squared score (training): {:.3f}'\n", 313 | " .format(linreg.score(X_train, y_train)))\n", 314 | "print('R-squared score (test): {:.3f}'\n", 315 | " .format(linreg.score(X_test, 
y_test)))" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "### Linear regression: example plot " 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "plt.figure(figsize=(5,4))\n", 334 | "plt.scatter(X_R1, y_R1, marker= 'o', s=50, alpha=0.8)\n", 335 | "plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')\n", 336 | "plt.title('Least-squares linear regression')\n", 337 | "plt.xlabel('Feature value (x)')\n", 338 | "plt.ylabel('Target value (y)')\n", 339 | "plt.show()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n", 351 | " random_state = 0)\n", 352 | "linreg = LinearRegression().fit(X_train, y_train)\n", 353 | "\n", 354 | "print('Crime dataset')\n", 355 | "print('linear model intercept: {}'\n", 356 | " .format(linreg.intercept_))\n", 357 | "print('linear model coeff:\\n{}'\n", 358 | " .format(linreg.coef_))\n", 359 | "print('R-squared score (training): {:.3f}'\n", 360 | " .format(linreg.score(X_train, y_train)))\n", 361 | "print('R-squared score (test): {:.3f}'\n", 362 | " .format(linreg.score(X_test, y_test)))" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "### Ridge regression" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": false 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "from sklearn.linear_model import Ridge\n", 381 | "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n", 382 | " random_state = 0)\n", 383 | "\n", 384 | "linridge = Ridge(alpha=20.0).fit(X_train, y_train)\n", 385 | "\n", 386 | "print('Crime dataset')\n", 387 | "print('ridge regression linear model intercept: {}'\n", 388 | " .format(linridge.intercept_))\n", 389 | "print('ridge regression linear model coeff:\\n{}'\n", 390 | " .format(linridge.coef_))\n", 391 | "print('R-squared score (training): {:.3f}'\n", 392 | " .format(linridge.score(X_train, y_train)))\n", 393 | "print('R-squared score (test): {:.3f}'\n", 394 | " .format(linridge.score(X_test, y_test)))\n", 395 | "print('Number of non-zero features: {}'\n", 396 | " .format(np.sum(linridge.coef_ != 0)))" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "#### Ridge regression with feature normalization" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "from sklearn.preprocessing import MinMaxScaler\n", 415 | "scaler = MinMaxScaler()\n", 416 | "\n", 417 | "from sklearn.linear_model import Ridge\n", 418 | "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n", 419 | " random_state = 0)\n", 420 | "\n", 421 | "X_train_scaled = scaler.fit_transform(X_train)\n", 422 | "X_test_scaled = scaler.transform(X_test)\n", 423 | "\n", 424 | "linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)\n", 425 | "\n", 426 | "print('Crime dataset')\n", 427 | "print('ridge regression linear model intercept: {}'\n", 428 | " .format(linridge.intercept_))\n", 429 | "print('ridge regression linear model coeff:\\n{}'\n", 430 | " 
.format(linridge.coef_))\n", 431 | "print('R-squared score (training): {:.3f}'\n", 432 | " .format(linridge.score(X_train_scaled, y_train)))\n", 433 | "print('R-squared score (test): {:.3f}'\n", 434 | " .format(linridge.score(X_test_scaled, y_test)))\n", 435 | "print('Number of non-zero features: {}'\n", 436 | " .format(np.sum(linridge.coef_ != 0)))" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "#### Ridge regression with regularization parameter: alpha" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": { 450 | "collapsed": false 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "print('Ridge regression: effect of alpha regularization parameter\\n')\n", 455 | "for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:\n", 456 | " linridge = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)\n", 457 | " r2_train = linridge.score(X_train_scaled, y_train)\n", 458 | " r2_test = linridge.score(X_test_scaled, y_test)\n", 459 | " num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)\n", 460 | " print('Alpha = {:.2f}\\nnum abs(coeff) > 1.0: {}, \\\n", 461 | "r-squared training: {:.2f}, r-squared test: {:.2f}\\n'\n", 462 | " .format(this_alpha, num_coeff_bigger, r2_train, r2_test))" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "metadata": {}, 468 | "source": [ 469 | "### Lasso regression" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "collapsed": false 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "from sklearn.linear_model import Lasso\n", 481 | "from sklearn.preprocessing import MinMaxScaler\n", 482 | "scaler = MinMaxScaler()\n", 483 | "\n", 484 | "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n", 485 | " random_state = 0)\n", 486 | "\n", 487 | "X_train_scaled = scaler.fit_transform(X_train)\n", 488 | "X_test_scaled = scaler.transform(X_test)\n", 489 | "\n", 490 | "linlasso = Lasso(alpha=2.0, max_iter = 10000).fit(X_train_scaled, y_train)\n", 491 | "\n", 492 | "print('Crime dataset')\n", 493 | "print('lasso regression linear model intercept: {}'\n", 494 | " .format(linlasso.intercept_))\n", 495 | "print('lasso regression linear model coeff:\\n{}'\n", 496 | " .format(linlasso.coef_))\n", 497 | "print('Non-zero features: {}'\n", 498 | " .format(np.sum(linlasso.coef_ != 0)))\n", 499 | "print('R-squared score (training): {:.3f}'\n", 500 | " .format(linlasso.score(X_train_scaled, y_train)))\n", 501 | "print('R-squared score (test): {:.3f}\\n'\n", 502 | " .format(linlasso.score(X_test_scaled, y_test)))\n", 503 | "print('Features with non-zero weight (sorted by absolute magnitude):')\n", 504 | "\n", 505 | "for e in sorted (list(zip(list(X_crime), linlasso.coef_)),\n", 506 | " key = lambda e: -abs(e[1])):\n", 507 | " if e[1] != 0:\n", 508 | " print('\\t{}, {:.3f}'.format(e[0], e[1]))" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "#### Lasso regression with regularization parameter: alpha" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "print('Lasso regression: effect of alpha regularization\\n\\\n", 527 | "parameter on number of features kept in final model\\n')\n", 528 | "\n", 529 | "for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:\n", 530 | " linlasso = Lasso(alpha, max_iter = 10000).fit(X_train_scaled, 
y_train)\n", 531 | " r2_train = linlasso.score(X_train_scaled, y_train)\n", 532 | " r2_test = linlasso.score(X_test_scaled, y_test)\n", 533 | " \n", 534 | " print('Alpha = {:.2f}\\nFeatures kept: {}, r-squared training: {:.2f}, \\\n", 535 | "r-squared test: {:.2f}\\n'\n", 536 | " .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "### Polynomial regression" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "collapsed": false 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "from sklearn.linear_model import LinearRegression\n", 555 | "from sklearn.linear_model import Ridge\n", 556 | "from sklearn.preprocessing import PolynomialFeatures\n", 557 | "\n", 558 | "\n", 559 | "X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,\n", 560 | " random_state = 0)\n", 561 | "linreg = LinearRegression().fit(X_train, y_train)\n", 562 | "\n", 563 | "print('linear model coeff (w): {}'\n", 564 | " .format(linreg.coef_))\n", 565 | "print('linear model intercept (b): {:.3f}'\n", 566 | " .format(linreg.intercept_))\n", 567 | "print('R-squared score (training): {:.3f}'\n", 568 | " .format(linreg.score(X_train, y_train)))\n", 569 | "print('R-squared score (test): {:.3f}'\n", 570 | " .format(linreg.score(X_test, y_test)))\n", 571 | "\n", 572 | "print('\\nNow we transform the original input data to add\\n\\\n", 573 | "polynomial features up to degree 2 (quadratic)\\n')\n", 574 | "poly = PolynomialFeatures(degree=2)\n", 575 | "X_F1_poly = poly.fit_transform(X_F1)\n", 576 | "\n", 577 | "X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,\n", 578 | " random_state = 0)\n", 579 | "linreg = LinearRegression().fit(X_train, y_train)\n", 580 | "\n", 581 | "print('(poly deg 2) linear model coeff (w):\\n{}'\n", 582 | " .format(linreg.coef_))\n", 583 | "print('(poly deg 2) linear model intercept (b): {:.3f}'\n", 584 | " .format(linreg.intercept_))\n", 585 | "print('(poly deg 2) R-squared score (training): {:.3f}'\n", 586 | " .format(linreg.score(X_train, y_train)))\n", 587 | "print('(poly deg 2) R-squared score (test): {:.3f}\\n'\n", 588 | " .format(linreg.score(X_test, y_test)))\n", 589 | "\n", 590 | "print('\\nAddition of many polynomial features often leads to\\n\\\n", 591 | "overfitting, so we often use polynomial features in combination\\n\\\n", 592 | "with regression that has a regularization penalty, like ridge\\n\\\n", 593 | "regression.\\n')\n", 594 | "\n", 595 | "X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,\n", 596 | " random_state = 0)\n", 597 | "linreg = Ridge().fit(X_train, y_train)\n", 598 | "\n", 599 | "print('(poly deg 2 + ridge) linear model coeff (w):\\n{}'\n", 600 | " .format(linreg.coef_))\n", 601 | "print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'\n", 602 | " .format(linreg.intercept_))\n", 603 | "print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'\n", 604 | " .format(linreg.score(X_train, y_train)))\n", 605 | "print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'\n", 606 | " .format(linreg.score(X_test, y_test)))" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "## Linear models for classification" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Logistic regression" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | 
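
Before the logistic regression cells that follow: the model passes a linear function of the features through the logistic (sigmoid) function, so the positive-class probability is P(y=1|x) = 1 / (1 + exp(-(w·x + b))). A self-contained check of this, on toy data invented for illustration, showing that scikit-learn's `predict_proba` is exactly the sigmoid of `decision_function` in the binary case:

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# toy binary problem, invented purely for illustration
X, y = make_classification(n_samples=50, n_features=2, n_redundant=0,
                           random_state=0)
clf = LogisticRegression().fit(X, y)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# decision_function returns the linear score w.x + b; applying the sigmoid
# to it reproduces predict_proba for the positive class
p_manual = sigmoid(clf.decision_function(X))
p_sklearn = clf.predict_proba(X)[:, 1]
print(np.allclose(p_manual, p_sklearn))  # expected: True
```
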
"metadata": {}, 626 | "source": [ 627 | "#### Logistic regression for binary classification on fruits dataset using height, width features (positive class: apple, negative class: others)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": { 634 | "collapsed": false 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "from sklearn.linear_model import LogisticRegression\n", 639 | "from adspy_shared_utilities import (\n", 640 | "plot_class_regions_for_classifier_subplot)\n", 641 | "\n", 642 | "fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))\n", 643 | "y_fruits_apple = y_fruits_2d == 1 # make into a binary problem: apples vs everything else\n", 644 | "X_train, X_test, y_train, y_test = (\n", 645 | "train_test_split(X_fruits_2d.as_matrix(),\n", 646 | " y_fruits_apple.as_matrix(),\n", 647 | " random_state = 0))\n", 648 | "\n", 649 | "clf = LogisticRegression(C=100).fit(X_train, y_train)\n", 650 | "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,\n", 651 | " None, 'Logistic regression \\\n", 652 | "for binary classification\\nFruit dataset: Apple vs others',\n", 653 | " subaxes)\n", 654 | "\n", 655 | "h = 6\n", 656 | "w = 8\n", 657 | "print('A fruit with height {} and width {} is predicted to be: {}'\n", 658 | " .format(h,w, ['not an apple', 'an apple'][clf.predict([[h,w]])[0]]))\n", 659 | "\n", 660 | "h = 10\n", 661 | "w = 7\n", 662 | "print('A fruit with height {} and width {} is predicted to be: {}'\n", 663 | " .format(h,w, ['not an apple', 'an apple'][clf.predict([[h,w]])[0]]))\n", 664 | "subaxes.set_xlabel('height')\n", 665 | "subaxes.set_ylabel('width')\n", 666 | "\n", 667 | "print('Accuracy of Logistic regression classifier on training set: {:.2f}'\n", 668 | " .format(clf.score(X_train, y_train)))\n", 669 | "print('Accuracy of Logistic regression classifier on test set: {:.2f}'\n", 670 | " .format(clf.score(X_test, y_test)))" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": {}, 676 | "source": [ 677 | "#### Logistic regression on simple synthetic dataset" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": { 684 | "collapsed": false, 685 | "scrolled": false 686 | }, 687 | "outputs": [], 688 | "source": [ 689 | "from sklearn.linear_model import LogisticRegression\n", 690 | "from adspy_shared_utilities import (\n", 691 | "plot_class_regions_for_classifier_subplot)\n", 692 | "\n", 693 | "\n", 694 | "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,\n", 695 | " random_state = 0)\n", 696 | "\n", 697 | "fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))\n", 698 | "clf = LogisticRegression().fit(X_train, y_train)\n", 699 | "title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)\n", 700 | "plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n", 701 | " None, None, title, subaxes)\n", 702 | "\n", 703 | "print('Accuracy of Logistic regression classifier on training set: {:.2f}'\n", 704 | " .format(clf.score(X_train, y_train)))\n", 705 | "print('Accuracy of Logistic regression classifier on test set: {:.2f}'\n", 706 | " .format(clf.score(X_test, y_test)))\n", 707 | " " 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": {}, 713 | "source": [ 714 | "#### Logistic regression regularization: C parameter" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": { 721 | "collapsed": false, 722 | "scrolled": false 723 | }, 724 | "outputs": [], 
725 | "source": [ 726 | "X_train, X_test, y_train, y_test = (\n", 727 | "train_test_split(X_fruits_2d.as_matrix(),\n", 728 | " y_fruits_apple.as_matrix(),\n", 729 | " random_state=0))\n", 730 | "\n", 731 | "fig, subaxes = plt.subplots(3, 1, figsize=(4, 10))\n", 732 | "\n", 733 | "for this_C, subplot in zip([0.1, 1, 100], subaxes):\n", 734 | " clf = LogisticRegression(C=this_C).fit(X_train, y_train)\n", 735 | " title ='Logistic regression (apple vs rest), C = {:.3f}'.format(this_C)\n", 736 | " \n", 737 | " plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n", 738 | " X_test, y_test, title,\n", 739 | " subplot)\n", 740 | "plt.tight_layout()" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "#### Application to real dataset" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "from sklearn.linear_model import LogisticRegression\n", 759 | "\n", 760 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 761 | "\n", 762 | "clf = LogisticRegression().fit(X_train, y_train)\n", 763 | "print('Breast cancer dataset')\n", 764 | "print('Accuracy of Logistic regression classifier on training set: {:.2f}'\n", 765 | " .format(clf.score(X_train, y_train)))\n", 766 | "print('Accuracy of Logistic regression classifier on test set: {:.2f}'\n", 767 | " .format(clf.score(X_test, y_test)))" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": {}, 773 | "source": [ 774 | "### Support Vector Machines" 775 | ] 776 | }, 777 | { 778 | "cell_type": "markdown", 779 | "metadata": {}, 780 | "source": [ 781 | "#### Linear Support Vector Machine" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": { 788 | "collapsed": false 789 | }, 790 | "outputs": [], 791 | "source": [ 792 | "from sklearn.svm import SVC\n", 793 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 794 | "\n", 795 | "\n", 796 | "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)\n", 797 | "\n", 798 | "fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))\n", 799 | "this_C = 1.0\n", 800 | "clf = SVC(kernel = 'linear', C=this_C).fit(X_train, y_train)\n", 801 | "title = 'Linear SVC, C = {:.3f}'.format(this_C)\n", 802 | "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)" 803 | ] 804 | }, 805 | { 806 | "cell_type": "markdown", 807 | "metadata": {}, 808 | "source": [ 809 | "#### Linear Support Vector Machine: C parameter" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "metadata": { 816 | "collapsed": false 817 | }, 818 | "outputs": [], 819 | "source": [ 820 | "from sklearn.svm import LinearSVC\n", 821 | "from adspy_shared_utilities import plot_class_regions_for_classifier\n", 822 | "\n", 823 | "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)\n", 824 | "fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))\n", 825 | "\n", 826 | "for this_C, subplot in zip([0.00001, 100], subaxes):\n", 827 | " clf = LinearSVC(C=this_C).fit(X_train, y_train)\n", 828 | " title = 'Linear SVC, C = {:.5f}'.format(this_C)\n", 829 | " plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n", 830 | " None, None, title, subplot)\n", 831 | "plt.tight_layout()" 832 | ] 833 | }, 834 | { 835 | "cell_type": 
"markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "#### Application to real dataset" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": null, 844 | "metadata": { 845 | "collapsed": false 846 | }, 847 | "outputs": [], 848 | "source": [ 849 | "from sklearn.svm import LinearSVC\n", 850 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 851 | "\n", 852 | "clf = LinearSVC().fit(X_train, y_train)\n", 853 | "print('Breast cancer dataset')\n", 854 | "print('Accuracy of Linear SVC classifier on training set: {:.2f}'\n", 855 | " .format(clf.score(X_train, y_train)))\n", 856 | "print('Accuracy of Linear SVC classifier on test set: {:.2f}'\n", 857 | " .format(clf.score(X_test, y_test)))" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": {}, 863 | "source": [ 864 | "### Multi-class classification with linear models" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": {}, 870 | "source": [ 871 | "#### LinearSVC with M classes generates M one vs rest classifiers." 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": null, 877 | "metadata": { 878 | "collapsed": false 879 | }, 880 | "outputs": [], 881 | "source": [ 882 | "from sklearn.svm import LinearSVC\n", 883 | "\n", 884 | "X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state = 0)\n", 885 | "\n", 886 | "clf = LinearSVC(C=5, random_state = 67).fit(X_train, y_train)\n", 887 | "print('Coefficients:\\n', clf.coef_)\n", 888 | "print('Intercepts:\\n', clf.intercept_)" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "#### Multi-class results on the fruit dataset" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": { 902 | "collapsed": false 903 | }, 904 | "outputs": [], 905 | "source": [ 906 | "plt.figure(figsize=(6,6))\n", 907 | "colors = ['r', 'g', 'b', 'y']\n", 908 | "cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF','#FFFF00'])\n", 909 | "\n", 910 | "plt.scatter(X_fruits_2d[['height']], X_fruits_2d[['width']],\n", 911 | " c=y_fruits_2d, cmap=cmap_fruits, edgecolor = 'black', alpha=.7)\n", 912 | "\n", 913 | "x_0_range = np.linspace(-10, 15)\n", 914 | "\n", 915 | "for w, b, color in zip(clf.coef_, clf.intercept_, ['r', 'g', 'b', 'y']):\n", 916 | " # Since class prediction with a linear model uses the formula y = w_0 x_0 + w_1 x_1 + b, \n", 917 | " # and the decision boundary is defined as being all points with y = 0, to plot x_1 as a \n", 918 | " # function of x_0 we just solve w_0 x_0 + w_1 x_1 + b = 0 for x_1:\n", 919 | " plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)\n", 920 | " \n", 921 | "plt.legend(target_names_fruits)\n", 922 | "plt.xlabel('height')\n", 923 | "plt.ylabel('width')\n", 924 | "plt.xlim(-2, 12)\n", 925 | "plt.ylim(-2, 15)\n", 926 | "plt.show()" 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "metadata": {}, 932 | "source": [ 933 | "## Kernelized Support Vector Machines" 934 | ] 935 | }, 936 | { 937 | "cell_type": "markdown", 938 | "metadata": {}, 939 | "source": [ 940 | "### Classification" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": null, 946 | "metadata": { 947 | "collapsed": false, 948 | "scrolled": false 949 | }, 950 | "outputs": [], 951 | "source": [ 952 | "from sklearn.svm import SVC\n", 953 | "from adspy_shared_utilities import 
plot_class_regions_for_classifier\n",
954 | "\n",
955 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
956 | "\n",
957 | "# The default SVC kernel is radial basis function (RBF)\n",
958 | "plot_class_regions_for_classifier(SVC().fit(X_train, y_train),\n",
959 | "                                 X_train, y_train, None, None,\n",
960 | "                                 'Support Vector Classifier: RBF kernel')\n",
961 | "\n",
962 | "# Compare decision boundaries with polynomial kernel, degree = 3\n",
963 | "plot_class_regions_for_classifier(SVC(kernel = 'poly', degree = 3)\n",
964 | "                                 .fit(X_train, y_train), X_train,\n",
965 | "                                 y_train, None, None,\n",
966 | "                                 'Support Vector Classifier: Polynomial kernel, degree = 3')"
967 | ]
968 | },
969 | {
970 | "cell_type": "markdown",
971 | "metadata": {},
972 | "source": [
973 | "#### Support Vector Machine with RBF kernel: gamma parameter"
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": null,
979 | "metadata": {
980 | "collapsed": false
981 | },
982 | "outputs": [],
983 | "source": [
984 | "from adspy_shared_utilities import plot_class_regions_for_classifier\n",
985 | "\n",
986 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
987 | "fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))\n",
988 | "\n",
989 | "for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):\n",
990 | "    clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)\n",
991 | "    title = 'Support Vector Classifier: \\nRBF kernel, gamma = {:.2f}'.format(this_gamma)\n",
992 | "    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
993 | "                                             None, None, title, subplot)\n",
994 | "    plt.tight_layout()"
995 | ]
996 | },
997 | {
998 | "cell_type": "markdown",
999 | "metadata": {},
1000 | "source": [
1001 | "#### Support Vector Machine with RBF kernel: using both C and gamma parameter "
1002 | ]
1003 | },
1004 | {
1005 | "cell_type": "code",
1006 | "execution_count": null,
1007 | "metadata": {
1008 | "collapsed": false
1009 | },
1010 | "outputs": [],
1011 | "source": [
1012 | "from sklearn.svm import SVC\n",
1013 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
1014 | "\n",
1015 | "from sklearn.model_selection import train_test_split\n",
1016 | "\n",
1017 | "\n",
1018 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
1019 | "fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)\n",
1020 | "\n",
1021 | "for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):\n",
1022 | "    \n",
1023 | "    for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):\n",
1024 | "        title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)\n",
1025 | "        clf = SVC(kernel = 'rbf', gamma = this_gamma,\n",
1026 | "                 C = this_C).fit(X_train, y_train)\n",
1027 | "        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
1028 | "                                                 X_test, y_test, title,\n",
1029 | "                                                 subplot)\n",
1030 | "        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)"
1031 | ]
1032 | },
1033 | {
1034 | "cell_type": "markdown",
1035 | "metadata": {},
1036 | "source": [
1037 | "### Application of SVMs to a real dataset: unnormalized data"
1038 | ]
1039 | },
1040 | {
1041 | "cell_type": "code",
1042 | "execution_count": null,
1043 | "metadata": {
1044 | "collapsed": false
1045 | },
1046 | "outputs": [],
1047 | "source": [
1048 | "from sklearn.svm import SVC\n",
1049 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,\n",
1050 | "                                                   random_state = 0)\n",
1051 | "\n",
1052 | "clf = 
SVC(C=10).fit(X_train, y_train)\n",
1053 | "print('Breast cancer dataset (unnormalized features)')\n",
1054 | "print('Accuracy of RBF-kernel SVC on training set: {:.2f}'\n",
1055 | "     .format(clf.score(X_train, y_train)))\n",
1056 | "print('Accuracy of RBF-kernel SVC on test set: {:.2f}'\n",
1057 | "     .format(clf.score(X_test, y_test)))"
1058 | ]
1059 | },
1060 | {
1061 | "cell_type": "markdown",
1062 | "metadata": {},
1063 | "source": [
1064 | "### Application of SVMs to a real dataset: normalized data with feature preprocessing using minmax scaling"
1065 | ]
1066 | },
1067 | {
1068 | "cell_type": "code",
1069 | "execution_count": null,
1070 | "metadata": {
1071 | "collapsed": false
1072 | },
1073 | "outputs": [],
1074 | "source": [
1075 | "from sklearn.preprocessing import MinMaxScaler\n",
1076 | "scaler = MinMaxScaler()\n",
1077 | "X_train_scaled = scaler.fit_transform(X_train)\n",
1078 | "X_test_scaled = scaler.transform(X_test)\n",
1079 | "\n",
1080 | "clf = SVC(C=10).fit(X_train_scaled, y_train)\n",
1081 | "print('Breast cancer dataset (normalized with MinMax scaling)')\n",
1082 | "print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'\n",
1083 | "     .format(clf.score(X_train_scaled, y_train)))\n",
1084 | "print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'\n",
1085 | "     .format(clf.score(X_test_scaled, y_test)))"
1086 | ]
1087 | },
1088 | {
1089 | "cell_type": "markdown",
1090 | "metadata": {
1091 | "collapsed": true
1092 | },
1093 | "source": [
1094 | "## Cross-validation"
1095 | ]
1096 | },
1097 | {
1098 | "cell_type": "markdown",
1099 | "metadata": {},
1100 | "source": [
1101 | "### Example based on k-NN classifier with fruit dataset (2 features)"
1102 | ]
1103 | },
1104 | {
1105 | "cell_type": "code",
1106 | "execution_count": null,
1107 | "metadata": {
1108 | "collapsed": false
1109 | },
1110 | "outputs": [],
1111 | "source": [
1112 | "from sklearn.model_selection import cross_val_score\n",
1113 | "\n",
1114 | "clf = KNeighborsClassifier(n_neighbors = 5)\n",
1115 | "X = X_fruits_2d.values   # .values instead of .as_matrix(), which was removed in pandas 1.0\n",
1116 | "y = y_fruits_2d.values\n",
1117 | "cv_scores = cross_val_score(clf, X, y, cv=3)   # make the 3 folds explicit; newer scikit-learn defaults to 5\n",
1118 | "\n",
1119 | "print('Cross-validation scores (3-fold):', cv_scores)\n",
1120 | "print('Mean cross-validation score (3-fold): {:.3f}'\n",
1121 | "     .format(np.mean(cv_scores)))"
1122 | ]
1123 | },
1124 | {
1125 | "cell_type": "markdown",
1126 | "metadata": {},
1127 | "source": [
1128 | "### A note on performing cross-validation for more advanced scenarios.\n",
1129 | "\n",
1130 | "In some cases (e.g. when feature values have very different ranges), we've seen the need to scale or normalize the training and test sets before use with a classifier. The proper way to do cross-validation when you need to scale the data is *not* to scale the entire dataset with a single transform, since this will indirectly leak information into the training data about the whole dataset, including the test data (see the lecture on data leakage later in the course). Instead, scaling/normalizing must be computed and applied for each cross-validation fold separately. To do this, the easiest way in scikit-learn is to use *pipelines*. 
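
A minimal sketch of that idea, reusing `X` and `y` from the cross-validation cell above: wrapping the scaler and the classifier in a pipeline makes `cross_val_score` re-fit the scaler on each training fold only, so no information from the validation fold leaks into the preprocessing.

```python
# Minimal sketch: per-fold scaling without leakage, via a Pipeline.
# Assumes X and y are the fruit feature/label arrays defined above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors=5))

# the scaler is fit on each training fold, then applied to its validation fold
scores = cross_val_score(pipe, X, y, cv=3)
print('Per-fold accuracy with in-fold scaling:', scores)
```
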
While these are beyond the scope of this course, further information is available in the scikit-learn documentation here:\n", 1131 | "\n", 1132 | "http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html\n", 1133 | "\n", 1134 | "or the Pipeline section in the recommended textbook: Introduction to Machine Learning with Python by Andreas C. Müller and Sarah Guido (O'Reilly Media)." 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "markdown", 1139 | "metadata": {}, 1140 | "source": [ 1141 | "## Validation curve example" 1142 | ] 1143 | }, 1144 | { 1145 | "cell_type": "code", 1146 | "execution_count": null, 1147 | "metadata": { 1148 | "collapsed": true 1149 | }, 1150 | "outputs": [], 1151 | "source": [ 1152 | "from sklearn.svm import SVC\n", 1153 | "from sklearn.model_selection import validation_curve\n", 1154 | "\n", 1155 | "param_range = np.logspace(-3, 3, 4)\n", 1156 | "train_scores, test_scores = validation_curve(SVC(), X, y,\n", 1157 | " param_name='gamma',\n", 1158 | " param_range=param_range, cv=3)" 1159 | ] 1160 | }, 1161 | { 1162 | "cell_type": "code", 1163 | "execution_count": null, 1164 | "metadata": { 1165 | "collapsed": false 1166 | }, 1167 | "outputs": [], 1168 | "source": [ 1169 | "print(train_scores)" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": null, 1175 | "metadata": { 1176 | "collapsed": false 1177 | }, 1178 | "outputs": [], 1179 | "source": [ 1180 | "print(test_scores)" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": null, 1186 | "metadata": { 1187 | "collapsed": false 1188 | }, 1189 | "outputs": [], 1190 | "source": [ 1191 | "# This code based on scikit-learn validation_plot example\n", 1192 | "# See: http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html\n", 1193 | "plt.figure()\n", 1194 | "\n", 1195 | "train_scores_mean = np.mean(train_scores, axis=1)\n", 1196 | "train_scores_std = np.std(train_scores, axis=1)\n", 1197 | "test_scores_mean = np.mean(test_scores, axis=1)\n", 1198 | "test_scores_std = np.std(test_scores, axis=1)\n", 1199 | "\n", 1200 | "plt.title('Validation Curve with SVM')\n", 1201 | "plt.xlabel('$\\gamma$ (gamma)')\n", 1202 | "plt.ylabel('Score')\n", 1203 | "plt.ylim(0.0, 1.1)\n", 1204 | "lw = 2\n", 1205 | "\n", 1206 | "plt.semilogx(param_range, train_scores_mean, label='Training score',\n", 1207 | " color='darkorange', lw=lw)\n", 1208 | "\n", 1209 | "plt.fill_between(param_range, train_scores_mean - train_scores_std,\n", 1210 | " train_scores_mean + train_scores_std, alpha=0.2,\n", 1211 | " color='darkorange', lw=lw)\n", 1212 | "\n", 1213 | "plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',\n", 1214 | " color='navy', lw=lw)\n", 1215 | "\n", 1216 | "plt.fill_between(param_range, test_scores_mean - test_scores_std,\n", 1217 | " test_scores_mean + test_scores_std, alpha=0.2,\n", 1218 | " color='navy', lw=lw)\n", 1219 | "\n", 1220 | "plt.legend(loc='best')\n", 1221 | "plt.show()" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "markdown", 1226 | "metadata": { 1227 | "collapsed": true 1228 | }, 1229 | "source": [ 1230 | "## Decision Trees" 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": null, 1236 | "metadata": { 1237 | "collapsed": false 1238 | }, 1239 | "outputs": [], 1240 | "source": [ 1241 | "from sklearn.datasets import load_iris\n", 1242 | "from sklearn.tree import DecisionTreeClassifier\n", 1243 | "from adspy_shared_utilities import plot_decision_tree\n", 1244 | "from 
sklearn.model_selection import train_test_split\n", 1245 | "\n", 1246 | "\n", 1247 | "iris = load_iris()\n", 1248 | "\n", 1249 | "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 3)\n", 1250 | "clf = DecisionTreeClassifier().fit(X_train, y_train)\n", 1251 | "\n", 1252 | "print('Accuracy of Decision Tree classifier on training set: {:.2f}'\n", 1253 | " .format(clf.score(X_train, y_train)))\n", 1254 | "print('Accuracy of Decision Tree classifier on test set: {:.2f}'\n", 1255 | " .format(clf.score(X_test, y_test)))" 1256 | ] 1257 | }, 1258 | { 1259 | "cell_type": "markdown", 1260 | "metadata": {}, 1261 | "source": [ 1262 | "#### Setting max decision tree depth to help avoid overfitting" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": null, 1268 | "metadata": { 1269 | "collapsed": false 1270 | }, 1271 | "outputs": [], 1272 | "source": [ 1273 | "clf2 = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)\n", 1274 | "\n", 1275 | "print('Accuracy of Decision Tree classifier on training set: {:.2f}'\n", 1276 | " .format(clf2.score(X_train, y_train)))\n", 1277 | "print('Accuracy of Decision Tree classifier on test set: {:.2f}'\n", 1278 | " .format(clf2.score(X_test, y_test)))" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "markdown", 1283 | "metadata": {}, 1284 | "source": [ 1285 | "#### Visualizing decision trees" 1286 | ] 1287 | }, 1288 | { 1289 | "cell_type": "code", 1290 | "execution_count": null, 1291 | "metadata": { 1292 | "collapsed": false 1293 | }, 1294 | "outputs": [], 1295 | "source": [ 1296 | "plot_decision_tree(clf, iris.feature_names, iris.target_names)" 1297 | ] 1298 | }, 1299 | { 1300 | "cell_type": "markdown", 1301 | "metadata": {}, 1302 | "source": [ 1303 | "#### Pre-pruned version (max_depth = 3)" 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "execution_count": null, 1309 | "metadata": { 1310 | "collapsed": false 1311 | }, 1312 | "outputs": [], 1313 | "source": [ 1314 | "plot_decision_tree(clf2, iris.feature_names, iris.target_names)" 1315 | ] 1316 | }, 1317 | { 1318 | "cell_type": "markdown", 1319 | "metadata": {}, 1320 | "source": [ 1321 | "#### Feature importance" 1322 | ] 1323 | }, 1324 | { 1325 | "cell_type": "code", 1326 | "execution_count": null, 1327 | "metadata": { 1328 | "collapsed": false 1329 | }, 1330 | "outputs": [], 1331 | "source": [ 1332 | "from adspy_shared_utilities import plot_feature_importances\n", 1333 | "\n", 1334 | "plt.figure(figsize=(10,4), dpi=80)\n", 1335 | "plot_feature_importances(clf, iris.feature_names)\n", 1336 | "plt.show()\n", 1337 | "\n", 1338 | "print('Feature importances: {}'.format(clf.feature_importances_))" 1339 | ] 1340 | }, 1341 | { 1342 | "cell_type": "code", 1343 | "execution_count": null, 1344 | "metadata": { 1345 | "collapsed": false 1346 | }, 1347 | "outputs": [], 1348 | "source": [ 1349 | "from sklearn.tree import DecisionTreeClassifier\n", 1350 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 1351 | "\n", 1352 | "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 0)\n", 1353 | "fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))\n", 1354 | "\n", 1355 | "pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]\n", 1356 | "tree_max_depth = 4\n", 1357 | "\n", 1358 | "for pair, axis in zip(pair_list, subaxes):\n", 1359 | " X = X_train[:, pair]\n", 1360 | " y = y_train\n", 1361 | " \n", 1362 | " clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)\n", 
1363 | " title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)\n", 1364 | " plot_class_regions_for_classifier_subplot(clf, X, y, None,\n", 1365 | " None, title, axis,\n", 1366 | " iris.target_names)\n", 1367 | " \n", 1368 | " axis.set_xlabel(iris.feature_names[pair[0]])\n", 1369 | " axis.set_ylabel(iris.feature_names[pair[1]])\n", 1370 | " \n", 1371 | "plt.tight_layout()\n", 1372 | "plt.show()" 1373 | ] 1374 | }, 1375 | { 1376 | "cell_type": "markdown", 1377 | "metadata": {}, 1378 | "source": [ 1379 | "#### Decision Trees on a real-world dataset" 1380 | ] 1381 | }, 1382 | { 1383 | "cell_type": "code", 1384 | "execution_count": null, 1385 | "metadata": { 1386 | "collapsed": false 1387 | }, 1388 | "outputs": [], 1389 | "source": [ 1390 | "from sklearn.tree import DecisionTreeClassifier\n", 1391 | "from adspy_shared_utilities import plot_decision_tree\n", 1392 | "from adspy_shared_utilities import plot_feature_importances\n", 1393 | "\n", 1394 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 1395 | "\n", 1396 | "clf = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 8,\n", 1397 | " random_state = 0).fit(X_train, y_train)\n", 1398 | "\n", 1399 | "plot_decision_tree(clf, cancer.feature_names, cancer.target_names)" 1400 | ] 1401 | }, 1402 | { 1403 | "cell_type": "code", 1404 | "execution_count": null, 1405 | "metadata": { 1406 | "collapsed": false 1407 | }, 1408 | "outputs": [], 1409 | "source": [ 1410 | "print('Breast cancer dataset: decision tree')\n", 1411 | "print('Accuracy of DT classifier on training set: {:.2f}'\n", 1412 | " .format(clf.score(X_train, y_train)))\n", 1413 | "print('Accuracy of DT classifier on test set: {:.2f}'\n", 1414 | " .format(clf.score(X_test, y_test)))\n", 1415 | "\n", 1416 | "plt.figure(figsize=(10,6),dpi=80)\n", 1417 | "plot_feature_importances(clf, cancer.feature_names)\n", 1418 | "plt.tight_layout()\n", 1419 | "\n", 1420 | "plt.show()" 1421 | ] 1422 | } 1423 | ], 1424 | "metadata": { 1425 | "anaconda-cloud": {}, 1426 | "kernelspec": { 1427 | "display_name": "Python 3", 1428 | "language": "python", 1429 | "name": "python3" 1430 | }, 1431 | "language_info": { 1432 | "codemirror_mode": { 1433 | "name": "ipython", 1434 | "version": 3 1435 | }, 1436 | "file_extension": ".py", 1437 | "mimetype": "text/x-python", 1438 | "name": "python", 1439 | "nbconvert_exporter": "python", 1440 | "pygments_lexer": "ipython3", 1441 | "version": "3.5.2" 1442 | } 1443 | }, 1444 | "nbformat": 4, 1445 | "nbformat_minor": 2 1446 | } 1447 | -------------------------------------------------------------------------------- /Module 3 Quiz.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianhuaiyuan/Applied-machine-learning-in-python/4287c946be45dd05fe7e0d1f3f447ff86879060f/Module 3 Quiz.pdf -------------------------------------------------------------------------------- /Module 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.0** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "source": [ 20 | "# Applied Machine Learning: Module 3 (Evaluation)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Evaluation for Classification" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "### Preamble" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "%matplotlib notebook\n", 46 | "import numpy as np\n", 47 | "import pandas as pd\n", 48 | "import seaborn as sns\n", 49 | "import matplotlib.pyplot as plt\n", 50 | "from sklearn.model_selection import train_test_split\n", 51 | "from sklearn.datasets import load_digits\n", 52 | "\n", 53 | "dataset = load_digits()\n", 54 | "X, y = dataset.data, dataset.target\n", 55 | "\n", 56 | "for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):\n", 57 | " print(class_name,class_count)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "# Creating a dataset with imbalanced binary classes: \n", 69 | "# Negative class (0) is 'not digit 1' \n", 70 | "# Positive class (1) is 'digit 1'\n", 71 | "y_binary_imbalanced = y.copy()\n", 72 | "y_binary_imbalanced[y_binary_imbalanced != 1] = 0\n", 73 | "\n", 74 | "print('Original labels:\\t', y[1:30])\n", 75 | "print('New binary labels:\\t', y_binary_imbalanced[1:30])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false, 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "np.bincount(y_binary_imbalanced) # Negative class (0) is the most frequent class" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n", 99 | "\n", 100 | "# Accuracy of Support Vector Machine classifier\n", 101 | "from sklearn.svm import SVC\n", 102 | "\n", 103 | "svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)\n", 104 | "svm.score(X_test, y_test)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Dummy Classifiers" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "collapsed": true 118 | }, 119 | "source": [ 120 | "DummyClassifier is a classifier that makes predictions using simple rules, which can be useful as a baseline for comparison against actual classifiers, especially with imbalanced classes." 
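
> Editor's note (addition, not original course code): the cells that follow compare `DummyClassifier` against real classifiers. A minimal, self-contained sketch of why this baseline matters — when roughly 90% of labels are the negative class, "always predict the majority class" already scores about 0.90 accuracy, so a useful classifier must clear that bar, not just beat 50%:

```python
# Editor's sketch: the majority-class baseline shows why raw accuracy
# is misleading on imbalanced data.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
y_imbalanced = (y == 1).astype(int)   # positive class 'digit 1' is ~10% of samples

X_tr, X_te, y_tr, y_te = train_test_split(X, y_imbalanced, random_state=0)

# Any classifier must beat this number to do better than 'always predict 0'
majority_fraction = np.bincount(y_te).max() / len(y_te)
print('Majority-class baseline accuracy: {:.3f}'.format(majority_fraction))

dummy = DummyClassifier(strategy='most_frequent').fit(X_tr, y_tr)
print('DummyClassifier accuracy:         {:.3f}'.format(dummy.score(X_te, y_te)))
```
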
121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "from sklearn.dummy import DummyClassifier\n", 132 | "\n", 133 | "# Negative class (0) is most frequent\n", 134 | "dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)\n", 135 | "# Therefore the dummy 'most_frequent' classifier always predicts class 0\n", 136 | "y_dummy_predictions = dummy_majority.predict(X_test)\n", 137 | "\n", 138 | "y_dummy_predictions" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "dummy_majority.score(X_test, y_test)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "svm = SVC(kernel='linear', C=1).fit(X_train, y_train)\n", 161 | "svm.score(X_test, y_test)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "### Confusion matrices" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "#### Binary (two-class) confusion matrix" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "from sklearn.metrics import confusion_matrix\n", 187 | "\n", 188 | "# Negative class (0) is most frequent\n", 189 | "dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)\n", 190 | "y_majority_predicted = dummy_majority.predict(X_test)\n", 191 | "confusion = confusion_matrix(y_test, y_majority_predicted)\n", 192 | "\n", 193 | "print('Most frequent class (dummy classifier)\\n', confusion)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "# produces random predictions w/ same class proportion as training set\n", 205 | "dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train)\n", 206 | "y_classprop_predicted = dummy_classprop.predict(X_test)\n", 207 | "confusion = confusion_matrix(y_test, y_classprop_predicted)\n", 208 | "\n", 209 | "print('Random class-proportional prediction (dummy classifier)\\n', confusion)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": false, 217 | "scrolled": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "svm = SVC(kernel='linear', C=1).fit(X_train, y_train)\n", 222 | "svm_predicted = svm.predict(X_test)\n", 223 | "confusion = confusion_matrix(y_test, svm_predicted)\n", 224 | "\n", 225 | "print('Support vector machine classifier (linear kernel, C=1)\\n', confusion)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "from sklearn.linear_model import LogisticRegression\n", 237 | "\n", 238 | "lr = LogisticRegression().fit(X_train, y_train)\n", 239 | "lr_predicted = lr.predict(X_test)\n", 240 | "confusion = confusion_matrix(y_test, lr_predicted)\n", 241 | "\n", 242 | "print('Logistic regression classifier (default settings)\\n', confusion)" 243 | ] 244 | }, 245 | { 246 | 
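
> Editor's note (addition): scikit-learn's `confusion_matrix` puts true labels on rows and predicted labels on columns, so for binary 0/1 labels the layout is `[[TN, FP], [FN, TP]]` — worth keeping in mind when reading the matrices printed above and below. A tiny worked example with illustrative values:

```python
# Rows = true class, columns = predicted class:
# [[TN, FP],
#  [FN, TP]]
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 0, 1, 1, 1, 0]
print(confusion_matrix(y_true, y_pred))
# [[2 1]    2 true negatives, 1 false positive
#  [1 2]]   1 false negative, 2 true positives
```
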
"cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "from sklearn.tree import DecisionTreeClassifier\n", 254 | "\n", 255 | "dt = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)\n", 256 | "tree_predicted = dt.predict(X_test)\n", 257 | "confusion = confusion_matrix(y_test, tree_predicted)\n", 258 | "\n", 259 | "print('Decision tree classifier (max_depth = 2)\\n', confusion)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Evaluation metrics for binary classification" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", 278 | "# Accuracy = TP + TN / (TP + TN + FP + FN)\n", 279 | "# Precision = TP / (TP + FP)\n", 280 | "# Recall = TP / (TP + FN) Also known as sensitivity, or True Positive Rate\n", 281 | "# F1 = 2 * Precision * Recall / (Precision + Recall) \n", 282 | "print('Accuracy: {:.2f}'.format(accuracy_score(y_test, tree_predicted)))\n", 283 | "print('Precision: {:.2f}'.format(precision_score(y_test, tree_predicted)))\n", 284 | "print('Recall: {:.2f}'.format(recall_score(y_test, tree_predicted)))\n", 285 | "print('F1: {:.2f}'.format(f1_score(y_test, tree_predicted)))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "# Combined report with all above metrics\n", 297 | "from sklearn.metrics import classification_report\n", 298 | "\n", 299 | "print(classification_report(y_test, tree_predicted, target_names=['not 1', '1']))" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": false, 307 | "scrolled": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "print('Random class-proportional (dummy)\\n', \n", 312 | " classification_report(y_test, y_classprop_predicted, target_names=['not 1', '1']))\n", 313 | "print('SVM\\n', \n", 314 | " classification_report(y_test, svm_predicted, target_names = ['not 1', '1']))\n", 315 | "print('Logistic regression\\n', \n", 316 | " classification_report(y_test, lr_predicted, target_names = ['not 1', '1']))\n", 317 | "print('Decision tree\\n', \n", 318 | " classification_report(y_test, tree_predicted, target_names = ['not 1', '1']))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "### Decision functions" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n", 337 | "y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test)\n", 338 | "y_score_list = list(zip(y_test[0:20], y_scores_lr[0:20]))\n", 339 | "\n", 340 | "# show the decision_function scores for first 20 instances\n", 341 | "y_score_list" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n", 353 | "y_proba_lr = 
lr.fit(X_train, y_train).predict_proba(X_test)\n", 354 | "y_proba_list = list(zip(y_test[0:20], y_proba_lr[0:20,1]))\n", 355 | "\n", 356 | "# show the probability of positive class for first 20 instances\n", 357 | "y_proba_list" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "### Precision-recall curves" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "from sklearn.metrics import precision_recall_curve\n", 376 | "\n", 377 | "precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)\n", 378 | "closest_zero = np.argmin(np.abs(thresholds))\n", 379 | "closest_zero_p = precision[closest_zero]\n", 380 | "closest_zero_r = recall[closest_zero]\n", 381 | "\n", 382 | "plt.figure()\n", 383 | "plt.xlim([0.0, 1.01])\n", 384 | "plt.ylim([0.0, 1.01])\n", 385 | "plt.plot(precision, recall, label='Precision-Recall Curve')\n", 386 | "plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)\n", 387 | "plt.xlabel('Precision', fontsize=16)\n", 388 | "plt.ylabel('Recall', fontsize=16)\n", 389 | "plt.axes().set_aspect('equal')\n", 390 | "plt.show()" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "### ROC curves, Area-Under-Curve (AUC)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": false 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "from sklearn.metrics import roc_curve, auc\n", 409 | "\n", 410 | "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n", 411 | "\n", 412 | "y_score_lr = lr.fit(X_train, y_train).decision_function(X_test)\n", 413 | "fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)\n", 414 | "roc_auc_lr = auc(fpr_lr, tpr_lr)\n", 415 | "\n", 416 | "plt.figure()\n", 417 | "plt.xlim([-0.01, 1.00])\n", 418 | "plt.ylim([-0.01, 1.01])\n", 419 | "plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))\n", 420 | "plt.xlabel('False Positive Rate', fontsize=16)\n", 421 | "plt.ylabel('True Positive Rate', fontsize=16)\n", 422 | "plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16)\n", 423 | "plt.legend(loc='lower right', fontsize=13)\n", 424 | "plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')\n", 425 | "plt.axes().set_aspect('equal')\n", 426 | "plt.show()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": false, 434 | "scrolled": false 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "from matplotlib import cm\n", 439 | "\n", 440 | "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n", 441 | "\n", 442 | "plt.figure()\n", 443 | "plt.xlim([-0.01, 1.00])\n", 444 | "plt.ylim([-0.01, 1.01])\n", 445 | "for g in [0.01, 0.1, 0.20, 1]:\n", 446 | " svm = SVC(gamma=g).fit(X_train, y_train)\n", 447 | " y_score_svm = svm.decision_function(X_test)\n", 448 | " fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)\n", 449 | " roc_auc_svm = auc(fpr_svm, tpr_svm)\n", 450 | " accuracy_svm = svm.score(X_test, y_test)\n", 451 | " print(\"gamma = {:.2f} accuracy = {:.2f} AUC = {:.2f}\".format(g, accuracy_svm, \n", 452 | " roc_auc_svm))\n", 453 | " plt.plot(fpr_svm, tpr_svm, lw=3, alpha=0.7, \n", 454 | " label='SVM (gamma = {:0.2f}, area = 
{:0.2f})'.format(g, roc_auc_svm))\n", 455 | "\n", 456 | "plt.xlabel('False Positive Rate', fontsize=16)\n", 457 | "plt.ylabel('True Positive Rate (Recall)', fontsize=16)\n", 458 | "plt.plot([0, 1], [0, 1], color='k', lw=0.5, linestyle='--')\n", 459 | "plt.legend(loc=\"lower right\", fontsize=11)\n", 460 | "plt.title('ROC curve: (1-of-10 digits classifier)', fontsize=16)\n", 461 | "plt.axes().set_aspect('equal')\n", 462 | "\n", 463 | "plt.show()" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "### Evaluation measures for multi-class classification" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "#### Multi-class confusion matrix" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": false, 485 | "scrolled": false 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "dataset = load_digits()\n", 490 | "X, y = dataset.data, dataset.target\n", 491 | "X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y, random_state=0)\n", 492 | "\n", 493 | "\n", 494 | "svm = SVC(kernel = 'linear').fit(X_train_mc, y_train_mc)\n", 495 | "svm_predicted_mc = svm.predict(X_test_mc)\n", 496 | "confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc)\n", 497 | "df_cm = pd.DataFrame(confusion_mc, \n", 498 | " index = [i for i in range(0,10)], columns = [i for i in range(0,10)])\n", 499 | "\n", 500 | "plt.figure(figsize=(5.5,4))\n", 501 | "sns.heatmap(df_cm, annot=True)\n", 502 | "plt.title('SVM Linear Kernel \\nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, \n", 503 | " svm_predicted_mc)))\n", 504 | "plt.ylabel('True label')\n", 505 | "plt.xlabel('Predicted label')\n", 506 | "\n", 507 | "\n", 508 | "svm = SVC(kernel = 'rbf').fit(X_train_mc, y_train_mc)\n", 509 | "svm_predicted_mc = svm.predict(X_test_mc)\n", 510 | "confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc)\n", 511 | "df_cm = pd.DataFrame(confusion_mc, index = [i for i in range(0,10)],\n", 512 | " columns = [i for i in range(0,10)])\n", 513 | "\n", 514 | "plt.figure(figsize = (5.5,4))\n", 515 | "sns.heatmap(df_cm, annot=True)\n", 516 | "plt.title('SVM RBF Kernel \\nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, \n", 517 | " svm_predicted_mc)))\n", 518 | "plt.ylabel('True label')\n", 519 | "plt.xlabel('Predicted label');" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "#### Multi-class classification report" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "print(classification_report(y_test_mc, svm_predicted_mc))" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "#### Micro- vs. 
macro-averaged metrics" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": false 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "print('Micro-averaged precision = {:.2f} (treat instances equally)'\n", 556 | " .format(precision_score(y_test_mc, svm_predicted_mc, average = 'micro')))\n", 557 | "print('Macro-averaged precision = {:.2f} (treat classes equally)'\n", 558 | " .format(precision_score(y_test_mc, svm_predicted_mc, average = 'macro')))" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": { 565 | "collapsed": false 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "print('Micro-averaged f1 = {:.2f} (treat instances equally)'\n", 570 | " .format(f1_score(y_test_mc, svm_predicted_mc, average = 'micro')))\n", 571 | "print('Macro-averaged f1 = {:.2f} (treat classes equally)'\n", 572 | " .format(f1_score(y_test_mc, svm_predicted_mc, average = 'macro')))" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": {}, 578 | "source": [ 579 | "### Regression evaluation metrics" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": { 586 | "collapsed": false 587 | }, 588 | "outputs": [], 589 | "source": [ 590 | "%matplotlib notebook\n", 591 | "import matplotlib.pyplot as plt\n", 592 | "import numpy as np\n", 593 | "from sklearn.model_selection import train_test_split\n", 594 | "from sklearn import datasets\n", 595 | "from sklearn.linear_model import LinearRegression\n", 596 | "from sklearn.metrics import mean_squared_error, r2_score\n", 597 | "from sklearn.dummy import DummyRegressor\n", 598 | "\n", 599 | "diabetes = datasets.load_diabetes()\n", 600 | "\n", 601 | "X = diabetes.data[:, None, 6]\n", 602 | "y = diabetes.target\n", 603 | "\n", 604 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", 605 | "\n", 606 | "lm = LinearRegression().fit(X_train, y_train)\n", 607 | "lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)\n", 608 | "\n", 609 | "y_predict = lm.predict(X_test)\n", 610 | "y_predict_dummy_mean = lm_dummy_mean.predict(X_test)\n", 611 | "\n", 612 | "print('Linear model, coefficients: ', lm.coef_)\n", 613 | "print(\"Mean squared error (dummy): {:.2f}\".format(mean_squared_error(y_test, \n", 614 | " y_predict_dummy_mean)))\n", 615 | "print(\"Mean squared error (linear model): {:.2f}\".format(mean_squared_error(y_test, y_predict)))\n", 616 | "print(\"r2_score (dummy): {:.2f}\".format(r2_score(y_test, y_predict_dummy_mean)))\n", 617 | "print(\"r2_score (linear model): {:.2f}\".format(r2_score(y_test, y_predict)))\n", 618 | "\n", 619 | "# Plot outputs\n", 620 | "plt.scatter(X_test, y_test, color='black')\n", 621 | "plt.plot(X_test, y_predict, color='green', linewidth=2)\n", 622 | "plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle = 'dashed', \n", 623 | " linewidth=2, label = 'dummy')\n", 624 | "\n", 625 | "plt.show()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "### Model selection using evaluation metrics" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": {}, 638 | "source": [ 639 | "#### Cross-validation example" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": { 646 | "collapsed": false 647 | }, 648 | "outputs": [], 649 | "source": [ 650 | "from sklearn.model_selection import cross_val_score\n", 651 | 
"from sklearn.svm import SVC\n", 652 | "\n", 653 | "dataset = load_digits()\n", 654 | "# again, making this a binary problem with 'digit 1' as positive class \n", 655 | "# and 'not 1' as negative class\n", 656 | "X, y = dataset.data, dataset.target == 1\n", 657 | "clf = SVC(kernel='linear', C=1)\n", 658 | "\n", 659 | "# accuracy is the default scoring metric\n", 660 | "print('Cross-validation (accuracy)', cross_val_score(clf, X, y, cv=5))\n", 661 | "# use AUC as scoring metric\n", 662 | "print('Cross-validation (AUC)', cross_val_score(clf, X, y, cv=5, scoring = 'roc_auc'))\n", 663 | "# use recall as scoring metric\n", 664 | "print('Cross-validation (recall)', cross_val_score(clf, X, y, cv=5, scoring = 'recall'))" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | "#### Grid search example" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": { 678 | "collapsed": false 679 | }, 680 | "outputs": [], 681 | "source": [ 682 | "from sklearn.svm import SVC\n", 683 | "from sklearn.model_selection import GridSearchCV\n", 684 | "from sklearn.metrics import roc_auc_score\n", 685 | "\n", 686 | "dataset = load_digits()\n", 687 | "X, y = dataset.data, dataset.target == 1\n", 688 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", 689 | "\n", 690 | "clf = SVC(kernel='rbf')\n", 691 | "grid_values = {'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100]}\n", 692 | "\n", 693 | "# default metric to optimize over grid parameters: accuracy\n", 694 | "grid_clf_acc = GridSearchCV(clf, param_grid = grid_values)\n", 695 | "grid_clf_acc.fit(X_train, y_train)\n", 696 | "y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test) \n", 697 | "\n", 698 | "print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)\n", 699 | "print('Grid best score (accuracy): ', grid_clf_acc.best_score_)\n", 700 | "\n", 701 | "# alternative metric to optimize over grid parameters: AUC\n", 702 | "grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')\n", 703 | "grid_clf_auc.fit(X_train, y_train)\n", 704 | "y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) \n", 705 | "\n", 706 | "print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))\n", 707 | "print('Grid best parameter (max. 
AUC): ', grid_clf_auc.best_params_)\n", 708 | "print('Grid best score (AUC): ', grid_clf_auc.best_score_)\n" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "#### Evaluation metrics supported for model selection" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": { 722 | "collapsed": false 723 | }, 724 | "outputs": [], 725 | "source": [ 726 | "from sklearn.metrics.scorer import SCORERS\n", 727 | "\n", 728 | "print(sorted(list(SCORERS.keys())))" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "### Two-feature classification example using the digits dataset" 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "metadata": {}, 741 | "source": [ 742 | "#### Optimizing a classifier using different evaluation metrics" 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": null, 748 | "metadata": { 749 | "collapsed": false, 750 | "scrolled": false 751 | }, 752 | "outputs": [], 753 | "source": [ 754 | "from sklearn.datasets import load_digits\n", 755 | "from sklearn.model_selection import train_test_split\n", 756 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 757 | "from sklearn.svm import SVC\n", 758 | "from sklearn.model_selection import GridSearchCV\n", 759 | "\n", 760 | "\n", 761 | "dataset = load_digits()\n", 762 | "X, y = dataset.data, dataset.target == 1\n", 763 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", 764 | "\n", 765 | "# Create a two-feature input vector matching the example plot above\n", 766 | "# We jitter the points (add a small amount of random noise) in case there are areas\n", 767 | "# in feature space where many instances have the same features.\n", 768 | "jitter_delta = 0.25\n", 769 | "X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta\n", 770 | "X_twovar_test = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta\n", 771 | "\n", 772 | "clf = SVC(kernel = 'linear').fit(X_twovar_train, y_train)\n", 773 | "grid_values = {'class_weight':['balanced', {1:2},{1:3},{1:4},{1:5},{1:10},{1:20},{1:50}]}\n", 774 | "plt.figure(figsize=(9,6))\n", 775 | "for i, eval_metric in enumerate(('precision','recall', 'f1','roc_auc')):\n", 776 | " grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric)\n", 777 | " grid_clf_custom.fit(X_twovar_train, y_train)\n", 778 | " print('Grid best parameter (max. 
{0}): {1}'\n", 779 | " .format(eval_metric, grid_clf_custom.best_params_))\n", 780 | " print('Grid best score ({0}): {1}'\n", 781 | " .format(eval_metric, grid_clf_custom.best_score_))\n", 782 | " plt.subplots_adjust(wspace=0.3, hspace=0.3)\n", 783 | " plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None,\n", 784 | " None, None, plt.subplot(2, 2, i+1))\n", 785 | " \n", 786 | " plt.title(eval_metric+'-oriented SVC')\n", 787 | "plt.tight_layout()\n", 788 | "plt.show()" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "#### Precision-recall curve for the default SVC classifier (with balanced class weights)" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": { 802 | "collapsed": false, 803 | "scrolled": false 804 | }, 805 | "outputs": [], 806 | "source": [ 807 | "from sklearn.model_selection import train_test_split\n", 808 | "from sklearn.metrics import precision_recall_curve\n", 809 | "from adspy_shared_utilities import plot_class_regions_for_classifier\n", 810 | "from sklearn.svm import SVC\n", 811 | "\n", 812 | "dataset = load_digits()\n", 813 | "X, y = dataset.data, dataset.target == 1\n", 814 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n", 815 | "\n", 816 | "# create a two-feature input vector matching the example plot above\n", 817 | "jitter_delta = 0.25\n", 818 | "X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta\n", 819 | "X_twovar_test = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta\n", 820 | "\n", 821 | "clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train, y_train)\n", 822 | "\n", 823 | "y_scores = clf.decision_function(X_twovar_test)\n", 824 | "\n", 825 | "precision, recall, thresholds = precision_recall_curve(y_test, y_scores)\n", 826 | "closest_zero = np.argmin(np.abs(thresholds))\n", 827 | "closest_zero_p = precision[closest_zero]\n", 828 | "closest_zero_r = recall[closest_zero]\n", 829 | "\n", 830 | "plot_class_regions_for_classifier(clf, X_twovar_test, y_test)\n", 831 | "plt.title(\"SVC, class_weight = 'balanced', optimized for accuracy\")\n", 832 | "plt.show()\n", 833 | "\n", 834 | "plt.figure()\n", 835 | "plt.xlim([0.0, 1.01])\n", 836 | "plt.ylim([0.0, 1.01])\n", 837 | "plt.title (\"Precision-recall curve: SVC, class_weight = 'balanced'\")\n", 838 | "plt.plot(precision, recall, label = 'Precision-Recall Curve')\n", 839 | "plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3)\n", 840 | "plt.xlabel('Precision', fontsize=16)\n", 841 | "plt.ylabel('Recall', fontsize=16)\n", 842 | "plt.axes().set_aspect('equal')\n", 843 | "plt.show()\n", 844 | "print('At zero threshold, precision: {:.2f}, recall: {:.2f}'\n", 845 | " .format(closest_zero_p, closest_zero_r))" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": { 852 | "collapsed": true 853 | }, 854 | "outputs": [], 855 | "source": [] 856 | } 857 | ], 858 | "metadata": { 859 | "anaconda-cloud": {}, 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.5.2" 
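
> Editor's note on portability (addition): two idioms used throughout this Module 3 notebook fail on current library versions, though they were correct for the 2017-era versions it targets. `from sklearn.metrics.scorer import SCORERS` no longer works in newer scikit-learn releases, and in current matplotlib `plt.axes()` always creates a new, empty axes rather than returning the current one, so `plt.axes().set_aspect('equal')` no longer affects the plotted curve. A sketch of drop-in replacements, assuming reasonably recent scikit-learn (>= 1.0) and matplotlib (>= 3.0):

```python
# Hypothetical modernization, not part of the original notebook.
import matplotlib.pyplot as plt

# old: from sklearn.metrics.scorer import SCORERS; sorted(SCORERS.keys())
from sklearn.metrics import get_scorer_names   # scikit-learn >= 1.0
print(sorted(get_scorer_names()))

# old: plt.axes().set_aspect('equal')   # now adds a second, empty axes
plt.figure()
plt.plot([0, 1], [0, 1])
plt.gca().set_aspect('equal')           # act on the current axes instead
plt.show()
```
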
876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 1 880 | } 881 | -------------------------------------------------------------------------------- /Module 4 Quiz.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianhuaiyuan/Applied-machine-learning-in-python/4287c946be45dd05fe7e0d1f3f447ff86879060f/Module 4 Quiz.pdf -------------------------------------------------------------------------------- /Module 4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "source": [ 20 | "# Applied Machine Learning: Module 4 (Supervised Learning, Part II)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "## Preamble and Datasets" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false, 35 | "scrolled": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "%matplotlib notebook\n", 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "import seaborn as sn\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "\n", 45 | "from sklearn.model_selection import train_test_split\n", 46 | "from sklearn.datasets import make_classification, make_blobs\n", 47 | "from matplotlib.colors import ListedColormap\n", 48 | "from sklearn.datasets import load_breast_cancer\n", 49 | "from adspy_shared_utilities import load_crime_dataset\n", 50 | "\n", 51 | "\n", 52 | "cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])\n", 53 | "\n", 54 | "# fruits dataset\n", 55 | "fruits = pd.read_table('fruit_data_with_colors.txt')\n", 56 | "\n", 57 | "feature_names_fruits = ['height', 'width', 'mass', 'color_score']\n", 58 | "X_fruits = fruits[feature_names_fruits]\n", 59 | "y_fruits = fruits['fruit_label']\n", 60 | "target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']\n", 61 | "\n", 62 | "X_fruits_2d = fruits[['height', 'width']]\n", 63 | "y_fruits_2d = fruits['fruit_label']\n", 64 | "\n", 65 | "# synthetic dataset for simple regression\n", 66 | "from sklearn.datasets import make_regression\n", 67 | "plt.figure()\n", 68 | "plt.title('Sample regression problem with one input variable')\n", 69 | "X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,\n", 70 | " n_informative=1, bias = 150.0,\n", 71 | " noise = 30, random_state=0)\n", 72 | "plt.scatter(X_R1, y_R1, marker= 'o', s=50)\n", 73 | "plt.show()\n", 74 | "\n", 75 | "# synthetic dataset for more complex regression\n", 76 | "from sklearn.datasets import make_friedman1\n", 77 | "plt.figure()\n", 78 | "plt.title('Complex regression problem with one input variable')\n", 79 | "X_F1, y_F1 = make_friedman1(n_samples = 100, n_features = 7,\n", 80 | " random_state=0)\n", 81 | "\n", 82 | "plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)\n", 83 | "plt.show()\n", 84 | "\n", 85 | "# synthetic dataset for classification (binary)\n", 86 | "plt.figure()\n", 87 | "plt.title('Sample binary 
classification problem with two informative features')\n", 88 | "X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,\n", 89 | " n_redundant=0, n_informative=2,\n", 90 | " n_clusters_per_class=1, flip_y = 0.1,\n", 91 | " class_sep = 0.5, random_state=0)\n", 92 | "plt.scatter(X_C2[:, 0], X_C2[:, 1], marker= 'o',\n", 93 | " c=y_C2, s=50, cmap=cmap_bold)\n", 94 | "plt.show()\n", 95 | "\n", 96 | "# more difficult synthetic dataset for classification (binary)\n", 97 | "# with classes that are not linearly separable\n", 98 | "X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2,\n", 99 | " centers = 8, cluster_std = 1.3,\n", 100 | " random_state = 4)\n", 101 | "y_D2 = y_D2 % 2\n", 102 | "plt.figure()\n", 103 | "plt.title('Sample binary classification problem with non-linearly separable classes')\n", 104 | "plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,\n", 105 | " marker= 'o', s=50, cmap=cmap_bold)\n", 106 | "plt.show()\n", 107 | "\n", 108 | "# Breast cancer dataset for classification\n", 109 | "cancer = load_breast_cancer()\n", 110 | "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n", 111 | "\n", 112 | "# Communities and Crime dataset\n", 113 | "(X_crime, y_crime) = load_crime_dataset()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Naive Bayes classifiers" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "from sklearn.naive_bayes import GaussianNB\n", 132 | "from adspy_shared_utilities import plot_class_regions_for_classifier\n", 133 | "\n", 134 | "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)\n", 135 | "\n", 136 | "nbclf = GaussianNB().fit(X_train, y_train)\n", 137 | "plot_class_regions_for_classifier(nbclf, X_train, y_train, X_test, y_test,\n", 138 | " 'Gaussian Naive Bayes classifier: Dataset 1')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2,\n", 150 | " random_state=0)\n", 151 | "\n", 152 | "nbclf = GaussianNB().fit(X_train, y_train)\n", 153 | "plot_class_regions_for_classifier(nbclf, X_train, y_train, X_test, y_test,\n", 154 | " 'Gaussian Naive Bayes classifier: Dataset 2')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Application to a real-world dataset" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 173 | "\n", 174 | "nbclf = GaussianNB().fit(X_train, y_train)\n", 175 | "print('Breast cancer dataset')\n", 176 | "print('Accuracy of GaussianNB classifier on training set: {:.2f}'\n", 177 | " .format(nbclf.score(X_train, y_train)))\n", 178 | "print('Accuracy of GaussianNB classifier on test set: {:.2f}'\n", 179 | " .format(nbclf.score(X_test, y_test)))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## Ensembles of Decision Trees" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Random forests" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 
| "execution_count": null, 199 | "metadata": { 200 | "collapsed": false, 201 | "scrolled": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "from sklearn.ensemble import RandomForestClassifier\n", 206 | "from sklearn.model_selection import train_test_split\n", 207 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 208 | "\n", 209 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2,\n", 210 | " random_state = 0)\n", 211 | "fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))\n", 212 | "\n", 213 | "clf = RandomForestClassifier().fit(X_train, y_train)\n", 214 | "title = 'Random Forest Classifier, complex binary dataset, default settings'\n", 215 | "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test,\n", 216 | " y_test, title, subaxes)\n", 217 | "\n", 218 | "plt.show()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "### Random forest: Fruit dataset" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": { 232 | "collapsed": false, 233 | "scrolled": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from sklearn.ensemble import RandomForestClassifier\n", 238 | "from sklearn.model_selection import train_test_split\n", 239 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 240 | "\n", 241 | "X_train, X_test, y_train, y_test = train_test_split(X_fruits.as_matrix(),\n", 242 | " y_fruits.as_matrix(),\n", 243 | " random_state = 0)\n", 244 | "fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))\n", 245 | "\n", 246 | "title = 'Random Forest, fruits dataset, default settings'\n", 247 | "pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]\n", 248 | "\n", 249 | "for pair, axis in zip(pair_list, subaxes):\n", 250 | " X = X_train[:, pair]\n", 251 | " y = y_train\n", 252 | " \n", 253 | " clf = RandomForestClassifier().fit(X, y)\n", 254 | " plot_class_regions_for_classifier_subplot(clf, X, y, None,\n", 255 | " None, title, axis,\n", 256 | " target_names_fruits)\n", 257 | " \n", 258 | " axis.set_xlabel(feature_names_fruits[pair[0]])\n", 259 | " axis.set_ylabel(feature_names_fruits[pair[1]])\n", 260 | " \n", 261 | "plt.tight_layout()\n", 262 | "plt.show()\n", 263 | "\n", 264 | "clf = RandomForestClassifier(n_estimators = 10,\n", 265 | " random_state=0).fit(X_train, y_train)\n", 266 | "\n", 267 | "print('Random Forest, Fruit dataset, default settings')\n", 268 | "print('Accuracy of RF classifier on training set: {:.2f}'\n", 269 | " .format(clf.score(X_train, y_train)))\n", 270 | "print('Accuracy of RF classifier on test set: {:.2f}'\n", 271 | " .format(clf.score(X_test, y_test)))" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "#### Random Forests on a real-world dataset" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": false 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "from sklearn.ensemble import RandomForestClassifier\n", 290 | "\n", 291 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 292 | "\n", 293 | "clf = RandomForestClassifier(max_features = 8, random_state = 0)\n", 294 | "clf.fit(X_train, y_train)\n", 295 | "\n", 296 | "print('Breast cancer dataset')\n", 297 | "print('Accuracy of RF classifier on training set: {:.2f}'\n", 298 | " .format(clf.score(X_train, y_train)))\n", 299 | "print('Accuracy of RF 
classifier on test set: {:.2f}'\n", 300 | " .format(clf.score(X_test, y_test)))" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### Gradient-boosted decision trees" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "from sklearn.ensemble import GradientBoostingClassifier\n", 319 | "from sklearn.model_selection import train_test_split\n", 320 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 321 | "\n", 322 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n", 323 | "fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))\n", 324 | "\n", 325 | "clf = GradientBoostingClassifier().fit(X_train, y_train)\n", 326 | "title = 'GBDT, complex binary dataset, default settings'\n", 327 | "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test,\n", 328 | " y_test, title, subaxes)\n", 329 | "\n", 330 | "plt.show()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "#### Gradient boosted decision trees on the fruit dataset" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false, 345 | "scrolled": false 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "X_train, X_test, y_train, y_test = train_test_split(X_fruits.as_matrix(),\n", 350 | " y_fruits.as_matrix(),\n", 351 | " random_state = 0)\n", 352 | "fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))\n", 353 | "\n", 354 | "pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]\n", 355 | "\n", 356 | "for pair, axis in zip(pair_list, subaxes):\n", 357 | " X = X_train[:, pair]\n", 358 | " y = y_train\n", 359 | " \n", 360 | " clf = GradientBoostingClassifier().fit(X, y)\n", 361 | " plot_class_regions_for_classifier_subplot(clf, X, y, None,\n", 362 | " None, title, axis,\n", 363 | " target_names_fruits)\n", 364 | " \n", 365 | " axis.set_xlabel(feature_names_fruits[pair[0]])\n", 366 | " axis.set_ylabel(feature_names_fruits[pair[1]])\n", 367 | " \n", 368 | "plt.tight_layout()\n", 369 | "plt.show()\n", 370 | "clf = GradientBoostingClassifier().fit(X_train, y_train)\n", 371 | "\n", 372 | "print('GBDT, Fruit dataset, default settings')\n", 373 | "print('Accuracy of GBDT classifier on training set: {:.2f}'\n", 374 | " .format(clf.score(X_train, y_train)))\n", 375 | "print('Accuracy of GBDT classifier on test set: {:.2f}'\n", 376 | " .format(clf.score(X_test, y_test)))" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "#### Gradient-boosted decision trees on a real-world dataset" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "from sklearn.ensemble import GradientBoostingClassifier\n", 395 | "\n", 396 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 397 | "\n", 398 | "clf = GradientBoostingClassifier(random_state = 0)\n", 399 | "clf.fit(X_train, y_train)\n", 400 | "\n", 401 | "print('Breast cancer dataset (learning_rate=0.1, max_depth=3)')\n", 402 | "print('Accuracy of GBDT classifier on training set: {:.2f}'\n", 403 | " .format(clf.score(X_train, y_train)))\n", 404 | "print('Accuracy of GBDT classifier on test set: {:.2f}\\n'\n", 405 | " 
.format(clf.score(X_test, y_test)))\n", 406 | "\n", 407 | "clf = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 2, random_state = 0)\n", 408 | "clf.fit(X_train, y_train)\n", 409 | "\n", 410 | "print('Breast cancer dataset (learning_rate=0.01, max_depth=2)')\n", 411 | "print('Accuracy of GBDT classifier on training set: {:.2f}'\n", 412 | " .format(clf.score(X_train, y_train)))\n", 413 | "print('Accuracy of GBDT classifier on test set: {:.2f}'\n", 414 | " .format(clf.score(X_test, y_test)))" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "## Neural networks" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "#### Activation functions" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "xrange = np.linspace(-2, 2, 200)\n", 440 | "\n", 441 | "plt.figure(figsize=(7,6))\n", 442 | "\n", 443 | "plt.plot(xrange, np.maximum(xrange, 0), label = 'relu')\n", 444 | "plt.plot(xrange, np.tanh(xrange), label = 'tanh')\n", 445 | "plt.plot(xrange, 1 / (1 + np.exp(-xrange)), label = 'logistic')\n", 446 | "plt.legend()\n", 447 | "plt.title('Neural network activation functions')\n", 448 | "plt.xlabel('Input value (x)')\n", 449 | "plt.ylabel('Activation function output')\n", 450 | "\n", 451 | "plt.show()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### Neural networks: Classification" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "#### Synthetic dataset 1: single hidden layer" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": { 472 | "collapsed": false, 473 | "scrolled": false 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "from sklearn.neural_network import MLPClassifier\n", 478 | "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n", 479 | "\n", 480 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n", 481 | "\n", 482 | "fig, subaxes = plt.subplots(3, 1, figsize=(6,18))\n", 483 | "\n", 484 | "for units, axis in zip([1, 10, 100], subaxes):\n", 485 | " nnclf = MLPClassifier(hidden_layer_sizes = [units], solver='lbfgs',\n", 486 | " random_state = 0).fit(X_train, y_train)\n", 487 | " \n", 488 | " title = 'Dataset 1: Neural net classifier, 1 layer, {} units'.format(units)\n", 489 | " \n", 490 | " plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,\n", 491 | " X_test, y_test, title, axis)\n", 492 | " plt.tight_layout()" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "#### Synthetic dataset 1: two hidden layers" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": { 506 | "collapsed": false 507 | }, 508 | "outputs": [], 509 | "source": [ 510 | "from adspy_shared_utilities import plot_class_regions_for_classifier\n", 511 | "\n", 512 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n", 513 | "\n", 514 | "nnclf = MLPClassifier(hidden_layer_sizes = [10, 10], solver='lbfgs',\n", 515 | " random_state = 0).fit(X_train, y_train)\n", 516 | "\n", 517 | "plot_class_regions_for_classifier(nnclf, X_train, y_train, X_test, y_test,\n", 518 | " 'Dataset 1: Neural net classifier, 2 layers, 10/10 units')" 519 
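
> Editor's sketch (addition, not original course code): `hidden_layer_sizes = [10, 10]` means two hidden layers of 10 units each, and inspecting the fitted weight matrices confirms the architecture. This snippet rebuilds the same synthetic blobs data as the preamble so it runs standalone:

```python
from sklearn.datasets import make_blobs
from sklearn.neural_network import MLPClassifier

# Same non-linearly-separable binary dataset as X_D2/y_D2 in the preamble
X, y = make_blobs(n_samples=100, n_features=2, centers=8,
                  cluster_std=1.3, random_state=4)
y = y % 2

nn = MLPClassifier(hidden_layer_sizes=[10, 10], solver='lbfgs',
                   random_state=0).fit(X, y)

# Expected shapes: (2, 10) input->hidden1, (10, 10) hidden1->hidden2,
# (10, 1) hidden2->output (one unit for binary classification).
print([w.shape for w in nn.coefs_])
```
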
| ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "#### Regularization parameter: alpha" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": { 532 | "collapsed": false, 533 | "scrolled": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n", 538 | "\n", 539 | "fig, subaxes = plt.subplots(4, 1, figsize=(6, 23))\n", 540 | "\n", 541 | "for this_alpha, axis in zip([0.01, 0.1, 1.0, 5.0], subaxes):\n", 542 | " nnclf = MLPClassifier(solver='lbfgs', activation = 'tanh',\n", 543 | " alpha = this_alpha,\n", 544 | " hidden_layer_sizes = [100, 100],\n", 545 | " random_state = 0).fit(X_train, y_train)\n", 546 | " \n", 547 | " title = 'Dataset 2: NN classifier, alpha = {:.3f} '.format(this_alpha)\n", 548 | " \n", 549 | " plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,\n", 550 | " X_test, y_test, title, axis)\n", 551 | " plt.tight_layout()\n", 552 | " " 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "#### The effect of different choices of activation function" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": { 566 | "collapsed": false, 567 | "scrolled": false 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n", 572 | "\n", 573 | "fig, subaxes = plt.subplots(3, 1, figsize=(6,18))\n", 574 | "\n", 575 | "for this_activation, axis in zip(['logistic', 'tanh', 'relu'], subaxes):\n", 576 | " nnclf = MLPClassifier(solver='lbfgs', activation = this_activation,\n", 577 | " alpha = 0.1, hidden_layer_sizes = [10, 10],\n", 578 | " random_state = 0).fit(X_train, y_train)\n", 579 | " \n", 580 | " title = 'Dataset 2: NN classifier, 2 layers 10/10, {} \\\n", 581 | "activation function'.format(this_activation)\n", 582 | " \n", 583 | " plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,\n", 584 | " X_test, y_test, title, axis)\n", 585 | " plt.tight_layout()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "### Neural networks: Regression" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": null, 598 | "metadata": { 599 | "collapsed": false 600 | }, 601 | "outputs": [], 602 | "source": [ 603 | "from sklearn.neural_network import MLPRegressor\n", 604 | "\n", 605 | "fig, subaxes = plt.subplots(2, 3, figsize=(11,8), dpi=70)\n", 606 | "\n", 607 | "X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)\n", 608 | "\n", 609 | "X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)\n", 610 | "\n", 611 | "for thisaxisrow, thisactivation in zip(subaxes, ['tanh', 'relu']):\n", 612 | " for thisalpha, thisaxis in zip([0.0001, 1.0, 100], thisaxisrow):\n", 613 | " mlpreg = MLPRegressor(hidden_layer_sizes = [100,100],\n", 614 | " activation = thisactivation,\n", 615 | " alpha = thisalpha,\n", 616 | " solver = 'lbfgs').fit(X_train, y_train)\n", 617 | " y_predict_output = mlpreg.predict(X_predict_input)\n", 618 | " thisaxis.set_xlim([-2.5, 0.75])\n", 619 | " thisaxis.plot(X_predict_input, y_predict_output,\n", 620 | " '^', markersize = 10)\n", 621 | " thisaxis.plot(X_train, y_train, 'o')\n", 622 | " thisaxis.set_xlabel('Input feature')\n", 623 | " thisaxis.set_ylabel('Target value')\n", 624 | " thisaxis.set_title('MLP 
regression\\nalpha={}, activation={})'\n", 625 | " .format(thisalpha, thisactivation))\n", 626 | " plt.tight_layout()" 627 | ] 628 | }, 629 | { 630 | "cell_type": "markdown", 631 | "metadata": {}, 632 | "source": [ 633 | "#### Application to real-world dataset for classification" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": { 640 | "collapsed": false 641 | }, 642 | "outputs": [], 643 | "source": [ 644 | "from sklearn.neural_network import MLPClassifier\n", 645 | "from sklearn.preprocessing import MinMaxScaler\n", 646 | "\n", 647 | "\n", 648 | "scaler = MinMaxScaler()\n", 649 | "\n", 650 | "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n", 651 | "X_train_scaled = scaler.fit_transform(X_train)\n", 652 | "X_test_scaled = scaler.transform(X_test)\n", 653 | "\n", 654 | "clf = MLPClassifier(hidden_layer_sizes = [100, 100], alpha = 5.0,\n", 655 | " random_state = 0, solver='lbfgs').fit(X_train_scaled, y_train)\n", 656 | "\n", 657 | "print('Breast cancer dataset')\n", 658 | "print('Accuracy of NN classifier on training set: {:.2f}'\n", 659 | " .format(clf.score(X_train_scaled, y_train)))\n", 660 | "print('Accuracy of NN classifier on test set: {:.2f}'\n", 661 | " .format(clf.score(X_test_scaled, y_test)))" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 1, 667 | "metadata": { 668 | "collapsed": false 669 | }, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "./addresses.csv\r\n", 676 | "./train.csv\r\n", 677 | "./Module 2.ipynb\r\n", 678 | "./Assignment 3.ipynb\r\n", 679 | "./Module 4.ipynb\r\n", 680 | "./Assignment 1.ipynb\r\n", 681 | "./test.csv\r\n", 682 | "./CommViolPredUnnormalizedData.txt\r\n", 683 | "./adspy_shared_utilities.py\r\n", 684 | "./Module 3.ipynb\r\n", 685 | "./fraud_data.csv\r\n", 686 | "./fruit_data_with_colors.txt\r\n", 687 | "./Assignment 4.ipynb\r\n", 688 | "./Assignment 2.ipynb\r\n", 689 | "./mushrooms.csv\r\n", 690 | "./Classifier Visualization.ipynb\r\n", 691 | "./latlons.csv\r\n", 692 | "./Module 1.ipynb\r\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "!find . 
-maxdepth 1 -not -type d" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 7, 703 | "metadata": { 704 | "collapsed": false 705 | }, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "addresses.csv adspy_temp.dot polynomialreg1.png train.csv\r\n" 712 | ] 713 | } 714 | ], 715 | "source": [ 716 | "!ls readonly" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 8, 722 | "metadata": { 723 | "collapsed": false 724 | }, 725 | "outputs": [ 726 | { 727 | "name": "stdout", 728 | "output_type": "stream", 729 | "text": [ 730 | "cp: target ‘2.ipynb’ is not a directory\r\n" 731 | ] 732 | } 733 | ], 734 | "source": [ 735 | "!cp ./Module 2.ipynb readonly/Module 2.ipynb" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": { 742 | "collapsed": true 743 | }, 744 | "outputs": [], 745 | "source": [] 746 | } 747 | ], 748 | "metadata": { 749 | "anaconda-cloud": {}, 750 | "kernelspec": { 751 | "display_name": "Python 3", 752 | "language": "python", 753 | "name": "python3" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.5.2" 766 | } 767 | }, 768 | "nbformat": 4, 769 | "nbformat_minor": 2 770 | } 771 | -------------------------------------------------------------------------------- /polynomialreg1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianhuaiyuan/Applied-machine-learning-in-python/4287c946be45dd05fe7e0d1f3f447ff86879060f/polynomialreg1.png -------------------------------------------------------------------------------- /week1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianhuaiyuan/Applied-machine-learning-in-python/4287c946be45dd05fe7e0d1f3f447ff86879060f/week1.pdf -------------------------------------------------------------------------------- /week2_Assignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.2** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Assignment 2\n", 19 | "\n", 20 | "In this assignment you'll explore the relationship between model complexity and generalization performance, by adjusting key parameters of various supervised learning models. Part 1 of this assignment will look at regression and Part 2 will look at classification.\n", 21 | "\n", 22 | "## Part 1 - Regression" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "First, run the following block to set up the variables needed for later sections." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": { 36 | "collapsed": false, 37 | "scrolled": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import numpy as np\n", 42 | "import pandas as pd\n", 43 | "import matplotlib.pyplot as plt # needed by the plotting helpers part1_scatter() and plot_one()\n", 44 | "from sklearn.model_selection import train_test_split\n", 45 | "\n", 46 | "\n", 47 | "np.random.seed(0)\n", 48 | "n = 15\n", 49 | "x = np.linspace(0,10,n) + np.random.randn(n)/5\n", 50 | "y = np.sin(x)+x/6 + np.random.randn(n)/10\n", 51 | "\n", 52 | "# print(y)\n", 53 | "\n", 54 | "X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)\n", 55 | "\n", 56 | "# You can use this function to help you visualize the dataset by\n", 57 | "# plotting a scatterplot of the data points\n", 58 | "# in the training and test sets.\n", 59 | "def part1_scatter():\n", 60 | " %matplotlib notebook\n", 61 | " plt.figure()\n", 62 | " plt.scatter(X_train, y_train, label='training data')\n", 63 | " plt.scatter(X_test, y_test, label='test data')\n", 64 | " plt.legend(loc=4);\n", 65 | " \n", 66 | " \n", 67 | "# NOTE: Uncomment the function below to visualize the data, but be sure \n", 68 | "# to **re-comment it before submitting this assignment to the autograder**. \n", 69 | "# part1_scatter()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Question 1\n", 77 | "\n", 78 | "Write a function that fits a polynomial LinearRegression model on the *training data* `X_train` for degrees 1, 3, 6, and 9. (Use PolynomialFeatures in sklearn.preprocessing to create the polynomial features and then fit a linear regression model.) For each model, find 100 predicted values over the interval x = 0 to 10 (e.g. `np.linspace(0,10,100)`) and store this in a numpy array. The first row of this array should correspond to the output from the model trained on degree 1, the second row degree 3, the third row degree 6, and the fourth row degree 9.\n", 79 | "\n", 80 | "![fitted polynomial models for degrees 1, 3, 6 and 9](polynomialreg1.png)\n", 81 | "\n", 82 | "The figure above shows the fitted models plotted on top of the original data (using `plot_one()`).\n", 83 | "\n", 84 | "
\n", 85 | "*This function should return a numpy array with shape `(4, 100)`*" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 2, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "array([[ 2.53040195e-01, 2.69201547e-01, 2.85362899e-01,\n", 99 | " 3.01524251e-01, 3.17685603e-01, 3.33846955e-01,\n", 100 | " 3.50008306e-01, 3.66169658e-01, 3.82331010e-01,\n", 101 | " 3.98492362e-01, 4.14653714e-01, 4.30815066e-01,\n", 102 | " 4.46976417e-01, 4.63137769e-01, 4.79299121e-01,\n", 103 | " 4.95460473e-01, 5.11621825e-01, 5.27783177e-01,\n", 104 | " 5.43944529e-01, 5.60105880e-01, 5.76267232e-01,\n", 105 | " 5.92428584e-01, 6.08589936e-01, 6.24751288e-01,\n", 106 | " 6.40912640e-01, 6.57073992e-01, 6.73235343e-01,\n", 107 | " 6.89396695e-01, 7.05558047e-01, 7.21719399e-01,\n", 108 | " 7.37880751e-01, 7.54042103e-01, 7.70203454e-01,\n", 109 | " 7.86364806e-01, 8.02526158e-01, 8.18687510e-01,\n", 110 | " 8.34848862e-01, 8.51010214e-01, 8.67171566e-01,\n", 111 | " 8.83332917e-01, 8.99494269e-01, 9.15655621e-01,\n", 112 | " 9.31816973e-01, 9.47978325e-01, 9.64139677e-01,\n", 113 | " 9.80301028e-01, 9.96462380e-01, 1.01262373e+00,\n", 114 | " 1.02878508e+00, 1.04494644e+00, 1.06110779e+00,\n", 115 | " 1.07726914e+00, 1.09343049e+00, 1.10959184e+00,\n", 116 | " 1.12575320e+00, 1.14191455e+00, 1.15807590e+00,\n", 117 | " 1.17423725e+00, 1.19039860e+00, 1.20655995e+00,\n", 118 | " 1.22272131e+00, 1.23888266e+00, 1.25504401e+00,\n", 119 | " 1.27120536e+00, 1.28736671e+00, 1.30352807e+00,\n", 120 | " 1.31968942e+00, 1.33585077e+00, 1.35201212e+00,\n", 121 | " 1.36817347e+00, 1.38433482e+00, 1.40049618e+00,\n", 122 | " 1.41665753e+00, 1.43281888e+00, 1.44898023e+00,\n", 123 | " 1.46514158e+00, 1.48130294e+00, 1.49746429e+00,\n", 124 | " 1.51362564e+00, 1.52978699e+00, 1.54594834e+00,\n", 125 | " 1.56210969e+00, 1.57827105e+00, 1.59443240e+00,\n", 126 | " 1.61059375e+00, 1.62675510e+00, 1.64291645e+00,\n", 127 | " 1.65907781e+00, 1.67523916e+00, 1.69140051e+00,\n", 128 | " 1.70756186e+00, 1.72372321e+00, 1.73988457e+00,\n", 129 | " 1.75604592e+00, 1.77220727e+00, 1.78836862e+00,\n", 130 | " 1.80452997e+00, 1.82069132e+00, 1.83685268e+00,\n", 131 | " 1.85301403e+00],\n", 132 | " [ 1.22989539e+00, 1.15143628e+00, 1.07722393e+00,\n", 133 | " 1.00717881e+00, 9.41221419e-01, 8.79272234e-01,\n", 134 | " 8.21251741e-01, 7.67080426e-01, 7.16678772e-01,\n", 135 | " 6.69967266e-01, 6.26866391e-01, 5.87296632e-01,\n", 136 | " 5.51178474e-01, 5.18432402e-01, 4.88978901e-01,\n", 137 | " 4.62738455e-01, 4.39631549e-01, 4.19578668e-01,\n", 138 | " 4.02500297e-01, 3.88316920e-01, 3.76949022e-01,\n", 139 | " 3.68317088e-01, 3.62341603e-01, 3.58943051e-01,\n", 140 | " 3.58041918e-01, 3.59558687e-01, 3.63413845e-01,\n", 141 | " 3.69527874e-01, 3.77821261e-01, 3.88214491e-01,\n", 142 | " 4.00628046e-01, 4.14982414e-01, 4.31198078e-01,\n", 143 | " 4.49195522e-01, 4.68895233e-01, 4.90217694e-01,\n", 144 | " 5.13083391e-01, 5.37412808e-01, 5.63126429e-01,\n", 145 | " 5.90144741e-01, 6.18388226e-01, 6.47777371e-01,\n", 146 | " 6.78232660e-01, 7.09674578e-01, 7.42023609e-01,\n", 147 | " 7.75200238e-01, 8.09124950e-01, 8.43718230e-01,\n", 148 | " 8.78900563e-01, 9.14592432e-01, 9.50714324e-01,\n", 149 | " 9.87186723e-01, 1.02393011e+00, 1.06086498e+00,\n", 150 | " 1.09791181e+00, 1.13499108e+00, 1.17202328e+00,\n", 151 | " 1.20892890e+00, 1.24562842e+00, 1.28204233e+00,\n", 152 | " 1.31809110e+00, 1.35369523e+00, 1.38877520e+00,\n", 153 | " 
1.42325149e+00, 1.45704459e+00, 1.49007498e+00,\n", 154 | " 1.52226316e+00, 1.55352959e+00, 1.58379478e+00,\n", 155 | " 1.61297919e+00, 1.64100332e+00, 1.66778766e+00,\n", 156 | " 1.69325268e+00, 1.71731887e+00, 1.73990672e+00,\n", 157 | " 1.76093671e+00, 1.78032933e+00, 1.79800506e+00,\n", 158 | " 1.81388438e+00, 1.82788778e+00, 1.83993575e+00,\n", 159 | " 1.84994877e+00, 1.85784732e+00, 1.86355189e+00,\n", 160 | " 1.86698296e+00, 1.86806103e+00, 1.86670656e+00,\n", 161 | " 1.86284006e+00, 1.85638200e+00, 1.84725286e+00,\n", 162 | " 1.83537314e+00, 1.82066332e+00, 1.80304388e+00,\n", 163 | " 1.78243530e+00, 1.75875808e+00, 1.73193269e+00,\n", 164 | " 1.70187963e+00, 1.66851936e+00, 1.63177240e+00,\n", 165 | " 1.59155920e+00],\n", 166 | " [ -1.99554310e-01, -3.95192729e-03, 1.79851752e-01,\n", 167 | " 3.51005136e-01, 5.08831706e-01, 6.52819233e-01,\n", 168 | " 7.82609240e-01, 8.97986721e-01, 9.98870117e-01,\n", 169 | " 1.08530155e+00, 1.15743729e+00, 1.21553852e+00,\n", 170 | " 1.25996233e+00, 1.29115292e+00, 1.30963316e+00,\n", 171 | " 1.31599632e+00, 1.31089811e+00, 1.29504889e+00,\n", 172 | " 1.26920626e+00, 1.23416782e+00, 1.19076415e+00,\n", 173 | " 1.13985218e+00, 1.08230867e+00, 1.01902405e+00,\n", 174 | " 9.50896441e-01, 8.78825970e-01, 8.03709344e-01,\n", 175 | " 7.26434655e-01, 6.47876457e-01, 5.68891088e-01,\n", 176 | " 4.90312256e-01, 4.12946874e-01, 3.37571147e-01,\n", 177 | " 2.64926923e-01, 1.95718291e-01, 1.30608438e-01,\n", 178 | " 7.02167560e-02, 1.51162118e-02, -3.41690366e-02,\n", 179 | " -7.71657636e-02, -1.13453547e-01, -1.42666382e-01,\n", 180 | " -1.64494044e-01, -1.78683194e-01, -1.85038228e-01,\n", 181 | " -1.83421873e-01, -1.73755533e-01, -1.56019368e-01,\n", 182 | " -1.30252132e-01, -9.65507463e-02, -5.50696232e-02,\n", 183 | " -6.01973203e-03, 5.03325882e-02, 1.13667071e-01,\n", 184 | " 1.83611221e-01, 2.59742264e-01, 3.41589357e-01,\n", 185 | " 4.28636046e-01, 5.20322987e-01, 6.16050916e-01,\n", 186 | " 7.15183874e-01, 8.17052690e-01, 9.20958717e-01,\n", 187 | " 1.02617782e+00, 1.13196463e+00, 1.23755703e+00,\n", 188 | " 1.34218093e+00, 1.44505526e+00, 1.54539723e+00,\n", 189 | " 1.64242789e+00, 1.73537785e+00, 1.82349336e+00,\n", 190 | " 1.90604254e+00, 1.98232198e+00, 2.05166348e+00,\n", 191 | " 2.11344114e+00, 2.16707864e+00, 2.21205680e+00,\n", 192 | " 2.24792141e+00, 2.27429129e+00, 2.29086658e+00,\n", 193 | " 2.29743739e+00, 2.29389257e+00, 2.28022881e+00,\n", 194 | " 2.25656001e+00, 2.22312684e+00, 2.18030664e+00,\n", 195 | " 2.12862347e+00, 2.06875850e+00, 2.00156065e+00,\n", 196 | " 1.92805743e+00, 1.84946605e+00, 1.76720485e+00,\n", 197 | " 1.68290491e+00, 1.59842194e+00, 1.51584842e+00,\n", 198 | " 1.43752602e+00, 1.36605824e+00, 1.30432333e+00,\n", 199 | " 1.25548743e+00],\n", 200 | " [ 6.79502315e+00, 4.14319965e+00, 2.23123316e+00,\n", 201 | " 9.10495361e-01, 5.49800913e-02, -4.41344739e-01,\n", 202 | " -6.66950747e-01, -6.94943197e-01, -5.85049920e-01,\n", 203 | " -3.85418713e-01, -1.34236348e-01, 1.38818291e-01,\n", 204 | " 4.11274949e-01, 6.66715203e-01, 8.93747233e-01,\n", 205 | " 1.08510180e+00, 1.23683958e+00, 1.34766048e+00,\n", 206 | " 1.41830612e+00, 1.45104704e+00, 1.44924675e+00,\n", 207 | " 1.41699514e+00, 1.35880424e+00, 1.27935965e+00,\n", 208 | " 1.18332161e+00, 1.07516974e+00, 9.59086200e-01,\n", 209 | " 8.38872245e-01, 7.17893445e-01, 5.99049383e-01,\n", 210 | " 4.84763838e-01, 3.76991851e-01, 2.77240389e-01,\n", 211 | " 1.86599613e-01, 1.05782065e-01, 3.51673710e-02,\n", 212 | " -2.51496888e-02, -7.53096020e-02, 
-1.15638682e-01,\n", 213 | " -1.46601154e-01, -1.68753939e-01, -1.82705104e-01,\n", 214 | " -1.89076735e-01, -1.88472830e-01, -1.81452582e-01,\n", 215 | " -1.68509336e-01, -1.50055280e-01, -1.26411838e-01,\n", 216 | " -9.78055951e-02, -6.43694666e-02, -2.61487236e-02,\n", 217 | " 1.68885955e-02, 6.48374453e-02, 1.17838320e-01,\n", 218 | " 1.76057261e-01, 2.39664033e-01, 3.08809213e-01,\n", 219 | " 3.83600954e-01, 4.64082174e-01, 5.50208937e-01,\n", 220 | " 6.41830759e-01, 7.38673538e-01, 8.40325779e-01,\n", 221 | " 9.46228700e-01, 1.05567078e+00, 1.16778720e+00,\n", 222 | " 1.28156451e+00, 1.39585081e+00, 1.50937164e+00,\n", 223 | " 1.62075148e+00, 1.72854081e+00, 1.83124847e+00,\n", 224 | " 1.92737884e+00, 2.01547318e+00, 2.09415447e+00,\n", 225 | " 2.16217455e+00, 2.21846249e+00, 2.26217266e+00,\n", 226 | " 2.29273088e+00, 2.30987664e+00, 2.31369924e+00,\n", 227 | " 2.30466539e+00, 2.28363552e+00, 2.25186572e+00,\n", 228 | " 2.21099192e+00, 2.16299272e+00, 2.11012682e+00,\n", 229 | " 2.05484055e+00, 1.99964106e+00, 1.94692977e+00,\n", 230 | " 1.89879084e+00, 1.85672866e+00, 1.82134809e+00,\n", 231 | " 1.79197090e+00, 1.76618106e+00, 1.73929145e+00,\n", 232 | " 1.70372403e+00, 1.64829476e+00, 1.55739451e+00,\n", 233 | " 1.41005645e+00]])" 234 | ] 235 | }, 236 | "execution_count": 2, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "def answer_one():\n", 243 | " from sklearn.linear_model import LinearRegression\n", 244 | " from sklearn.preprocessing import PolynomialFeatures\n", 245 | "\n", 246 | " # Your code here\n", 247 | "# print('\\nNow we transform the original input data to add\\n\\\n", 248 | "# polynomial features up to degree 2 (quadratic)\\n')\n", 249 | "\n", 250 | " results = np.zeros([4,100])\n", 251 | "\n", 252 | " X_predict = np.linspace(0,10,100).reshape(-1,1)\n", 253 | " \n", 254 | "# print(y_train)\n", 255 | "\n", 256 | " count = 0\n", 257 | " \n", 258 | " for i in [1,3,6,9]:\n", 259 | " \n", 260 | " poly = PolynomialFeatures(degree=i)\n", 261 | "\n", 262 | " X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))\n", 263 | " \n", 264 | " X_predict_poly = poly.fit_transform(X_predict)\n", 265 | "\n", 266 | " linreg = LinearRegression().fit(X_train_poly, y_train)\n", 267 | "\n", 268 | " tmp_ans = linreg.predict(X_predict_poly).reshape(1,-1)\n", 269 | " \n", 270 | " results[count,:] = tmp_ans\n", 271 | " \n", 272 | " count = count + 1\n", 273 | "\n", 274 | " \n", 275 | " answer = results \n", 276 | " \n", 277 | "# return\n", 278 | " return answer\n", 279 | "\n", 280 | "answer_one()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 3, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "# feel free to use the function plot_one() to replicate the figure \n", 292 | "# from the prompt once you have completed question one\n", 293 | "def plot_one(degree_predictions):\n", 294 | " plt.figure(figsize=(10,5))\n", 295 | " plt.plot(X_train, y_train, 'o', label='training data', markersize=10)\n", 296 | " plt.plot(X_test, y_test, 'o', label='test data', markersize=10)\n", 297 | " for i,degree in enumerate([1,3,6,9]):\n", 298 | " plt.plot(np.linspace(0,10,100), degree_predictions[i], alpha=0.8, lw=2, label='degree={}'.format(degree))\n", 299 | " plt.ylim(-1,2.5)\n", 300 | " plt.legend(loc=4)\n", 301 | "\n", 302 | "# plot_one(answer_one())" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "### Question 2\n", 310 
| "\n", 311 | "Write a function that fits a polynomial LinearRegression model on the training data `X_train` for degrees 0 through 9. For each model compute the $R^2$ (coefficient of determination) regression score on the training data as well as the the test data, and return both of these arrays in a tuple.\n", 312 | "\n", 313 | "*This function should return one tuple of numpy arrays `(r2_train, r2_test)`. Both arrays should have shape `(10,)`*" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 4, 319 | "metadata": { 320 | "collapsed": false 321 | }, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "(array([ 0. , 0.42924578, 0.4510998 , 0.58719954, 0.91941945,\n", 327 | " 0.97578641, 0.99018233, 0.99352509, 0.99637545, 0.99803706]),\n", 328 | " array([-0.47808642, -0.45237104, -0.06856984, 0.00533105, 0.73004943,\n", 329 | " 0.87708301, 0.9214094 , 0.92021504, 0.63247944, -0.64525447]))" 330 | ] 331 | }, 332 | "execution_count": 4, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "def answer_two():\n", 339 | " from sklearn.linear_model import LinearRegression\n", 340 | " from sklearn.preprocessing import PolynomialFeatures\n", 341 | " from sklearn.metrics.regression import r2_score\n", 342 | "\n", 343 | " # Your code here\n", 344 | " results_train = np.zeros([10, 1])\n", 345 | " results_test = np.zeros([10, 1])\n", 346 | "\n", 347 | " \n", 348 | " for i in range(0,10):\n", 349 | " \n", 350 | " poly = PolynomialFeatures(degree=i)\n", 351 | "\n", 352 | " X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))\n", 353 | " \n", 354 | " X_test_poly = poly.fit_transform(X_test.reshape(-1, 1))\n", 355 | "\n", 356 | " linreg = LinearRegression().fit(X_train_poly, y_train)\n", 357 | "\n", 358 | " score_train = r2_score( y_train,linreg.predict(X_train_poly))\n", 359 | " \n", 360 | " score_test = r2_score( y_test,linreg.predict(X_test_poly))\n", 361 | " \n", 362 | " results_train[i] = score_train\n", 363 | " \n", 364 | " results_test[i] = score_test\n", 365 | " \n", 366 | " results_train = results_train.flatten()\n", 367 | " \n", 368 | " results_test = results_test.flatten()\n", 369 | " \n", 370 | "# print(results_train.shape)\n", 371 | "\n", 372 | "# print(results_train.shape)\n", 373 | " \n", 374 | " answer = (results_train, results_test)\n", 375 | "\n", 376 | " return (results_train, results_test)\n", 377 | "\n", 378 | "answer_two()" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "### Question 3\n", 386 | "\n", 387 | "Based on the $R^2$ scores from question 2 (degree levels 0 through 9), what degree level corresponds to a model that is underfitting? What degree level corresponds to a model that is overfitting? What choice of degree level would provide a model with good generalization performance on this dataset? 
Note: there may be multiple correct solutions to this question.\n", 388 | "\n", 389 | "(Hint: Try plotting the $R^2$ scores from question 2 to visualize the relationship between degree level and $R^2$)\n", 390 | "\n", 391 | "*This function should return one tuple with the degree values in this order: `(Underfitting, Overfitting, Good_Generalization)`*" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 5, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "def answer_three():\n", 403 | " \n", 404 | "# import matplotlib.pyplot as plt\n", 405 | " \n", 406 | "# # Your code here \n", 407 | "# (results_train, results_test) = answer_two()\n", 408 | "# %matplotlib notebook\n", 409 | "# plt.figure()\n", 410 | "# plt.figure(figsize=(10,5))\n", 411 | "# plt.plot(range(0,10,1), results_train, 'o', label='training data', markersize=10)\n", 412 | "# plt.plot(range(0,10,1), results_test, 'o', label='test data', markersize=10)\n", 413 | " \n", 414 | "# # for i,degree in enumerate([1,3,6,9]):\n", 415 | "# # plt.plot(np.linspace(0,10,100), degree_predictions[i], alpha=0.8, lw=2, label='degree={}'.format(degree))\n", 416 | "# # plt.ylim(-1,2.5)\n", 417 | "# plt.legend()\n", 418 | " return (0,9,6)\n", 419 | "\n", 420 | "# answer_three()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "### Question 4\n", 428 | "\n", 429 | "Training models on high degree polynomial features can result in overly complex models that overfit, so we often use regularized versions of the model to constrain model complexity, as we saw with Ridge and Lasso linear regression.\n", 430 | "\n", 431 | "For this question, train two models: a non-regularized LinearRegression model (default parameters) and a regularized Lasso Regression model (with parameters `alpha=0.01`, `max_iter=10000`) on polynomial features of degree 12. Return the $R^2$ score for both the LinearRegression and Lasso model's test sets.\n", 432 | "\n", 433 | "*This function should return one tuple `(LinearRegression_R2_test_score, Lasso_R2_test_score)`*" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 6, 439 | "metadata": { 440 | "collapsed": false 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "def answer_four():\n", 445 | " from sklearn.preprocessing import PolynomialFeatures\n", 446 | " from sklearn.linear_model import Lasso, LinearRegression\n", 447 | " from sklearn.metrics.regression import r2_score\n", 448 | "\n", 449 | " # Your code here\n", 450 | " poly = PolynomialFeatures(degree=12)\n", 451 | "\n", 452 | " X_train_poly = poly.fit_transform(X_train.reshape(-1, 1))\n", 453 | "\n", 454 | " X_test_poly = poly.fit_transform(X_test.reshape(-1, 1))\n", 455 | "\n", 456 | " linreg = LinearRegression().fit(X_train_poly, y_train)\n", 457 | " \n", 458 | " linlasso = Lasso(alpha=0.01, max_iter = 10000).fit(X_train_poly, y_train)\n", 459 | "\n", 460 | " score_linreg_test = linreg.score(X_test_poly, y_test)\n", 461 | "\n", 462 | " score_lasso_test = linlasso.score(X_test_poly, y_test)\n", 463 | "\n", 464 | " return (score_linreg_test, score_lasso_test)\n", 465 | "\n", 466 | "# answer_four()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "## Part 2 - Classification\n", 474 | "\n", 475 | "Here's an application of machine learning that could save your life! 
For this section of the assignment we will be working with the [UCI Mushroom Data Set](http://archive.ics.uci.edu/ml/datasets/Mushroom?ref=datanews.io) stored in `mushrooms.csv`. The data will be used to train a model to predict whether or not a mushroom is poisonous. The following attributes are provided:\n", 476 | "\n", 477 | "*Attribute Information:*\n", 478 | "\n", 479 | "1. cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s \n", 480 | "2. cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s \n", 481 | "3. cap-color: brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y \n", 482 | "4. bruises?: bruises=t, no=f \n", 483 | "5. odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s \n", 484 | "6. gill-attachment: attached=a, descending=d, free=f, notched=n \n", 485 | "7. gill-spacing: close=c, crowded=w, distant=d \n", 486 | "8. gill-size: broad=b, narrow=n \n", 487 | "9. gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y \n", 488 | "10. stalk-shape: enlarging=e, tapering=t \n", 489 | "11. stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=? \n", 490 | "12. stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s \n", 491 | "13. stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s \n", 492 | "14. stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y \n", 493 | "15. stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y \n", 494 | "16. veil-type: partial=p, universal=u \n", 495 | "17. veil-color: brown=n, orange=o, white=w, yellow=y \n", 496 | "18. ring-number: none=n, one=o, two=t \n", 497 | "19. ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z \n", 498 | "20. spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y \n", 499 | "21. population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y \n", 500 | "22. habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d\n", 501 | "\n", 502 | "
\n", 503 | "\n", 504 | "The data in the mushrooms dataset is currently encoded with strings. These values will need to be encoded to numeric to work with sklearn. We'll use pd.get_dummies to convert the categorical variables into indicator variables. " 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 7, 510 | "metadata": { 511 | "collapsed": false 512 | }, 513 | "outputs": [], 514 | "source": [ 515 | "import pandas as pd\n", 516 | "import numpy as np\n", 517 | "from sklearn.model_selection import train_test_split\n", 518 | "\n", 519 | "\n", 520 | "mush_df = pd.read_csv('mushrooms.csv')\n", 521 | "mush_df2 = pd.get_dummies(mush_df)\n", 522 | "\n", 523 | "X_mush = mush_df2.iloc[:,2:]\n", 524 | "y_mush = mush_df2.iloc[:,1]\n", 525 | "\n", 526 | "# use the variables X_train2, y_train2 for Question 5\n", 527 | "X_train2, X_test2, y_train2, y_test2 = train_test_split(X_mush, y_mush, random_state=0)\n", 528 | "\n", 529 | "# print(X_mush)\n", 530 | "\n", 531 | "# For performance reasons in Questions 6 and 7, we will create a smaller version of the\n", 532 | "# entire mushroom dataset for use in those questions. For simplicity we'll just re-use\n", 533 | "# the 25% test split created above as the representative subset.\n", 534 | "#\n", 535 | "# Use the variables X_subset, y_subset for Questions 6 and 7.\n", 536 | "X_subset = X_test2\n", 537 | "y_subset = y_test2" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "### Question 5\n", 545 | "\n", 546 | "Using `X_train2` and `y_train2` from the preceeding cell, train a DecisionTreeClassifier with default parameters and random_state=0. What are the 5 most important features found by the decision tree?\n", 547 | "\n", 548 | "As a reminder, the feature names are available in the `X_train2.columns` property, and the order of the features in `X_train2.columns` matches the order of the feature importance values in the classifier's `feature_importances_` property. \n", 549 | "\n", 550 | "*This function should return a list of length 5 containing the feature names in descending order of importance.*\n" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 8, 556 | "metadata": { 557 | "collapsed": false 558 | }, 559 | "outputs": [ 560 | { 561 | "data": { 562 | "text/plain": [ 563 | "['odor_n', 'stalk-root_c', 'stalk-root_r', 'spore-print-color_r', 'odor_l']" 564 | ] 565 | }, 566 | "execution_count": 8, 567 | "metadata": {}, 568 | "output_type": "execute_result" 569 | } 570 | ], 571 | "source": [ 572 | "def answer_five():\n", 573 | " from sklearn.tree import DecisionTreeClassifier\n", 574 | "\n", 575 | " # Your code here\n", 576 | " clf = DecisionTreeClassifier(random_state = 0).fit(X_train2, y_train2)\n", 577 | " Series = pd.Series(data = clf.feature_importances_, index = X_train2.columns.values)\n", 578 | " \n", 579 | " results = Series.sort_values(axis=0, ascending=False).index.tolist()\n", 580 | " \n", 581 | " answer = results[:5]\n", 582 | " \n", 583 | " return answer\n", 584 | "\n", 585 | "answer_five()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "### Question 6\n", 593 | "\n", 594 | "For this question, we're going to use the `validation_curve` function in `sklearn.model_selection` to determine training and test scores for a Support Vector Classifier (`SVC`) with varying parameter values. 
Recall that the validation_curve function, in addition to taking an initialized unfitted classifier object, takes a dataset as input and does its own internal train-test splits to compute results.\n", 595 | "\n", 596 | "**Because creating a validation curve requires fitting multiple models, for performance reasons this question will use just a subset of the original mushroom dataset: please use the variables X_subset and y_subset as input to the validation curve function (instead of X_mush and y_mush) to reduce computation time.**\n", 597 | "\n", 598 | "The initialized unfitted classifier object we'll be using is a Support Vector Classifier with radial basis kernel. So your first step is to create an `SVC` object with default parameters (i.e. `kernel='rbf', C=1`) and `random_state=0`. Recall that the kernel width of the RBF kernel is controlled using the `gamma` parameter. \n", 599 | "\n", 600 | "With this classifier, and the dataset in X_subset, y_subset, explore the effect of `gamma` on classifier accuracy by using the `validation_curve` function to find the training and test scores for 6 values of `gamma` from `0.0001` to `10` (i.e. `np.logspace(-4,1,6)`). Recall that you can specify what scoring metric you want validation_curve to use by setting the \"scoring\" parameter. In this case, we want to use \"accuracy\" as the scoring metric.\n", 601 | "\n", 602 | "For each level of `gamma`, `validation_curve` will fit 3 models on different subsets of the data, returning two 6x3 (6 levels of gamma x 3 fits per level) arrays of the scores for the training and test sets.\n", 603 | "\n", 604 | "Find the mean score across the three models for each level of `gamma` for both arrays, creating two arrays of length 6, and return a tuple with the two arrays.\n", 605 | "\n", 606 | "e.g.\n", 607 | "\n", 608 | "if one of your array of scores is\n", 609 | "\n", 610 | " array([[ 0.5, 0.4, 0.6],\n", 611 | " [ 0.7, 0.8, 0.7],\n", 612 | " [ 0.9, 0.8, 0.8],\n", 613 | " [ 0.8, 0.7, 0.8],\n", 614 | " [ 0.7, 0.6, 0.6],\n", 615 | " [ 0.4, 0.6, 0.5]])\n", 616 | " \n", 617 | "it should then become\n", 618 | "\n", 619 | " array([ 0.5, 0.73333333, 0.83333333, 0.76666667, 0.63333333, 0.5])\n", 620 | "\n", 621 | "*This function should return one tuple of numpy arrays `(training_scores, test_scores)` where each array in the tuple has shape `(6,)`.*" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 22, 627 | "metadata": { 628 | "collapsed": false 629 | }, 630 | "outputs": [ 631 | { 632 | "data": { 633 | "text/plain": [ 634 | "(array([ 0.56647847, 0.93155951, 0.99039881, 1. , 1. , 1. ]),\n", 635 | " array([ 0.56768547, 0.92959558, 0.98965952, 1. 
, 0.99507994,\n", 636 | " 0.52240279]))" 637 | ] 638 | }, 639 | "execution_count": 22, 640 | "metadata": {}, 641 | "output_type": "execute_result" 642 | } 643 | ], 644 | "source": [ 645 | "def answer_six():\n", 646 | " from sklearn.svm import SVC\n", 647 | " from sklearn.model_selection import validation_curve\n", 648 | "\n", 649 | " # Your code here\n", 650 | " this_C = 1.0\n", 651 | " clf = SVC(kernel = 'rbf', C=this_C).fit(X_train2, y_train2)\n", 652 | " \n", 653 | " param_range = np.logspace(-4,1,6)\n", 654 | "# print(C)\n", 655 | "\n", 656 | " train_scores, test_scores = validation_curve(clf, X_subset, y_subset,param_name='gamma',\n", 657 | " param_range=param_range, cv=3)\n", 658 | " training_scores_mean = np.mean(train_scores, axis = 1)\n", 659 | " test_scores_mean = np.mean(test_scores, axis = 1)\n", 660 | "\n", 661 | " \n", 662 | " return (training_scores_mean, test_scores_mean)\n", 663 | "\n", 664 | "answer_six()" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | "### Question 7\n", 672 | "\n", 673 | "Based on the scores from question 6, what gamma value corresponds to a model that is underfitting (and has the worst test set accuracy)? What gamma value corresponds to a model that is overfitting (and has the worst test set accuracy)? What choice of gamma would be the best choice for a model with good generalization performance on this dataset (high accuracy on both training and test set)? Note: there may be multiple correct solutions to this question.\n", 674 | "\n", 675 | "(Hint: Try plotting the scores from question 6 to visualize the relationship between gamma and accuracy.)\n", 676 | "\n", 677 | "*This function should return one tuple with the degree values in this order: `(Underfitting, Overfitting, Good_Generalization)`*" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 23, 683 | "metadata": { 684 | "collapsed": false 685 | }, 686 | "outputs": [ 687 | { 688 | "name": "stdout", 689 | "output_type": "stream", 690 | "text": [ 691 | "[ 1.00000000e-04 1.00000000e-03 1.00000000e-02 1.00000000e-01\n", 692 | " 1.00000000e+00 1.00000000e+01]\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "def answer_seven():\n", 698 | " \n", 699 | " # Your code here\n", 700 | "# param_range = np.logspace(-4,1,6)\n", 701 | " \n", 702 | "# print(param_range)\n", 703 | " \n", 704 | " return (1.00000000e-04, 1.00000000e+01, 1.00000000e-01)\n", 705 | "answer_seven()" 706 | ] 707 | } 708 | ], 709 | "metadata": { 710 | "coursera": { 711 | "course_slug": "python-machine-learning", 712 | "graded_item_id": "eWYHL", 713 | "launcher_item_id": "BAqef", 714 | "part_id": "fXXRp" 715 | }, 716 | "kernelspec": { 717 | "display_name": "Python 3", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.5.2" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 2 736 | } 737 | -------------------------------------------------------------------------------- /week3_Assignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.1** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Assignment 3 - Evaluation\n", 19 | "\n", 20 | "In this assignment you will train several models and evaluate how effectively they predict instances of fraud using data based on [this dataset from Kaggle](https://www.kaggle.com/dalpozz/creditcardfraud).\n", 21 | " \n", 22 | "Each row in `fraud_data.csv` corresponds to a credit card transaction. Features include confidential variables `V1` through `V28` as well as `Amount` which is the amount of the transaction. \n", 23 | " \n", 24 | "The target is stored in the `class` column, where a value of 1 corresponds to an instance of fraud and 0 corresponds to an instance of not fraud." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import numpy as np\n", 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Question 1\n", 44 | "Import the data from `fraud_data.csv`. What percentage of the observations in the dataset are instances of fraud?\n", 45 | "\n", 46 | "*This function should return a float between 0 and 1.* " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "0.016684632328818484" 60 | ] 61 | }, 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "def answer_one():\n", 69 | " \n", 70 | " # Your code here\n", 71 | " df = pd.read_csv('fraud_data.csv')\n", 72 | " ans = (len(df[df['Class'] == 1]) / len(df[df['Class'] == 0]))\n", 73 | " return ans\n", 74 | "\n", 75 | "answer_one()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# Use X_train, X_test, y_train, y_test for all of the following questions\n", 87 | "from sklearn.model_selection import train_test_split\n", 88 | "\n", 89 | "df = pd.read_csv('fraud_data.csv')\n", 90 | "\n", 91 | "X = df.iloc[:,:-1]\n", 92 | "y = df.iloc[:,-1]\n", 93 | "\n", 94 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Question 2\n", 102 | "\n", 103 | "Using `X_train`, `X_test`, `y_train`, and `y_test` (as defined above), train a dummy classifier that classifies everything as the majority class of the training data. What is the accuracy of this classifier? What is the recall?\n", 104 | "\n", 105 | "*This function should a return a tuple with two floats, i.e. 
`(accuracy score, recall score)`.*" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 8, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "0.9833153676711816\n" 120 | ] 121 | }, 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "(0.98525073746312686, 0.0)" 126 | ] 127 | }, 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "def answer_two():\n", 135 | " from sklearn.dummy import DummyClassifier\n", 136 | " from sklearn.metrics import recall_score, accuracy_score\n", 137 | " \n", 138 | " # Your code here\n", 139 | " \n", 140 | " dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)\n", 141 | " # Therefore the dummy 'most_frequent' classifier always predicts class 0\n", 142 | " y_dummy_predictions = dummy_majority.predict(X_test)\n", 143 | "\n", 144 | " ans = (accuracy_score(y_test, y_dummy_predictions), recall_score(y_test, y_dummy_predictions))\n", 145 | " \n", 146 | " return ans\n", 147 | "answer_two()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### Question 3\n", 155 | "\n", 156 | "Using X_train, X_test, y_train, y_test (as defined above), train a SVC classifer using the default parameters. What is the accuracy, recall, and precision of this classifier?\n", 157 | "\n", 158 | "*This function should a return a tuple with three floats, i.e. `(accuracy score, recall score, precision score)`.*" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "(0.99078171091445433, 0.375, 1.0)" 172 | ] 173 | }, 174 | "execution_count": 10, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "def answer_three():\n", 181 | " from sklearn.metrics import recall_score, precision_score, accuracy_score\n", 182 | " from sklearn.svm import SVC\n", 183 | "\n", 184 | " # Your code here\n", 185 | " svm = SVC().fit(X_train, y_train)\n", 186 | " y_predictions = svm.predict(X_test)\n", 187 | " \n", 188 | " ans = (accuracy_score(y_test, y_predictions), recall_score(y_test, y_predictions), precision_score(y_test, y_predictions))\n", 189 | " \n", 190 | " return ans\n", 191 | "answer_three()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### Question 4\n", 199 | "\n", 200 | "Using the SVC classifier with parameters `{'C': 1e9, 'gamma': 1e-07}`, what is the confusion matrix when using a threshold of -220 on the decision function. 
Use X_test and y_test.\n", 201 | "\n", 202 | "*This function should return a confusion matrix, a 2x2 numpy array with 4 integers.*" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 18, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "array([[5320, 24],\n", 216 | " [ 14, 66]])" 217 | ] 218 | }, 219 | "execution_count": 18, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "def answer_four():\n", 226 | " from sklearn.metrics import confusion_matrix\n", 227 | " from sklearn.svm import SVC\n", 228 | "\n", 229 | " # Your code here\n", 230 | " svm = SVC(C = 1e9, gamma = 1e-07).fit(X_train, y_train)\n", 231 | " svm_predicted = svm.decision_function(X_test) > -220\n", 232 | " \n", 233 | "# print(svm_predicted)\n", 234 | " \n", 235 | " confusion = confusion_matrix(y_test, svm_predicted)\n", 236 | " \n", 237 | " ans = confusion\n", 238 | " \n", 239 | " return ans\n", 240 | "\n", 241 | "answer_four()" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "### Question 5\n", 249 | "\n", 250 | "Train a logisitic regression classifier with default parameters using X_train and y_train.\n", 251 | "\n", 252 | "For the logisitic regression classifier, create a precision recall curve and a roc curve using y_test and the probability estimates for X_test (probability it is fraud).\n", 253 | "\n", 254 | "Looking at the precision recall curve, what is the recall when the precision is `0.75`?\n", 255 | "\n", 256 | "Looking at the roc curve, what is the true positive rate when the false positive rate is `0.16`?\n", 257 | "\n", 258 | "*This function should return a tuple with two floats, i.e. 
`(recall, true positive rate)`.*" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 23, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "(0.82499999999999996, 0.98750000000000004)" 272 | ] 273 | }, 274 | "execution_count": 23, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "def answer_five():\n", 281 | " \n", 282 | " # Your code here\n", 283 | " from sklearn.linear_model import LogisticRegression\n", 284 | " from sklearn.metrics import precision_recall_curve\n", 285 | " from sklearn.metrics import roc_curve\n", 286 | "\n", 287 | " lr = LogisticRegression().fit(X_train, y_train)\n", 288 | " \n", 289 | " y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test)\n", 290 | " \n", 291 | "# lr_predicted = lr.predict(X_test)\n", 292 | " \n", 293 | " precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)\n", 294 | " closest_zero_p = np.argmin(np.abs(precision-0.75))\n", 295 | "# closest_zero_p = precision[closest_zero]\n", 296 | " closest_zero_r = recall[closest_zero_p]\n", 297 | " \n", 298 | "# print(closest_zero_r)\n", 299 | " \n", 300 | " \n", 301 | " fpr_lr, tpr_lr, _ = roc_curve(y_test, y_scores_lr)\n", 302 | "# roc_auc_lr = auc(fpr_lr, tpr_lr)\n", 303 | " \n", 304 | " closest_zero_fpr_lr = np.argmin(np.abs(fpr_lr - 0.16))\n", 305 | "# closest_zero_p = precision[closest_zero]\n", 306 | " closest_zero_tpr_lr = recall[closest_zero_fpr_lr]\n", 307 | " \n", 308 | "# print(closest_zero_tpr_lr)\n", 309 | "\n", 310 | " \n", 311 | "# y_proba_lr = lr.fit(X_train, y_train).predict_proba(X_test)\n", 312 | " \n", 313 | "# confusion = confusion_matrix(y_test, lr_predicted)\n", 314 | "\n", 315 | " ans = (closest_zero_r, closest_zero_tpr_lr)\n", 316 | " \n", 317 | " return ans\n", 318 | "answer_five()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "### Question 6\n", 326 | "\n", 327 | "Perform a grid search over the parameters listed below for a Logisitic Regression classifier, using recall for scoring and the default 3-fold cross validation.\n", 328 | "\n", 329 | "`'penalty': ['l1', 'l2']`\n", 330 | "\n", 331 | "`'C':[0.01, 0.1, 1, 10, 100]`\n", 332 | "\n", 333 | "From `.cv_results_`, create an array of the mean test scores of each parameter combination. i.e.\n", 334 | "\n", 335 | "| \t| `l1` \t| `l2` \t|\n", 336 | "|:----:\t|----\t|----\t|\n", 337 | "| **`0.01`** \t| ?\t| ? \t|\n", 338 | "| **`0.1`** \t| ?\t| ? \t|\n", 339 | "| **`1`** \t| ?\t| ? \t|\n", 340 | "| **`10`** \t| ?\t| ? \t|\n", 341 | "| **`100`** \t| ?\t| ? \t|\n", 342 | "\n", 343 | "
\n", 344 | "\n", 345 | "*This function should return a 5 by 2 numpy array with 10 floats.* \n", 346 | "\n", 347 | "*Note: do not return a DataFrame, just the values denoted by '?' above in a numpy array.*" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 33, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "array([[ 0.66666667, 0.76086957],\n", 361 | " [ 0.80072464, 0.80434783],\n", 362 | " [ 0.8115942 , 0.8115942 ],\n", 363 | " [ 0.80797101, 0.8115942 ],\n", 364 | " [ 0.80797101, 0.80797101]])" 365 | ] 366 | }, 367 | "execution_count": 33, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "def answer_six(): \n", 374 | " from sklearn.model_selection import GridSearchCV\n", 375 | " from sklearn.linear_model import LogisticRegression\n", 376 | "\n", 377 | " # Your code here\n", 378 | " lr = LogisticRegression()\n", 379 | "\n", 380 | " grid_values = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}\n", 381 | "\n", 382 | " # default metric to optimize over grid parameters\n", 383 | " grid_lr = GridSearchCV(lr, param_grid = grid_values, scoring = 'recall')\n", 384 | " grid_lr.fit(X_train, y_train)\n", 385 | " \n", 386 | "# print(grid_lr.cv_results_['mean_test_score'].reshape(5,2))\n", 387 | " ans = np.array(grid_lr.cv_results_['mean_test_score'].reshape(5,2))\n", 388 | " \n", 389 | " return ans\n", 390 | "\n", 391 | "answer_six()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "# Use the following function to help visualize results from the grid search\n", 403 | "def GridSearch_Heatmap(scores):\n", 404 | " %matplotlib notebook\n", 405 | " import seaborn as sns\n", 406 | " import matplotlib.pyplot as plt\n", 407 | " plt.figure()\n", 408 | " sns.heatmap(scores.reshape(5,2), xticklabels=['l1','l2'], yticklabels=[0.01, 0.1, 1, 10, 100])\n", 409 | " plt.yticks(rotation=0);\n", 410 | "\n", 411 | "# GridSearch_Heatmap(answer_six())" 412 | ] 413 | } 414 | ], 415 | "metadata": { 416 | "coursera": { 417 | "course_slug": "python-machine-learning", 418 | "graded_item_id": "5yX9Z", 419 | "launcher_item_id": "eqnV3", 420 | "part_id": "Msnj0" 421 | }, 422 | "kernelspec": { 423 | "display_name": "Python 3", 424 | "language": "python", 425 | "name": "python3" 426 | }, 427 | "language_info": { 428 | "codemirror_mode": { 429 | "name": "ipython", 430 | "version": 3 431 | }, 432 | "file_extension": ".py", 433 | "mimetype": "text/x-python", 434 | "name": "python", 435 | "nbconvert_exporter": "python", 436 | "pygments_lexer": "ipython3", 437 | "version": "3.5.2" 438 | } 439 | }, 440 | "nbformat": 4, 441 | "nbformat_minor": 2 442 | } 443 | -------------------------------------------------------------------------------- /week4_Assignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.0** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Assignment 4 - Understanding and Predicting Property Maintenance Fines\n", 19 | "\n", 20 | "This assignment is based on a data challenge from the Michigan Data Science Team ([MDST](http://midas.umich.edu/mdst/)). \n", 21 | "\n", 22 | "The Michigan Data Science Team ([MDST](http://midas.umich.edu/mdst/)) and the Michigan Student Symposium for Interdisciplinary Statistical Sciences ([MSSISS](https://sites.lsa.umich.edu/mssiss/)) have partnered with the City of Detroit to help solve one of the most pressing problems facing Detroit - blight. [Blight violations](http://www.detroitmi.gov/How-Do-I/Report/Blight-Complaint-FAQs) are issued by the city to individuals who allow their properties to remain in a deteriorated condition. Every year, the city of Detroit issues millions of dollars in fines to residents and every year, many of these fines remain unpaid. Enforcing unpaid blight fines is a costly and tedious process, so the city wants to know: how can we increase blight ticket compliance?\n", 23 | "\n", 24 | "The first step in answering this question is understanding when and why a resident might fail to comply with a blight ticket. This is where predictive modeling comes in. For this assignment, your task is to predict whether a given blight ticket will be paid on time.\n", 25 | "\n", 26 | "All data for this assignment has been provided to us through the [Detroit Open Data Portal](https://data.detroitmi.gov/). **Only the data already included in your Coursera directory can be used for training the model for this assignment.** Nonetheless, we encourage you to look into data from other Detroit datasets to help inform feature creation and model selection. We recommend taking a look at the following related datasets:\n", 27 | "\n", 28 | "* [Building Permits](https://data.detroitmi.gov/Property-Parcels/Building-Permits/xw2a-a7tf)\n", 29 | "* [Trades Permits](https://data.detroitmi.gov/Property-Parcels/Trades-Permits/635b-dsgv)\n", 30 | "* [Improve Detroit: Submitted Issues](https://data.detroitmi.gov/Government/Improve-Detroit-Submitted-Issues/fwz3-w3yn)\n", 31 | "* [DPD: Citizen Complaints](https://data.detroitmi.gov/Public-Safety/DPD-Citizen-Complaints-2016/kahe-efs3)\n", 32 | "* [Parcel Map](https://data.detroitmi.gov/Property-Parcels/Parcel-Map/fxkw-udwf)\n", 33 | "\n", 34 | "___\n", 35 | "\n", 36 | "We provide you with two data files for use in training and validating your models: train.csv and test.csv. Each row in these two files corresponds to a single blight ticket, and includes information about when, why, and to whom each ticket was issued. The target variable is compliance, which is True if the ticket was paid early, on time, or within one month of the hearing data, False if the ticket was paid after the hearing date or not at all, and Null if the violator was found not responsible. Compliance, as well as a handful of other variables that will not be available at test-time, are only included in train.csv.\n", 37 | "\n", 38 | "Note: All tickets where the violators were found not responsible are not considered during evaluation. 
They are included in the training set as an additional source of data for visualization, and to enable unsupervised and semi-supervised approaches. However, they are not included in the test set.\n", 39 | "\n", 40 | "
\n", 41 | "\n", 42 | "**File descriptions** (Use only this data for training your model!)\n", 43 | "\n", 44 | " train.csv - the training set (all tickets issued 2004-2011)\n", 45 | " test.csv - the test set (all tickets issued 2012-2016)\n", 46 | " addresses.csv & latlons.csv - mapping from ticket id to addresses, and from addresses to lat/lon coordinates. \n", 47 | " Note: misspelled addresses may be incorrectly geolocated.\n", 48 | "\n", 49 | "
\n", 50 | "\n", 51 | "**Data fields**\n", 52 | "\n", 53 | "train.csv & test.csv\n", 54 | "\n", 55 | " ticket_id - unique identifier for tickets\n", 56 | " agency_name - Agency that issued the ticket\n", 57 | " inspector_name - Name of inspector that issued the ticket\n", 58 | " violator_name - Name of the person/organization that the ticket was issued to\n", 59 | " violation_street_number, violation_street_name, violation_zip_code - Address where the violation occurred\n", 60 | " mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country - Mailing address of the violator\n", 61 | " ticket_issued_date - Date and time the ticket was issued\n", 62 | " hearing_date - Date and time the violator's hearing was scheduled\n", 63 | " violation_code, violation_description - Type of violation\n", 64 | " disposition - Judgment and judgement type\n", 65 | " fine_amount - Violation fine amount, excluding fees\n", 66 | " admin_fee - $20 fee assigned to responsible judgments\n", 67 | "state_fee - $10 fee assigned to responsible judgments\n", 68 | " late_fee - 10% fee assigned to responsible judgments\n", 69 | " discount_amount - discount applied, if any\n", 70 | " clean_up_cost - DPW clean-up or graffiti removal cost\n", 71 | " judgment_amount - Sum of all fines and fees\n", 72 | " grafitti_status - Flag for graffiti violations\n", 73 | " \n", 74 | "train.csv only\n", 75 | "\n", 76 | " payment_amount - Amount paid, if any\n", 77 | " payment_date - Date payment was made, if it was received\n", 78 | " payment_status - Current payment status as of Feb 1 2017\n", 79 | " balance_due - Fines and fees still owed\n", 80 | " collection_status - Flag for payments in collections\n", 81 | " compliance [target variable for prediction] \n", 82 | " Null = Not responsible\n", 83 | " 0 = Responsible, non-compliant\n", 84 | " 1 = Responsible, compliant\n", 85 | " compliance_detail - More information on why each ticket was marked compliant or non-compliant\n", 86 | "\n", 87 | "\n", 88 | "___\n", 89 | "\n", 90 | "## Evaluation\n", 91 | "\n", 92 | "Your predictions will be given as the probability that the corresponding blight ticket will be paid on time.\n", 93 | "\n", 94 | "The evaluation metric for this assignment is the Area Under the ROC Curve (AUC). \n", 95 | "\n", 96 | "Your grade will be based on the AUC score computed for your classifier. A model which with an AUROC of 0.7 passes this assignment, over 0.75 will recieve full points.\n", 97 | "___\n", 98 | "\n", 99 | "For this assignment, create a function that trains a model to predict blight ticket compliance in Detroit using `train.csv`. Using this model, return a series of length 61001 with the data being the probability that each corresponding ticket from `test.csv` will be paid, and the index being the ticket_id.\n", 100 | "\n", 101 | "Example:\n", 102 | "\n", 103 | " ticket_id\n", 104 | " 284932 0.531842\n", 105 | " 285362 0.401958\n", 106 | " 285361 0.105928\n", 107 | " 285338 0.018572\n", 108 | " ...\n", 109 | " 376499 0.208567\n", 110 | " 376500 0.818759\n", 111 | " 369851 0.018528\n", 112 | " Name: compliance, dtype: float32" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 15, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stderr", 124 | "output_type": "stream", 125 | "text": [ 126 | "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2827: DtypeWarning: Columns (11,12,31) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 127 | " if self.run_code(code, result):\n" 128 | ] 129 | }, 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "ticket_id\n", 134 | "284932 0.060788\n", 135 | "285362 0.026533\n", 136 | "285361 0.068650\n", 137 | "285338 0.060788\n", 138 | "285346 0.068650\n", 139 | "285345 0.060788\n", 140 | "285347 0.055858\n", 141 | "285342 0.401352\n", 142 | "285530 0.026533\n", 143 | "284989 0.029734\n", 144 | "285344 0.055858\n", 145 | "285343 0.026533\n", 146 | "285340 0.026533\n", 147 | "285341 0.055858\n", 148 | "285349 0.068650\n", 149 | "285348 0.060788\n", 150 | "284991 0.029734\n", 151 | "285532 0.029734\n", 152 | "285406 0.029734\n", 153 | "285001 0.029734\n", 154 | "285006 0.026533\n", 155 | "285405 0.026533\n", 156 | "285337 0.029734\n", 157 | "285496 0.055858\n", 158 | "285497 0.060788\n", 159 | "285378 0.026533\n", 160 | "285589 0.029734\n", 161 | "285585 0.060788\n", 162 | "285501 0.068650\n", 163 | "285581 0.026533\n", 164 | " ... \n", 165 | "376367 0.029734\n", 166 | "376366 0.035475\n", 167 | "376362 0.035475\n", 168 | "376363 0.060788\n", 169 | "376365 0.029734\n", 170 | "376364 0.035475\n", 171 | "376228 0.035475\n", 172 | "376265 0.035475\n", 173 | "376286 0.369236\n", 174 | "376320 0.035475\n", 175 | "376314 0.035475\n", 176 | "376327 0.369236\n", 177 | "376385 0.369236\n", 178 | "376435 0.475758\n", 179 | "376370 0.369236\n", 180 | "376434 0.055858\n", 181 | "376459 0.068650\n", 182 | "376478 0.008845\n", 183 | "376473 0.035475\n", 184 | "376484 0.024999\n", 185 | "376482 0.029734\n", 186 | "376480 0.029734\n", 187 | "376479 0.029734\n", 188 | "376481 0.029734\n", 189 | "376483 0.035475\n", 190 | "376496 0.026533\n", 191 | "376497 0.026533\n", 192 | "376499 0.068650\n", 193 | "376500 0.068650\n", 194 | "369851 0.308120\n", 195 | "dtype: float32" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "import pandas as pd\n", 205 | "import numpy as np\n", 206 | "import math\n", 207 | "from sklearn.ensemble import RandomForestClassifier\n", 208 | "from sklearn.model_selection import train_test_split\n", 209 | "from sklearn.metrics import roc_auc_score\n", 210 | "from sklearn.metrics import roc_curve, auc\n", 211 | "from sklearn.preprocessing import LabelEncoder\n", 212 | "from sklearn.model_selection import GridSearchCV\n", 213 | "\n", 214 | "def blight_model():\n", 215 | " \n", 216 | " # Your code here\n", 217 | " \n", 218 | " df = pd.read_csv('train.csv', encoding = \"ISO-8859-1\")\n", 219 | " \n", 220 | " df.index = df['ticket_id']\n", 221 | " \n", 222 | "# features_name = ['agency_name', 'inspector_name', 'violator_name', 'violation_street_number', \n", 223 | "# 'violation_street_name', 'mailing_address_str_number', 'mailing_address_str_name',\n", 224 | "# 'city', 'state', 'zip_code', 'ticket_issued_date', 'hearing_date',\n", 225 | "# 'violation_code', 'violation_description', 'disposition', 'fine_amount', 'admin_fee',\n", 226 | "# 'state_fee' , 'late_fee', 'discount_amount', 'clean_up_cost' , 'judgment_amount'\n", 227 | "# ]\n", 228 | "\n", 229 | "\n", 230 | " features_name = ['fine_amount', 'admin_fee', 'state_fee', 'late_fee']\n", 231 | " \n", 232 | " df.compliance = df.compliance.fillna(value=-1)\n", 233 | " \n", 234 | " df = df[df.compliance != -1]\n", 235 | " \n", 236 | "# le = LabelEncoder().fit(df['inspector_name'])\n", 237 | " \n", 238 | "# inspector_name_transformed = le.transform(df['inspector_name'])\n", 239 | " \n", 
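"# (Hedged sketch, not the author's solution: if a categorical feature such as\n", "# inspector_name were wanted, the LabelEncoder above would need to be fit on the\n", "# union of train and test values, otherwise transform() raises on labels that\n", "# appear only in test.csv. Assuming df_test as loaded further below, e.g.:\n", "# le = LabelEncoder().fit(pd.concat([df['inspector_name'], df_test['inspector_name']]))\n", "# df['inspector_name_enc'] = le.transform(df['inspector_name'])\n", "# The fit would have to move after the read_csv of test.csv for this to run.)\n",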
240 | " \n", 241 | " X = df[features_name]\n", 242 | " \n", 243 | "# X['inspector_name'] = le.transform(df['inspector_name'])\n", 244 | " \n", 245 | "# print(X)\n", 246 | " \n", 247 | " X.fillna(value = -1)\n", 248 | " \n", 249 | " y = df.compliance\n", 250 | " \n", 251 | " X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)\n", 252 | " \n", 253 | " clf = RandomForestClassifier(n_estimators = 10, max_depth = 5).fit(X_train, y_train)\n", 254 | " \n", 255 | "# grid_values = {'n_estimators': [9, 10, 11], 'max_depth': [1,2,3,4,5] } # n_est = 10 and max_depth = 5\n", 256 | " \n", 257 | " # default metric to optimize over grid parameters: accuracy\n", 258 | "# grid_clf = GridSearchCV(clf, param_grid = grid_values)\n", 259 | "# grid_clf.fit(X_train, y_train)\n", 260 | "\n", 261 | " \n", 262 | "# y_score = clf.predict(X_test)\n", 263 | " \n", 264 | "# fpr, tpr, _ = roc_curve(y_test, y_score)\n", 265 | " \n", 266 | "# roc_auc = auc(fpr, tpr)\n", 267 | " \n", 268 | "# print(roc_auc)\n", 269 | "\n", 270 | " features_name = ['fine_amount', 'admin_fee', 'state_fee', 'late_fee']\n", 271 | " \n", 272 | " df_test = pd.read_csv('test.csv', encoding = \"ISO-8859-1\")\n", 273 | " \n", 274 | " df_test.index = df_test['ticket_id']\n", 275 | " \n", 276 | " X_predict = clf.predict_proba(df_test[features_name])\n", 277 | " \n", 278 | " ans = pd.Series(data = X_predict[:,1], index = df_test['ticket_id'], dtype='float32')\n", 279 | "\n", 280 | "# print(ans)\n", 281 | " \n", 282 | " return ans\n", 283 | "\n", 284 | "blight_model()" 285 | ] 286 | } 287 | ], 288 | "metadata": { 289 | "coursera": { 290 | "course_slug": "python-machine-learning", 291 | "graded_item_id": "nNS8l", 292 | "launcher_item_id": "yWWk7", 293 | "part_id": "w8BSS" 294 | }, 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.5.2" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 2 315 | } 316 | --------------------------------------------------------------------------------