├── README.md
├── Certificate.pdf
├── Week 4
    ├── rules_of_ml.pdf
    ├── cs670_Tran_PreferredPaper_LeakingInDataMining.pdf
    ├── Unsupervised+Learning.ipynb
    ├── Module+4.ipynb
    └── Assignment4.ipynb
├── .gitattributes
├── .gitignore
├── Week 1
    └── Module+1.ipynb
├── Week 3
    └── Module+3.ipynb
└── Week 2
    └── Module+2.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # Applied-Machine-Learning-in-Python
2 | 
3 | Coursera course by University of Michigan
4 | 


--------------------------------------------------------------------------------
/Certificate.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Applied-Machine-Learning-in-Python/master/Certificate.pdf


--------------------------------------------------------------------------------
/Week 4/rules_of_ml.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Applied-Machine-Learning-in-Python/master/Week 4/rules_of_ml.pdf


--------------------------------------------------------------------------------
/Week 4/cs670_Tran_PreferredPaper_LeakingInDataMining.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bondeanikets/Applied-Machine-Learning-in-Python/master/Week 4/cs670_Tran_PreferredPaper_LeakingInDataMining.pdf


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 


--------------------------------------------------------------------------------
/Week 1/Module+1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "---\n",
  8 |     "\n",
  9 |     "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
 10 |     "\n",
 11 |     "---"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "## Applied Machine Learning, Module 1:  A simple classification task"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "### Import required modules and load data file"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "%matplotlib notebook\n",
 37 |     "import numpy as np\n",
 38 |     "import matplotlib.pyplot as plt\n",
 39 |     "import pandas as pd\n",
 40 |     "from sklearn.model_selection import train_test_split\n",
 41 |     "\n",
 42 |     "fruits = pd.read_table('fruit_data_with_colors.txt')"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "collapsed": false
 50 |    },
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "fruits.head()"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {
 60 |     "collapsed": false
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "# create a mapping from fruit label value to fruit name to make results easier to interpret\n",
 65 |     "lookup_fruit_name = dict(zip(fruits.fruit_label.unique(), fruits.fruit_name.unique()))   \n",
 66 |     "lookup_fruit_name"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "The file contains the mass, height, width, and color score of a selection of apples, mandarins, oranges, and lemons. The heights were measured along the core of the fruit. The widths were the widest width perpendicular to the height."
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "### Examining the data"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {
 87 |     "collapsed": false
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# plotting a scatter matrix\n",
 92 |     "from matplotlib import cm\n",
 93 |     "\n",
 94 |     "X = fruits[['height', 'width', 'mass', 'color_score']]\n",
 95 |     "y = fruits['fruit_label']\n",
 96 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
 97 |     "\n",
 98 |     "cmap = cm.get_cmap('gnuplot')\n",
 99 |     "scatter = pd.scatter_matrix(X_train, c= y_train, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9), cmap=cmap)"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {
106 |     "collapsed": false
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "# plotting a 3D scatter plot\n",
111 |     "from mpl_toolkits.mplot3d import Axes3D\n",
112 |     "\n",
113 |     "fig = plt.figure()\n",
114 |     "ax = fig.add_subplot(111, projection = '3d')\n",
115 |     "ax.scatter(X_train['width'], X_train['height'], X_train['color_score'], c = y_train, marker = 'o', s=100)\n",
116 |     "ax.set_xlabel('width')\n",
117 |     "ax.set_ylabel('height')\n",
118 |     "ax.set_zlabel('color_score')\n",
119 |     "plt.show()"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "### Create train-test split"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": null,
132 |    "metadata": {
133 |     "collapsed": true
134 |    },
135 |    "outputs": [],
136 |    "source": [
137 |     "# For this example, we use the mass, width, and height features of each fruit instance\n",
138 |     "X = fruits[['mass', 'width', 'height']]\n",
139 |     "y = fruits['fruit_label']\n",
140 |     "\n",
141 |     "# default is 75% / 25% train-test split\n",
142 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "### Create classifier object"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": true
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "from sklearn.neighbors import KNeighborsClassifier\n",
161 |     "\n",
162 |     "knn = KNeighborsClassifier(n_neighbors = 5)"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "### Train the classifier (fit the estimator) using the training data"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {
176 |     "collapsed": false
177 |    },
178 |    "outputs": [],
179 |    "source": [
180 |     "knn.fit(X_train, y_train)"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "### Estimate the accuracy of the classifier on future data, using the test data"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {
194 |     "collapsed": false
195 |    },
196 |    "outputs": [],
197 |    "source": [
198 |     "knn.score(X_test, y_test)"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "### Use the trained k-NN classifier model to classify new, previously unseen objects"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {
212 |     "collapsed": false
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "# first example: a small fruit with mass 20g, width 4.3 cm, height 5.5 cm\n",
217 |     "fruit_prediction = knn.predict([[20, 4.3, 5.5]])\n",
218 |     "lookup_fruit_name[fruit_prediction[0]]"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "metadata": {
225 |     "collapsed": false
226 |    },
227 |    "outputs": [],
228 |    "source": [
229 |     "# second example: a larger, elongated fruit with mass 100g, width 6.3 cm, height 8.5 cm\n",
230 |     "fruit_prediction = knn.predict([[100, 6.3, 8.5]])\n",
231 |     "lookup_fruit_name[fruit_prediction[0]]"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "### Plot the decision boundaries of the k-NN classifier"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {
245 |     "collapsed": false
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "from adspy_shared_utilities import plot_fruit_knn\n",
250 |     "\n",
251 |     "plot_fruit_knn(X_train, y_train, 5, 'uniform')   # we choose 5 nearest neighbors"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "markdown",
256 |    "metadata": {},
257 |    "source": [
258 |     "### How sensitive is k-NN classification accuracy to the choice of the 'k' parameter?"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {
265 |     "collapsed": false
266 |    },
267 |    "outputs": [],
268 |    "source": [
269 |     "k_range = range(1,20)\n",
270 |     "scores = []\n",
271 |     "\n",
272 |     "for k in k_range:\n",
273 |     "    knn = KNeighborsClassifier(n_neighbors = k)\n",
274 |     "    knn.fit(X_train, y_train)\n",
275 |     "    scores.append(knn.score(X_test, y_test))\n",
276 |     "\n",
277 |     "plt.figure()\n",
278 |     "plt.xlabel('k')\n",
279 |     "plt.ylabel('accuracy')\n",
280 |     "plt.scatter(k_range, scores)\n",
281 |     "plt.xticks([0,5,10,15,20]);"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "### How sensitive is k-NN classification accuracy to the train/test split proportion?"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "metadata": {
295 |     "collapsed": false
296 |    },
297 |    "outputs": [],
298 |    "source": [
299 |     "t = [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]\n",
300 |     "\n",
301 |     "knn = KNeighborsClassifier(n_neighbors = 5)\n",
302 |     "\n",
303 |     "plt.figure()\n",
304 |     "\n",
305 |     "for s in t:\n",
306 |     "\n",
307 |     "    scores = []\n",
308 |     "    for i in range(1,1000):\n",
309 |     "        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-s)\n",
310 |     "        knn.fit(X_train, y_train)\n",
311 |     "        scores.append(knn.score(X_test, y_test))\n",
312 |     "    plt.plot(s, np.mean(scores), 'bo')\n",
313 |     "\n",
314 |     "plt.xlabel('Training set proportion')\n",
315 |     "plt.ylabel('accuracy');"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": null,
321 |    "metadata": {
322 |     "collapsed": true
323 |    },
324 |    "outputs": [],
325 |    "source": []
326 |   }
327 |  ],
328 |  "metadata": {
329 |   "anaconda-cloud": {},
330 |   "kernelspec": {
331 |    "display_name": "Python 3",
332 |    "language": "python",
333 |    "name": "python3"
334 |   },
335 |   "language_info": {
336 |    "codemirror_mode": {
337 |     "name": "ipython",
338 |     "version": 3
339 |    },
340 |    "file_extension": ".py",
341 |    "mimetype": "text/x-python",
342 |    "name": "python",
343 |    "nbconvert_exporter": "python",
344 |    "pygments_lexer": "ipython3",
345 |    "version": "3.5.2"
346 |   }
347 |  },
348 |  "nbformat": 4,
349 |  "nbformat_minor": 1
350 | }
351 | 


--------------------------------------------------------------------------------
/Week 4/Unsupervised+Learning.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Applied Machine Learning: Unsupervised Learning"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## Preamble and Datasets"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "%matplotlib notebook\n",
 26 |     "import numpy as np\n",
 27 |     "import pandas as pd\n",
 28 |     "import seaborn as sn\n",
 29 |     "import matplotlib.pyplot as plt\n",
 30 |     "from sklearn.datasets import load_breast_cancer\n",
 31 |     "\n",
 32 |     "# Breast cancer dataset\n",
 33 |     "cancer = load_breast_cancer()\n",
 34 |     "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n",
 35 |     "\n",
 36 |     "# Our sample fruits dataset\n",
 37 |     "fruits = pd.read_table('fruit_data_with_colors.txt')\n",
 38 |     "X_fruits = fruits[['mass','width','height', 'color_score']]\n",
 39 |     "y_fruits = fruits[['fruit_label']] - 1"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "## Dimensionality Reduction and Manifold Learning"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "### Principal Components Analysis (PCA)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "#### Using PCA to find the first two principal components of the breast cancer dataset"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {
 67 |     "collapsed": false
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "from sklearn.preprocessing import StandardScaler\n",
 72 |     "from sklearn.decomposition import PCA\n",
 73 |     "from sklearn.datasets import load_breast_cancer\n",
 74 |     "\n",
 75 |     "cancer = load_breast_cancer()\n",
 76 |     "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n",
 77 |     "\n",
 78 |     "# Before applying PCA, each feature should be centered (zero mean) and scaled to unit variance\n",
 79 |     "X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)  \n",
 80 |     "\n",
 81 |     "pca = PCA(n_components = 2).fit(X_normalized)\n",
 82 |     "\n",
 83 |     "X_pca = pca.transform(X_normalized)\n",
 84 |     "print(X_cancer.shape, X_pca.shape)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "#### Plotting the PCA-transformed version of the breast cancer dataset"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
103 |     "plot_labelled_scatter(X_pca, y_cancer, ['malignant', 'benign'])\n",
104 |     "\n",
105 |     "plt.xlabel('First principal component')\n",
106 |     "plt.ylabel('Second principal component')\n",
107 |     "plt.title('Breast Cancer Dataset PCA (n_components = 2)');"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "metadata": {},
113 |    "source": [
114 |     "#### Plotting the magnitude of each feature value for the first two principal components"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {
121 |     "collapsed": false
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "fig = plt.figure(figsize=(8, 4))\n",
126 |     "plt.imshow(pca.components_, interpolation = 'none', cmap = 'plasma')\n",
127 |     "feature_names = list(cancer.feature_names)\n",
128 |     "\n",
129 |     "plt.gca().set_xticks(np.arange(-.5, len(feature_names)));\n",
130 |     "plt.gca().set_yticks(np.arange(0.5, 2));\n",
131 |     "plt.gca().set_xticklabels(feature_names, rotation=90, ha='left', fontsize=12);\n",
132 |     "plt.gca().set_yticklabels(['First PC', 'Second PC'], va='bottom', fontsize=12);\n",
133 |     "\n",
134 |     "plt.colorbar(orientation='horizontal', ticks=[pca.components_.min(), 0, \n",
135 |     "                                              pca.components_.max()], pad=0.65);"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "markdown",
140 |    "metadata": {},
141 |    "source": [
142 |     "#### PCA on the fruit dataset (for comparison)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "metadata": {
149 |     "collapsed": false
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "from sklearn.preprocessing import StandardScaler\n",
154 |     "from sklearn.decomposition import PCA\n",
155 |     "\n",
156 |     "# each feature should be centered (zero mean) and scaled to unit variance\n",
157 |     "X_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)  \n",
158 |     "\n",
159 |     "pca = PCA(n_components = 2).fit(X_normalized)\n",
160 |     "X_pca = pca.transform(X_normalized)\n",
161 |     "\n",
162 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
163 |     "plot_labelled_scatter(X_pca, y_fruits, ['apple','mandarin','orange','lemon'])\n",
164 |     "\n",
165 |     "plt.xlabel('First principal component')\n",
166 |     "plt.ylabel('Second principal component')\n",
167 |     "plt.title('Fruits Dataset PCA (n_components = 2)');"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "### Manifold learning methods"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "#### Multidimensional scaling (MDS) on the fruit dataset"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {
188 |     "collapsed": false
189 |    },
190 |    "outputs": [],
191 |    "source": [
192 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
193 |     "from sklearn.preprocessing import StandardScaler\n",
194 |     "from sklearn.manifold import MDS\n",
195 |     "\n",
196 |     "# each feature should be centered (zero mean) and scaled to unit variance\n",
197 |     "X_fruits_normalized = StandardScaler().fit(X_fruits).transform(X_fruits)  \n",
198 |     "\n",
199 |     "mds = MDS(n_components = 2)\n",
200 |     "\n",
201 |     "X_fruits_mds = mds.fit_transform(X_fruits_normalized)\n",
202 |     "\n",
203 |     "plot_labelled_scatter(X_fruits_mds, y_fruits, ['apple', 'mandarin', 'orange', 'lemon'])\n",
204 |     "plt.xlabel('First MDS feature')\n",
205 |     "plt.ylabel('Second MDS feature')\n",
206 |     "plt.title('Fruit sample dataset MDS');"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "#### Multidimensional scaling (MDS) on the breast cancer dataset"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "(This example is not covered in the lecture video, but is included here so you can compare it to the results from PCA.)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {
227 |     "collapsed": false
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "from sklearn.preprocessing import StandardScaler\n",
232 |     "from sklearn.manifold import MDS\n",
233 |     "from sklearn.datasets import load_breast_cancer\n",
234 |     "\n",
235 |     "cancer = load_breast_cancer()\n",
236 |     "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n",
237 |     "\n",
238 |     "# each feature should be centered (zero mean) and scaled to unit variance\n",
239 |     "X_normalized = StandardScaler().fit(X_cancer).transform(X_cancer)  \n",
240 |     "\n",
241 |     "mds = MDS(n_components = 2)\n",
242 |     "\n",
243 |     "X_mds = mds.fit_transform(X_normalized)\n",
244 |     "\n",
245 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
246 |     "plot_labelled_scatter(X_mds, y_cancer, ['malignant', 'benign'])\n",
247 |     "\n",
248 |     "plt.xlabel('First MDS dimension')\n",
249 |     "plt.ylabel('Second MDS dimension')\n",
250 |     "plt.title('Breast Cancer Dataset MDS (n_components = 2)');"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "markdown",
255 |    "metadata": {
256 |     "collapsed": true
257 |    },
258 |    "source": [
259 |     "#### t-SNE on the fruit dataset"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "(This example from the lecture video is included so that you can see how some dimensionality reduction methods may be less successful on some datasets. Here, it doesn't work as well at finding structure in the small fruits dataset, compared to other methods like MDS.)"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {
273 |     "collapsed": false
274 |    },
275 |    "outputs": [],
276 |    "source": [
277 |     "from sklearn.manifold import TSNE\n",
278 |     "\n",
279 |     "tsne = TSNE(random_state = 0)\n",
280 |     "\n",
281 |     "X_tsne = tsne.fit_transform(X_fruits_normalized)\n",
282 |     "\n",
283 |     "plot_labelled_scatter(X_tsne, y_fruits, \n",
284 |     "    ['apple', 'mandarin', 'orange', 'lemon'])\n",
285 |     "plt.xlabel('First t-SNE feature')\n",
286 |     "plt.ylabel('Second t-SNE feature')\n",
287 |     "plt.title('Fruits dataset t-SNE');"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "markdown",
292 |    "metadata": {},
293 |    "source": [
294 |     "#### t-SNE on the breast cancer dataset"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "markdown",
299 |    "metadata": {},
300 |    "source": [
301 |     "Although not shown in the lecture video, this example is included for comparison, showing the results of running t-SNE on the breast cancer dataset.  See the reading \"How to Use t-SNE Effectively\" for further details on how the visualizations from t-SNE are affected by specific parameter settings."
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": null,
307 |    "metadata": {
308 |     "collapsed": false
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "tsne = TSNE(random_state = 0)\n",
313 |     "\n",
314 |     "X_tsne = tsne.fit_transform(X_normalized)\n",
315 |     "\n",
316 |     "plot_labelled_scatter(X_tsne, y_cancer, \n",
317 |     "    ['malignant', 'benign'])\n",
318 |     "plt.xlabel('First t-SNE feature')\n",
319 |     "plt.ylabel('Second t-SNE feature')\n",
320 |     "plt.title('Breast cancer dataset t-SNE');"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "markdown",
325 |    "metadata": {},
326 |    "source": [
327 |     "## Clustering"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "markdown",
332 |    "metadata": {
333 |     "collapsed": true
334 |    },
335 |    "source": [
336 |     "### K-means"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "markdown",
341 |    "metadata": {},
342 |    "source": [
343 |     "This example from the lecture video creates an artificial dataset with make_blobs, then applies k-means to find 3 clusters, and plots the points in each cluster identified by a corresponding color."
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "metadata": {
350 |     "collapsed": false
351 |    },
352 |    "outputs": [],
353 |    "source": [
354 |     "from sklearn.datasets import make_blobs\n",
355 |     "from sklearn.cluster import KMeans\n",
356 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
357 |     "\n",
358 |     "X, y = make_blobs(random_state = 10)\n",
359 |     "\n",
360 |     "kmeans = KMeans(n_clusters = 3)\n",
361 |     "kmeans.fit(X)\n",
362 |     "\n",
363 |     "plot_labelled_scatter(X, kmeans.labels_, ['Cluster 1', 'Cluster 2', 'Cluster 3'])\n"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "Example showing k-means used to find 4 clusters in the fruits dataset.  Note that in general, it's important to scale the individual features before applying k-means clustering."
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {
377 |     "collapsed": false
378 |    },
379 |    "outputs": [],
380 |    "source": [
381 |     "from sklearn.datasets import make_blobs\n",
382 |     "from sklearn.cluster import KMeans\n",
383 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
384 |     "from sklearn.preprocessing import MinMaxScaler\n",
385 |     "\n",
386 |     "fruits = pd.read_table('fruit_data_with_colors.txt')\n",
387 |     "X_fruits = fruits[['mass','width','height', 'color_score']].values\n",
388 |     "y_fruits = fruits[['fruit_label']] - 1\n",
389 |     "\n",
390 |     "X_fruits_normalized = MinMaxScaler().fit(X_fruits).transform(X_fruits)  \n",
391 |     "\n",
392 |     "kmeans = KMeans(n_clusters = 4, random_state = 0)\n",
393 |     "kmeans.fit(X_fruits_normalized)\n",
394 |     "\n",
395 |     "plot_labelled_scatter(X_fruits_normalized, kmeans.labels_, \n",
396 |     "                      ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4'])"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "markdown",
401 |    "metadata": {},
402 |    "source": [
403 |     "### Agglomerative clustering"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": null,
409 |    "metadata": {
410 |     "collapsed": false,
411 |     "scrolled": false
412 |    },
413 |    "outputs": [],
414 |    "source": [
415 |     "from sklearn.datasets import make_blobs\n",
416 |     "from sklearn.cluster import AgglomerativeClustering\n",
417 |     "from adspy_shared_utilities import plot_labelled_scatter\n",
418 |     "\n",
419 |     "X, y = make_blobs(random_state = 10)\n",
420 |     "\n",
421 |     "cls = AgglomerativeClustering(n_clusters = 3)\n",
422 |     "cls_assignment = cls.fit_predict(X)\n",
423 |     "\n",
424 |     "plot_labelled_scatter(X, cls_assignment, \n",
425 |     "        ['Cluster 1', 'Cluster 2', 'Cluster 3'])"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "markdown",
430 |    "metadata": {},
431 |    "source": [
432 |     "#### Creating a dendrogram (using scipy)"
433 |    ]
434 |   },
435 |   {
436 |    "cell_type": "markdown",
437 |    "metadata": {},
438 |    "source": [
439 |     "This dendrogram plot is based on the dataset created in the previous step with make_blobs, but for clarity, only 10 samples have been selected for this example, as plotted here:"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": null,
445 |    "metadata": {
446 |     "collapsed": false
447 |    },
448 |    "outputs": [],
449 |    "source": [
450 |     "X, y = make_blobs(random_state = 10, n_samples = 10)\n",
451 |     "plot_labelled_scatter(X, y, \n",
452 |     "        ['Cluster 1', 'Cluster 2', 'Cluster 3'])\n",
453 |     "print(X)"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "markdown",
458 |    "metadata": {},
459 |    "source": [
460 |     "And here's the dendrogram corresponding to agglomerative clustering of the 10 points above using Ward's method.  The index 0..9 of the points corresponds to the index of the points in the X array above.  For example, point 0 (5.69, -9.47) and point 9 (5.43, -9.76) are the closest two points and are clustered first."
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "code",
465 |    "execution_count": null,
466 |    "metadata": {
467 |     "collapsed": false
468 |    },
469 |    "outputs": [],
470 |    "source": [
471 |     "from scipy.cluster.hierarchy import ward, dendrogram\n",
472 |     "plt.figure()\n",
473 |     "dendrogram(ward(X))\n",
474 |     "plt.show()"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "markdown",
479 |    "metadata": {},
480 |    "source": [
481 |     "### DBSCAN clustering"
482 |    ]
483 |   },
484 |   {
485 |    "cell_type": "code",
486 |    "execution_count": null,
487 |    "metadata": {
488 |     "collapsed": false
489 |    },
490 |    "outputs": [],
491 |    "source": [
492 |     "from sklearn.cluster import DBSCAN\n",
493 |     "from sklearn.datasets import make_blobs\n",
494 |     "\n",
495 |     "X, y = make_blobs(random_state = 9, n_samples = 25)\n",
496 |     "\n",
497 |     "dbscan = DBSCAN(eps = 2, min_samples = 2)\n",
498 |     "\n",
499 |     "cls = dbscan.fit_predict(X)\n",
500 |     "print(\"Cluster membership values:\\n{}\".format(cls))\n",
501 |     "\n",
502 |     "plot_labelled_scatter(X, cls + 1, \n",
503 |     "        ['Noise', 'Cluster 0', 'Cluster 1', 'Cluster 2'])"
504 |    ]
505 |   }
506 |  ],
507 |  "metadata": {
508 |   "anaconda-cloud": {},
509 |   "kernelspec": {
510 |    "display_name": "Python 3",
511 |    "language": "python",
512 |    "name": "python3"
513 |   },
514 |   "language_info": {
515 |    "codemirror_mode": {
516 |     "name": "ipython",
517 |     "version": 3
518 |    },
519 |    "file_extension": ".py",
520 |    "mimetype": "text/x-python",
521 |    "name": "python",
522 |    "nbconvert_exporter": "python",
523 |    "pygments_lexer": "ipython3",
524 |    "version": "3.5.2"
525 |   }
526 |  },
527 |  "nbformat": 4,
528 |  "nbformat_minor": 1
529 | }
530 | 


--------------------------------------------------------------------------------
/Week 4/Module+4.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 44,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [
 10 |     {
 11 |      "name": "stdout",
 12 |      "output_type": "stream",
 13 |      "text": [
 14 |       "gzip: readonly is a directory -- ignored\r\n"
 15 |      ]
 16 |     }
 17 |    ],
 18 |    "source": [
 19 |     "!gzip readonly"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "---\n",
 27 |     "\n",
 28 |     "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
 29 |     "\n",
 30 |     "---"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "metadata": {
 36 |     "collapsed": true
 37 |    },
 38 |    "source": [
 39 |     "# Applied Machine Learning: Module 4 (Supervised Learning, Part II)"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "## Preamble and Datasets"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {
 53 |     "collapsed": false,
 54 |     "scrolled": false
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "%matplotlib notebook\n",
 59 |     "import numpy as np\n",
 60 |     "import pandas as pd\n",
 61 |     "import seaborn as sn\n",
 62 |     "import matplotlib.pyplot as plt\n",
 63 |     "\n",
 64 |     "from sklearn.model_selection import train_test_split\n",
 65 |     "from sklearn.datasets import make_classification, make_blobs\n",
 66 |     "from matplotlib.colors import ListedColormap\n",
 67 |     "from sklearn.datasets import load_breast_cancer\n",
 68 |     "from adspy_shared_utilities import load_crime_dataset\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])\n",
 72 |     "\n",
 73 |     "# fruits dataset\n",
 74 |     "fruits = pd.read_table('fruit_data_with_colors.txt')\n",
 75 |     "\n",
 76 |     "feature_names_fruits = ['height', 'width', 'mass', 'color_score']\n",
 77 |     "X_fruits = fruits[feature_names_fruits]\n",
 78 |     "y_fruits = fruits['fruit_label']\n",
 79 |     "target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']\n",
 80 |     "\n",
 81 |     "X_fruits_2d = fruits[['height', 'width']]\n",
 82 |     "y_fruits_2d = fruits['fruit_label']\n",
 83 |     "\n",
 84 |     "# synthetic dataset for simple regression\n",
 85 |     "from sklearn.datasets import make_regression\n",
 86 |     "plt.figure()\n",
 87 |     "plt.title('Sample regression problem with one input variable')\n",
 88 |     "X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,\n",
 89 |     "                            n_informative=1, bias = 150.0,\n",
 90 |     "                            noise = 30, random_state=0)\n",
 91 |     "plt.scatter(X_R1, y_R1, marker= 'o', s=50)\n",
 92 |     "plt.show()\n",
 93 |     "\n",
 94 |     "# synthetic dataset for more complex regression\n",
 95 |     "from sklearn.datasets import make_friedman1\n",
 96 |     "plt.figure()\n",
 97 |     "plt.title('Complex regression problem with one input variable')\n",
 98 |     "X_F1, y_F1 = make_friedman1(n_samples = 100, n_features = 7,\n",
 99 |     "                           random_state=0)\n",
100 |     "\n",
101 |     "plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)\n",
102 |     "plt.show()\n",
103 |     "\n",
104 |     "# synthetic dataset for classification (binary)\n",
105 |     "plt.figure()\n",
106 |     "plt.title('Sample binary classification problem with two informative features')\n",
107 |     "X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,\n",
108 |     "                                n_redundant=0, n_informative=2,\n",
109 |     "                                n_clusters_per_class=1, flip_y = 0.1,\n",
110 |     "                                class_sep = 0.5, random_state=0)\n",
111 |     "plt.scatter(X_C2[:, 0], X_C2[:, 1], marker= 'o',\n",
112 |     "           c=y_C2, s=50, cmap=cmap_bold)\n",
113 |     "plt.show()\n",
114 |     "\n",
115 |     "# more difficult synthetic dataset for classification (binary)\n",
116 |     "# with classes that are not linearly separable\n",
117 |     "X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2,\n",
118 |     "                       centers = 8, cluster_std = 1.3,\n",
119 |     "                       random_state = 4)\n",
120 |     "y_D2 = y_D2 % 2\n",
121 |     "plt.figure()\n",
122 |     "plt.title('Sample binary classification problem with non-linearly separable classes')\n",
123 |     "plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,\n",
124 |     "           marker= 'o', s=50, cmap=cmap_bold)\n",
125 |     "plt.show()\n",
126 |     "\n",
127 |     "# Breast cancer dataset for classification\n",
128 |     "cancer = load_breast_cancer()\n",
129 |     "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n",
130 |     "\n",
131 |     "# Communities and Crime dataset\n",
132 |     "(X_crime, y_crime) = load_crime_dataset()"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## Naive Bayes classifiers"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {
146 |     "collapsed": false
147 |    },
148 |    "outputs": [],
149 |    "source": [
150 |     "from sklearn.naive_bayes import GaussianNB\n",
151 |     "from adspy_shared_utilities import plot_class_regions_for_classifier\n",
152 |     "\n",
153 |     "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)\n",
154 |     "\n",
155 |     "nbclf = GaussianNB().fit(X_train, y_train)\n",
156 |     "plot_class_regions_for_classifier(nbclf, X_train, y_train, X_test, y_test,\n",
157 |     "                                 'Gaussian Naive Bayes classifier: Dataset 1')"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {
164 |     "collapsed": false
165 |    },
166 |    "outputs": [],
167 |    "source": [
168 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2,\n",
169 |     "                                                   random_state=0)\n",
170 |     "\n",
171 |     "nbclf = GaussianNB().fit(X_train, y_train)\n",
172 |     "plot_class_regions_for_classifier(nbclf, X_train, y_train, X_test, y_test,\n",
173 |     "                                 'Gaussian Naive Bayes classifier: Dataset 2')"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "### Application to a real-world dataset"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {
187 |     "collapsed": false
188 |    },
189 |    "outputs": [],
190 |    "source": [
191 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
192 |     "\n",
193 |     "nbclf = GaussianNB().fit(X_train, y_train)\n",
194 |     "print('Breast cancer dataset')\n",
195 |     "print('Accuracy of GaussianNB classifier on training set: {:.2f}'\n",
196 |     "     .format(nbclf.score(X_train, y_train)))\n",
197 |     "print('Accuracy of GaussianNB classifier on test set: {:.2f}'\n",
198 |     "     .format(nbclf.score(X_test, y_test)))"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "## Ensembles of Decision Trees"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "### Random forests"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": false,
220 |     "scrolled": false
221 |    },
222 |    "outputs": [],
223 |    "source": [
224 |     "from sklearn.ensemble import RandomForestClassifier\n",
225 |     "from sklearn.model_selection import train_test_split\n",
226 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
227 |     "\n",
228 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2,\n",
229 |     "                                                   random_state = 0)\n",
230 |     "fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))\n",
231 |     "\n",
232 |     "clf = RandomForestClassifier().fit(X_train, y_train)\n",
233 |     "title = 'Random Forest Classifier, complex binary dataset, default settings'\n",
234 |     "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test,\n",
235 |     "                                         y_test, title, subaxes)\n",
236 |     "\n",
237 |     "plt.show()"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "metadata": {},
243 |    "source": [
244 |     "### Random forest: Fruit dataset"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {
251 |     "collapsed": false,
252 |     "scrolled": false
253 |    },
254 |    "outputs": [],
255 |    "source": [
256 |     "from sklearn.ensemble import RandomForestClassifier\n",
257 |     "from sklearn.model_selection import train_test_split\n",
258 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
259 |     "\n",
260 |     "X_train, X_test, y_train, y_test = train_test_split(X_fruits.as_matrix(),\n",
261 |     "                                                   y_fruits.as_matrix(),\n",
262 |     "                                                   random_state = 0)\n",
263 |     "fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))\n",
264 |     "\n",
265 |     "title = 'Random Forest, fruits dataset, default settings'\n",
266 |     "pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]\n",
267 |     "\n",
268 |     "for pair, axis in zip(pair_list, subaxes):\n",
269 |     "    X = X_train[:, pair]\n",
270 |     "    y = y_train\n",
271 |     "    \n",
272 |     "    clf = RandomForestClassifier().fit(X, y)\n",
273 |     "    plot_class_regions_for_classifier_subplot(clf, X, y, None,\n",
274 |     "                                             None, title, axis,\n",
275 |     "                                             target_names_fruits)\n",
276 |     "    \n",
277 |     "    axis.set_xlabel(feature_names_fruits[pair[0]])\n",
278 |     "    axis.set_ylabel(feature_names_fruits[pair[1]])\n",
279 |     "    \n",
280 |     "plt.tight_layout()\n",
281 |     "plt.show()\n",
282 |     "\n",
283 |     "clf = RandomForestClassifier(n_estimators = 10,\n",
284 |     "                            random_state=0).fit(X_train, y_train)\n",
285 |     "\n",
286 |     "print('Random Forest, Fruit dataset, default settings')\n",
287 |     "print('Accuracy of RF classifier on training set: {:.2f}'\n",
288 |     "     .format(clf.score(X_train, y_train)))\n",
289 |     "print('Accuracy of RF classifier on test set: {:.2f}'\n",
290 |     "     .format(clf.score(X_test, y_test)))"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "metadata": {},
296 |    "source": [
297 |     "#### Random Forests on a real-world dataset"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": null,
303 |    "metadata": {
304 |     "collapsed": false
305 |    },
306 |    "outputs": [],
307 |    "source": [
308 |     "from sklearn.ensemble import RandomForestClassifier\n",
309 |     "\n",
310 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
311 |     "\n",
312 |     "clf = RandomForestClassifier(max_features = 8, random_state = 0)\n",
313 |     "clf.fit(X_train, y_train)\n",
314 |     "\n",
315 |     "print('Breast cancer dataset')\n",
316 |     "print('Accuracy of RF classifier on training set: {:.2f}'\n",
317 |     "     .format(clf.score(X_train, y_train)))\n",
318 |     "print('Accuracy of RF classifier on test set: {:.2f}'\n",
319 |     "     .format(clf.score(X_test, y_test)))"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "### Gradient-boosted decision trees"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {
333 |     "collapsed": false
334 |    },
335 |    "outputs": [],
336 |    "source": [
337 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
338 |     "from sklearn.model_selection import train_test_split\n",
339 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
340 |     "\n",
341 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
342 |     "fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))\n",
343 |     "\n",
344 |     "clf = GradientBoostingClassifier().fit(X_train, y_train)\n",
345 |     "title = 'GBDT, complex binary dataset, default settings'\n",
346 |     "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test,\n",
347 |     "                                         y_test, title, subaxes)\n",
348 |     "\n",
349 |     "plt.show()"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "markdown",
354 |    "metadata": {},
355 |    "source": [
356 |     "#### Gradient-boosted decision trees on the fruit dataset"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": null,
362 |    "metadata": {
363 |     "collapsed": false,
364 |     "scrolled": false
365 |    },
366 |    "outputs": [],
367 |    "source": [
368 |     "X_train, X_test, y_train, y_test = train_test_split(X_fruits.as_matrix(),\n",
369 |     "                                                   y_fruits.as_matrix(),\n",
370 |     "                                                   random_state = 0)\n",
371 |     "fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))\n",
372 |     "\n",
373 |     "pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]\n",
374 |     "\n",
375 |     "for pair, axis in zip(pair_list, subaxes):\n",
376 |     "    X = X_train[:, pair]\n",
377 |     "    y = y_train\n",
378 |     "    \n",
379 |     "    clf = GradientBoostingClassifier().fit(X, y)\n",
380 |     "    plot_class_regions_for_classifier_subplot(clf, X, y, None,\n",
381 |     "                                             None, title, axis,\n",
382 |     "                                             target_names_fruits)\n",
383 |     "    \n",
384 |     "    axis.set_xlabel(feature_names_fruits[pair[0]])\n",
385 |     "    axis.set_ylabel(feature_names_fruits[pair[1]])\n",
386 |     "    \n",
387 |     "plt.tight_layout()\n",
388 |     "plt.show()\n",
389 |     "clf = GradientBoostingClassifier().fit(X_train, y_train)\n",
390 |     "\n",
391 |     "print('GBDT, Fruit dataset, default settings')\n",
392 |     "print('Accuracy of GBDT classifier on training set: {:.2f}'\n",
393 |     "     .format(clf.score(X_train, y_train)))\n",
394 |     "print('Accuracy of GBDT classifier on test set: {:.2f}'\n",
395 |     "     .format(clf.score(X_test, y_test)))"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "markdown",
400 |    "metadata": {},
401 |    "source": [
402 |     "#### Gradient-boosted decision trees on a real-world dataset"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": null,
408 |    "metadata": {
409 |     "collapsed": false
410 |    },
411 |    "outputs": [],
412 |    "source": [
413 |     "from sklearn.ensemble import GradientBoostingClassifier\n",
414 |     "\n",
415 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
416 |     "\n",
417 |     "clf = GradientBoostingClassifier(random_state = 0)\n",
418 |     "clf.fit(X_train, y_train)\n",
419 |     "\n",
420 |     "print('Breast cancer dataset (learning_rate=0.1, max_depth=3)')\n",
421 |     "print('Accuracy of GBDT classifier on training set: {:.2f}'\n",
422 |     "     .format(clf.score(X_train, y_train)))\n",
423 |     "print('Accuracy of GBDT classifier on test set: {:.2f}\\n'\n",
424 |     "     .format(clf.score(X_test, y_test)))\n",
425 |     "\n",
426 |     "clf = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 2, random_state = 0)\n",
427 |     "clf.fit(X_train, y_train)\n",
428 |     "\n",
429 |     "print('Breast cancer dataset (learning_rate=0.01, max_depth=2)')\n",
430 |     "print('Accuracy of GBDT classifier on training set: {:.2f}'\n",
431 |     "     .format(clf.score(X_train, y_train)))\n",
432 |     "print('Accuracy of GBDT classifier on test set: {:.2f}'\n",
433 |     "     .format(clf.score(X_test, y_test)))"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "markdown",
438 |    "metadata": {},
439 |    "source": [
440 |     "## Neural networks"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "markdown",
445 |    "metadata": {},
446 |    "source": [
447 |     "#### Activation functions"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "metadata": {
454 |     "collapsed": false
455 |    },
456 |    "outputs": [],
457 |    "source": [
458 |     "xrange = np.linspace(-2, 2, 200)\n",
459 |     "\n",
460 |     "plt.figure(figsize=(7,6))\n",
461 |     "\n",
462 |     "plt.plot(xrange, np.maximum(xrange, 0), label = 'relu')\n",
463 |     "plt.plot(xrange, np.tanh(xrange), label = 'tanh')\n",
464 |     "plt.plot(xrange, 1 / (1 + np.exp(-xrange)), label = 'logistic')\n",
465 |     "plt.legend()\n",
466 |     "plt.title('Neural network activation functions')\n",
467 |     "plt.xlabel('Input value (x)')\n",
468 |     "plt.ylabel('Activation function output')\n",
469 |     "\n",
470 |     "plt.show()"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "markdown",
475 |    "metadata": {},
476 |    "source": [
477 |     "### Neural networks: Classification"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "markdown",
482 |    "metadata": {},
483 |    "source": [
484 |     "#### Synthetic dataset 1: single hidden layer"
485 |    ]
486 |   },
487 |   {
488 |    "cell_type": "code",
489 |    "execution_count": null,
490 |    "metadata": {
491 |     "collapsed": false,
492 |     "scrolled": false
493 |    },
494 |    "outputs": [],
495 |    "source": [
496 |     "from sklearn.neural_network import MLPClassifier\n",
497 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
498 |     "\n",
499 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n",
500 |     "\n",
501 |     "fig, subaxes = plt.subplots(3, 1, figsize=(6,18))\n",
502 |     "\n",
503 |     "for units, axis in zip([1, 10, 100], subaxes):\n",
504 |     "    nnclf = MLPClassifier(hidden_layer_sizes = [units], solver='lbfgs',\n",
505 |     "                         random_state = 0).fit(X_train, y_train)\n",
506 |     "    \n",
507 |     "    title = 'Dataset 1: Neural net classifier, 1 layer, {} units'.format(units)\n",
508 |     "    \n",
509 |     "    plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,\n",
510 |     "                                             X_test, y_test, title, axis)\n",
511 |     "    plt.tight_layout()"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "markdown",
516 |    "metadata": {},
517 |    "source": [
518 |     "#### Synthetic dataset 1: two hidden layers"
519 |    ]
520 |   },
521 |   {
522 |    "cell_type": "code",
523 |    "execution_count": null,
524 |    "metadata": {
525 |     "collapsed": false
526 |    },
527 |    "outputs": [],
528 |    "source": [
529 |     "from adspy_shared_utilities import plot_class_regions_for_classifier\n",
530 |     "\n",
531 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n",
532 |     "\n",
533 |     "nnclf = MLPClassifier(hidden_layer_sizes = [10, 10], solver='lbfgs',\n",
534 |     "                     random_state = 0).fit(X_train, y_train)\n",
535 |     "\n",
536 |     "plot_class_regions_for_classifier(nnclf, X_train, y_train, X_test, y_test,\n",
537 |     "                                 'Dataset 1: Neural net classifier, 2 layers, 10/10 units')"
538 |    ]
539 |   },
540 |   {
541 |    "cell_type": "markdown",
542 |    "metadata": {},
543 |    "source": [
544 |     "#### Regularization parameter: alpha"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {
551 |     "collapsed": false,
552 |     "scrolled": false
553 |    },
554 |    "outputs": [],
555 |    "source": [
556 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n",
557 |     "\n",
558 |     "fig, subaxes = plt.subplots(4, 1, figsize=(6, 23))\n",
559 |     "\n",
560 |     "for this_alpha, axis in zip([0.01, 0.1, 1.0, 5.0], subaxes):\n",
561 |     "    nnclf = MLPClassifier(solver='lbfgs', activation = 'tanh',\n",
562 |     "                         alpha = this_alpha,\n",
563 |     "                         hidden_layer_sizes = [100, 100],\n",
564 |     "                         random_state = 0).fit(X_train, y_train)\n",
565 |     "    \n",
566 |     "    title = 'Dataset 2: NN classifier, alpha = {:.3f} '.format(this_alpha)\n",
567 |     "    \n",
568 |     "    plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,\n",
569 |     "                                             X_test, y_test, title, axis)\n",
570 |     "    plt.tight_layout()\n",
571 |     "    "
572 |    ]
573 |   },
574 |   {
575 |    "cell_type": "markdown",
576 |    "metadata": {},
577 |    "source": [
578 |     "#### The effect of different choices of activation function"
579 |    ]
580 |   },
581 |   {
582 |    "cell_type": "code",
583 |    "execution_count": null,
584 |    "metadata": {
585 |     "collapsed": false,
586 |     "scrolled": false
587 |    },
588 |    "outputs": [],
589 |    "source": [
590 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)\n",
591 |     "\n",
592 |     "fig, subaxes = plt.subplots(3, 1, figsize=(6,18))\n",
593 |     "\n",
594 |     "for this_activation, axis in zip(['logistic', 'tanh', 'relu'], subaxes):\n",
595 |     "    nnclf = MLPClassifier(solver='lbfgs', activation = this_activation,\n",
596 |     "                         alpha = 0.1, hidden_layer_sizes = [10, 10],\n",
597 |     "                         random_state = 0).fit(X_train, y_train)\n",
598 |     "    \n",
599 |     "    title = 'Dataset 2: NN classifier, 2 layers 10/10, {} \\\n",
600 |     "activation function'.format(this_activation)\n",
601 |     "    \n",
602 |     "    plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train,\n",
603 |     "                                             X_test, y_test, title, axis)\n",
604 |     "    plt.tight_layout()"
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "markdown",
609 |    "metadata": {},
610 |    "source": [
611 |     "### Neural networks: Regression"
612 |    ]
613 |   },
614 |   {
615 |    "cell_type": "code",
616 |    "execution_count": null,
617 |    "metadata": {
618 |     "collapsed": false
619 |    },
620 |    "outputs": [],
621 |    "source": [
622 |     "from sklearn.neural_network import MLPRegressor\n",
623 |     "\n",
624 |     "fig, subaxes = plt.subplots(2, 3, figsize=(11,8), dpi=70)\n",
625 |     "\n",
626 |     "X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)\n",
627 |     "\n",
628 |     "X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)\n",
629 |     "\n",
630 |     "for thisaxisrow, thisactivation in zip(subaxes, ['tanh', 'relu']):\n",
631 |     "    for thisalpha, thisaxis in zip([0.0001, 1.0, 100], thisaxisrow):\n",
632 |     "        mlpreg = MLPRegressor(hidden_layer_sizes = [100,100],\n",
633 |     "                             activation = thisactivation,\n",
634 |     "                             alpha = thisalpha,\n",
635 |     "                             solver = 'lbfgs').fit(X_train, y_train)\n",
636 |     "        y_predict_output = mlpreg.predict(X_predict_input)\n",
637 |     "        thisaxis.set_xlim([-2.5, 0.75])\n",
638 |     "        thisaxis.plot(X_predict_input, y_predict_output,\n",
639 |     "                     '^', markersize = 10)\n",
640 |     "        thisaxis.plot(X_train, y_train, 'o')\n",
641 |     "        thisaxis.set_xlabel('Input feature')\n",
642 |     "        thisaxis.set_ylabel('Target value')\n",
643 |     "        thisaxis.set_title('MLP regression\\nalpha={}, activation={}'\n",
644 |     "                          .format(thisalpha, thisactivation))\n",
645 |     "        plt.tight_layout()"
646 |    ]
647 |   },
648 |   {
649 |    "cell_type": "markdown",
650 |    "metadata": {},
651 |    "source": [
652 |     "#### Application to a real-world dataset for classification"
653 |    ]
654 |   },
655 |   {
656 |    "cell_type": "code",
657 |    "execution_count": null,
658 |    "metadata": {
659 |     "collapsed": false
660 |    },
661 |    "outputs": [],
662 |    "source": [
663 |     "from sklearn.neural_network import MLPClassifier\n",
664 |     "from sklearn.preprocessing import MinMaxScaler\n",
665 |     "\n",
666 |     "\n",
667 |     "scaler = MinMaxScaler()\n",
668 |     "\n",
669 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
670 |     "X_train_scaled = scaler.fit_transform(X_train)\n",
671 |     "X_test_scaled = scaler.transform(X_test)\n",
672 |     "\n",
673 |     "clf = MLPClassifier(hidden_layer_sizes = [100, 100], alpha = 5.0,\n",
674 |     "                   random_state = 0, solver='lbfgs').fit(X_train_scaled, y_train)\n",
675 |     "\n",
676 |     "print('Breast cancer dataset')\n",
677 |     "print('Accuracy of NN classifier on training set: {:.2f}'\n",
678 |     "     .format(clf.score(X_train_scaled, y_train)))\n",
679 |     "print('Accuracy of NN classifier on test set: {:.2f}'\n",
680 |     "     .format(clf.score(X_test_scaled, y_test)))"
681 |    ]
682 |   }
683 |  ],
684 |  "metadata": {
685 |   "anaconda-cloud": {},
686 |   "kernelspec": {
687 |    "display_name": "Python 3",
688 |    "language": "python",
689 |    "name": "python3"
690 |   },
691 |   "language_info": {
692 |    "codemirror_mode": {
693 |     "name": "ipython",
694 |     "version": 3
695 |    },
696 |    "file_extension": ".py",
697 |    "mimetype": "text/x-python",
698 |    "name": "python",
699 |    "nbconvert_exporter": "python",
700 |    "pygments_lexer": "ipython3",
701 |    "version": "3.5.2"
702 |   }
703 |  },
704 |  "nbformat": 4,
705 |  "nbformat_minor": 2
706 | }
707 | 


--------------------------------------------------------------------------------
/Week 3/Module+3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "---\n",
  8 |     "\n",
  9 |     "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
 10 |     "\n",
 11 |     "---"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {
 17 |     "collapsed": true
 18 |    },
 19 |    "source": [
 20 |     "# Applied Machine Learning: Module 3 (Evaluation)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "## Evaluation for Classification"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "### Preamble"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {
 41 |     "collapsed": false
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "%matplotlib notebook\n",
 46 |     "import numpy as np\n",
 47 |     "import pandas as pd\n",
 48 |     "import seaborn as sns\n",
 49 |     "import matplotlib.pyplot as plt\n",
 50 |     "from sklearn.model_selection import train_test_split\n",
 51 |     "from sklearn.datasets import load_digits\n",
 52 |     "\n",
 53 |     "dataset = load_digits()\n",
 54 |     "X, y = dataset.data, dataset.target\n",
 55 |     "\n",
 56 |     "for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):\n",
 57 |     "    print(class_name,class_count)"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {
 64 |     "collapsed": false
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "# Creating a dataset with imbalanced binary classes:  \n",
 69 |     "# Negative class (0) is 'not digit 1' \n",
 70 |     "# Positive class (1) is 'digit 1'\n",
 71 |     "y_binary_imbalanced = y.copy()\n",
 72 |     "y_binary_imbalanced[y_binary_imbalanced != 1] = 0\n",
 73 |     "\n",
 74 |     "print('Original labels:\\t', y[1:30])\n",
 75 |     "print('New binary labels:\\t', y_binary_imbalanced[1:30])"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {
 82 |     "collapsed": false,
 83 |     "scrolled": true
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "np.bincount(y_binary_imbalanced)    # Negative class (0) is the most frequent class"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {
 94 |     "collapsed": false
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n",
 99 |     "\n",
100 |     "# Accuracy of Support Vector Machine classifier\n",
101 |     "from sklearn.svm import SVC\n",
102 |     "\n",
103 |     "svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)\n",
104 |     "svm.score(X_test, y_test)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "### Dummy Classifiers"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {
117 |     "collapsed": true
118 |    },
119 |    "source": [
120 |     "DummyClassifier is a classifier that makes predictions using simple rules, which can be useful as a baseline for comparison against actual classifiers, especially with imbalanced classes."
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {
127 |     "collapsed": false
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "from sklearn.dummy import DummyClassifier\n",
132 |     "\n",
133 |     "# Negative class (0) is most frequent\n",
134 |     "dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)\n",
135 |     "# Therefore the dummy 'most_frequent' classifier always predicts class 0\n",
136 |     "y_dummy_predictions = dummy_majority.predict(X_test)\n",
137 |     "\n",
138 |     "y_dummy_predictions"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {
145 |     "collapsed": false
146 |    },
147 |    "outputs": [],
148 |    "source": [
149 |     "dummy_majority.score(X_test, y_test)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": false
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "svm = SVC(kernel='linear', C=1).fit(X_train, y_train)\n",
161 |     "svm.score(X_test, y_test)"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "### Confusion matrices"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {},
174 |    "source": [
175 |     "#### Binary (two-class) confusion matrix"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {
182 |     "collapsed": false
183 |    },
184 |    "outputs": [],
185 |    "source": [
186 |     "from sklearn.metrics import confusion_matrix\n",
187 |     "\n",
188 |     "# Negative class (0) is most frequent\n",
189 |     "dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)\n",
190 |     "y_majority_predicted = dummy_majority.predict(X_test)\n",
191 |     "confusion = confusion_matrix(y_test, y_majority_predicted)\n",
192 |     "\n",
193 |     "print('Most frequent class (dummy classifier)\\n', confusion)"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {
200 |     "collapsed": false
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "# produces random predictions with the same class proportions as the training set\n",
205 |     "dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train)\n",
206 |     "y_classprop_predicted = dummy_classprop.predict(X_test)\n",
207 |     "confusion = confusion_matrix(y_test, y_classprop_predicted)\n",
208 |     "\n",
209 |     "print('Random class-proportional prediction (dummy classifier)\\n', confusion)"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {
216 |     "collapsed": false,
217 |     "scrolled": true
218 |    },
219 |    "outputs": [],
220 |    "source": [
221 |     "svm = SVC(kernel='linear', C=1).fit(X_train, y_train)\n",
222 |     "svm_predicted = svm.predict(X_test)\n",
223 |     "confusion = confusion_matrix(y_test, svm_predicted)\n",
224 |     "\n",
225 |     "print('Support vector machine classifier (linear kernel, C=1)\\n', confusion)"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {
232 |     "collapsed": false
233 |    },
234 |    "outputs": [],
235 |    "source": [
236 |     "from sklearn.linear_model import LogisticRegression\n",
237 |     "\n",
238 |     "lr = LogisticRegression().fit(X_train, y_train)\n",
239 |     "lr_predicted = lr.predict(X_test)\n",
240 |     "confusion = confusion_matrix(y_test, lr_predicted)\n",
241 |     "\n",
242 |     "print('Logistic regression classifier (default settings)\\n', confusion)"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {
249 |     "collapsed": false
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "from sklearn.tree import DecisionTreeClassifier\n",
254 |     "\n",
255 |     "dt = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)\n",
256 |     "tree_predicted = dt.predict(X_test)\n",
257 |     "confusion = confusion_matrix(y_test, tree_predicted)\n",
258 |     "\n",
259 |     "print('Decision tree classifier (max_depth = 2)\\n', confusion)"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "### Evaluation metrics for binary classification"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {
273 |     "collapsed": false
274 |    },
275 |    "outputs": [],
276 |    "source": [
277 |     "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
278 |     "# Accuracy = (TP + TN) / (TP + TN + FP + FN)\n",
279 |     "# Precision = TP / (TP + FP)\n",
280 |     "# Recall = TP / (TP + FN)  Also known as sensitivity, or True Positive Rate\n",
281 |     "# F1 = 2 * Precision * Recall / (Precision + Recall) \n",
282 |     "print('Accuracy: {:.2f}'.format(accuracy_score(y_test, tree_predicted)))\n",
283 |     "print('Precision: {:.2f}'.format(precision_score(y_test, tree_predicted)))\n",
284 |     "print('Recall: {:.2f}'.format(recall_score(y_test, tree_predicted)))\n",
285 |     "print('F1: {:.2f}'.format(f1_score(y_test, tree_predicted)))"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {
292 |     "collapsed": false
293 |    },
294 |    "outputs": [],
295 |    "source": [
296 |     "# Combined report with all above metrics\n",
297 |     "from sklearn.metrics import classification_report\n",
298 |     "\n",
299 |     "print(classification_report(y_test, tree_predicted, target_names=['not 1', '1']))"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": null,
305 |    "metadata": {
306 |     "collapsed": false,
307 |     "scrolled": false
308 |    },
309 |    "outputs": [],
310 |    "source": [
311 |     "print('Random class-proportional (dummy)\\n', \n",
312 |     "      classification_report(y_test, y_classprop_predicted, target_names=['not 1', '1']))\n",
313 |     "print('SVM\\n', \n",
314 |     "      classification_report(y_test, svm_predicted, target_names = ['not 1', '1']))\n",
315 |     "print('Logistic regression\\n', \n",
316 |     "      classification_report(y_test, lr_predicted, target_names = ['not 1', '1']))\n",
317 |     "print('Decision tree\\n', \n",
318 |     "      classification_report(y_test, tree_predicted, target_names = ['not 1', '1']))"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "### Decision functions"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": null,
331 |    "metadata": {
332 |     "collapsed": false
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n",
337 |     "y_scores_lr = lr.fit(X_train, y_train).decision_function(X_test)\n",
338 |     "y_score_list = list(zip(y_test[0:20], y_scores_lr[0:20]))\n",
339 |     "\n",
340 |     "# show the decision_function scores for first 20 instances\n",
341 |     "y_score_list"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": null,
347 |    "metadata": {
348 |     "collapsed": false
349 |    },
350 |    "outputs": [],
351 |    "source": [
352 |     "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n",
353 |     "y_proba_lr = lr.fit(X_train, y_train).predict_proba(X_test)\n",
354 |     "y_proba_list = list(zip(y_test[0:20], y_proba_lr[0:20,1]))\n",
355 |     "\n",
356 |     "# show the probability of positive class for first 20 instances\n",
357 |     "y_proba_list"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "markdown",
362 |    "metadata": {},
363 |    "source": [
364 |     "### Precision-recall curves"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": null,
370 |    "metadata": {
371 |     "collapsed": false
372 |    },
373 |    "outputs": [],
374 |    "source": [
375 |     "from sklearn.metrics import precision_recall_curve\n",
376 |     "\n",
377 |     "precision, recall, thresholds = precision_recall_curve(y_test, y_scores_lr)\n",
378 |     "closest_zero = np.argmin(np.abs(thresholds))\n",
379 |     "closest_zero_p = precision[closest_zero]\n",
380 |     "closest_zero_r = recall[closest_zero]\n",
381 |     "\n",
382 |     "plt.figure()\n",
383 |     "plt.xlim([0.0, 1.01])\n",
384 |     "plt.ylim([0.0, 1.01])\n",
385 |     "plt.plot(precision, recall, label='Precision-Recall Curve')\n",
386 |     "plt.plot(closest_zero_p, closest_zero_r, 'o', markersize = 12, fillstyle = 'none', c='r', mew=3)\n",
387 |     "plt.xlabel('Precision', fontsize=16)\n",
388 |     "plt.ylabel('Recall', fontsize=16)\n",
389 |     "plt.axes().set_aspect('equal')\n",
390 |     "plt.show()"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "markdown",
395 |    "metadata": {},
396 |    "source": [
397 |     "### ROC curves, Area-Under-Curve (AUC)"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {
404 |     "collapsed": false
405 |    },
406 |    "outputs": [],
407 |    "source": [
408 |     "from sklearn.metrics import roc_curve, auc\n",
409 |     "\n",
410 |     "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n",
411 |     "\n",
412 |     "y_score_lr = lr.fit(X_train, y_train).decision_function(X_test)\n",
413 |     "fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)\n",
414 |     "roc_auc_lr = auc(fpr_lr, tpr_lr)\n",
415 |     "\n",
416 |     "plt.figure()\n",
417 |     "plt.xlim([-0.01, 1.00])\n",
418 |     "plt.ylim([-0.01, 1.01])\n",
419 |     "plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))\n",
420 |     "plt.xlabel('False Positive Rate', fontsize=16)\n",
421 |     "plt.ylabel('True Positive Rate', fontsize=16)\n",
422 |     "plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16)\n",
423 |     "plt.legend(loc='lower right', fontsize=13)\n",
424 |     "plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')\n",
425 |     "plt.axes().set_aspect('equal')\n",
426 |     "plt.show()"
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "code",
431 |    "execution_count": null,
432 |    "metadata": {
433 |     "collapsed": false,
434 |     "scrolled": false
435 |    },
436 |    "outputs": [],
437 |    "source": [
438 |     "from matplotlib import cm\n",
439 |     "\n",
440 |     "X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)\n",
441 |     "\n",
442 |     "plt.figure()\n",
443 |     "plt.xlim([-0.01, 1.00])\n",
444 |     "plt.ylim([-0.01, 1.01])\n",
445 |     "for g in [0.01, 0.1, 0.20, 1]:\n",
446 |     "    svm = SVC(gamma=g).fit(X_train, y_train)\n",
447 |     "    y_score_svm = svm.decision_function(X_test)\n",
448 |     "    fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)\n",
449 |     "    roc_auc_svm = auc(fpr_svm, tpr_svm)\n",
450 |     "    accuracy_svm = svm.score(X_test, y_test)\n",
451 |     "    print(\"gamma = {:.2f}  accuracy = {:.2f}   AUC = {:.2f}\".format(g, accuracy_svm, \n",
452 |     "                                                                    roc_auc_svm))\n",
453 |     "    plt.plot(fpr_svm, tpr_svm, lw=3, alpha=0.7, \n",
454 |     "             label='SVM (gamma = {:0.2f}, area = {:0.2f})'.format(g, roc_auc_svm))\n",
455 |     "\n",
456 |     "plt.xlabel('False Positive Rate', fontsize=16)\n",
457 |     "plt.ylabel('True Positive Rate (Recall)', fontsize=16)\n",
458 |     "plt.plot([0, 1], [0, 1], color='k', lw=0.5, linestyle='--')\n",
459 |     "plt.legend(loc=\"lower right\", fontsize=11)\n",
460 |     "plt.title('ROC curve: (1-of-10 digits classifier)', fontsize=16)\n",
461 |     "plt.axes().set_aspect('equal')\n",
462 |     "\n",
463 |     "plt.show()"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "markdown",
468 |    "metadata": {},
469 |    "source": [
470 |     "### Evaluation measures for multi-class classification"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "markdown",
475 |    "metadata": {},
476 |    "source": [
477 |     "#### Multi-class confusion matrix"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": null,
483 |    "metadata": {
484 |     "collapsed": false,
485 |     "scrolled": false
486 |    },
487 |    "outputs": [],
488 |    "source": [
489 |     "dataset = load_digits()\n",
490 |     "X, y = dataset.data, dataset.target\n",
491 |     "X_train_mc, X_test_mc, y_train_mc, y_test_mc = train_test_split(X, y, random_state=0)\n",
492 |     "\n",
493 |     "\n",
494 |     "svm = SVC(kernel = 'linear').fit(X_train_mc, y_train_mc)\n",
495 |     "svm_predicted_mc = svm.predict(X_test_mc)\n",
496 |     "confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc)\n",
497 |     "df_cm = pd.DataFrame(confusion_mc, \n",
498 |     "                     index = [i for i in range(0,10)], columns = [i for i in range(0,10)])\n",
499 |     "\n",
500 |     "plt.figure(figsize=(5.5,4))\n",
501 |     "sns.heatmap(df_cm, annot=True)\n",
502 |     "plt.title('SVM Linear Kernel \\nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, \n",
503 |     "                                                                       svm_predicted_mc)))\n",
504 |     "plt.ylabel('True label')\n",
505 |     "plt.xlabel('Predicted label')\n",
506 |     "\n",
507 |     "\n",
508 |     "svm = SVC(kernel = 'rbf').fit(X_train_mc, y_train_mc)\n",
509 |     "svm_predicted_mc = svm.predict(X_test_mc)\n",
510 |     "confusion_mc = confusion_matrix(y_test_mc, svm_predicted_mc)\n",
511 |     "df_cm = pd.DataFrame(confusion_mc, index = [i for i in range(0,10)],\n",
512 |     "                  columns = [i for i in range(0,10)])\n",
513 |     "\n",
514 |     "plt.figure(figsize = (5.5,4))\n",
515 |     "sns.heatmap(df_cm, annot=True)\n",
516 |     "plt.title('SVM RBF Kernel \\nAccuracy:{0:.3f}'.format(accuracy_score(y_test_mc, \n",
517 |     "                                                                    svm_predicted_mc)))\n",
518 |     "plt.ylabel('True label')\n",
519 |     "plt.xlabel('Predicted label');"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "markdown",
524 |    "metadata": {},
525 |    "source": [
526 |     "#### Multi-class classification report"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "code",
531 |    "execution_count": null,
532 |    "metadata": {
533 |     "collapsed": false
534 |    },
535 |    "outputs": [],
536 |    "source": [
537 |     "print(classification_report(y_test_mc, svm_predicted_mc))"
538 |    ]
539 |   },
540 |   {
541 |    "cell_type": "markdown",
542 |    "metadata": {},
543 |    "source": [
544 |     "#### Micro- vs. macro-averaged metrics"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {
551 |     "collapsed": false
552 |    },
553 |    "outputs": [],
554 |    "source": [
555 |     "print('Micro-averaged precision = {:.2f} (treat instances equally)'\n",
556 |     "      .format(precision_score(y_test_mc, svm_predicted_mc, average = 'micro')))\n",
557 |     "print('Macro-averaged precision = {:.2f} (treat classes equally)'\n",
558 |     "      .format(precision_score(y_test_mc, svm_predicted_mc, average = 'macro')))"
559 |    ]
560 |   },
561 |   {
562 |    "cell_type": "code",
563 |    "execution_count": null,
564 |    "metadata": {
565 |     "collapsed": false
566 |    },
567 |    "outputs": [],
568 |    "source": [
569 |     "print('Micro-averaged f1 = {:.2f} (treat instances equally)'\n",
570 |     "      .format(f1_score(y_test_mc, svm_predicted_mc, average = 'micro')))\n",
571 |     "print('Macro-averaged f1 = {:.2f} (treat classes equally)'\n",
572 |     "      .format(f1_score(y_test_mc, svm_predicted_mc, average = 'macro')))"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "markdown",
577 |    "metadata": {},
578 |    "source": [
579 |     "### Regression evaluation metrics"
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "code",
584 |    "execution_count": null,
585 |    "metadata": {
586 |     "collapsed": false
587 |    },
588 |    "outputs": [],
589 |    "source": [
590 |     "%matplotlib notebook\n",
591 |     "import matplotlib.pyplot as plt\n",
592 |     "import numpy as np\n",
593 |     "from sklearn.model_selection import train_test_split\n",
594 |     "from sklearn import datasets\n",
595 |     "from sklearn.linear_model import LinearRegression\n",
596 |     "from sklearn.metrics import mean_squared_error, r2_score\n",
597 |     "from sklearn.dummy import DummyRegressor\n",
598 |     "\n",
599 |     "diabetes = datasets.load_diabetes()\n",
600 |     "\n",
601 |     "X = diabetes.data[:, None, 6]\n",
602 |     "y = diabetes.target\n",
603 |     "\n",
604 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
605 |     "\n",
606 |     "lm = LinearRegression().fit(X_train, y_train)\n",
607 |     "lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)\n",
608 |     "\n",
609 |     "y_predict = lm.predict(X_test)\n",
610 |     "y_predict_dummy_mean = lm_dummy_mean.predict(X_test)\n",
611 |     "\n",
612 |     "print('Linear model, coefficients: ', lm.coef_)\n",
613 |     "print(\"Mean squared error (dummy): {:.2f}\".format(mean_squared_error(y_test, \n",
614 |     "                                                                     y_predict_dummy_mean)))\n",
615 |     "print(\"Mean squared error (linear model): {:.2f}\".format(mean_squared_error(y_test, y_predict)))\n",
616 |     "print(\"r2_score (dummy): {:.2f}\".format(r2_score(y_test, y_predict_dummy_mean)))\n",
617 |     "print(\"r2_score (linear model): {:.2f}\".format(r2_score(y_test, y_predict)))\n",
618 |     "\n",
619 |     "# Plot outputs\n",
620 |     "plt.scatter(X_test, y_test,  color='black')\n",
621 |     "plt.plot(X_test, y_predict, color='green', linewidth=2)\n",
622 |     "plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle = 'dashed', \n",
623 |     "         linewidth=2, label = 'dummy')\n",
624 |     "\n",
625 |     "plt.show()"
626 |    ]
627 |   },
628 |   {
629 |    "cell_type": "markdown",
630 |    "metadata": {},
631 |    "source": [
632 |     "### Model selection using evaluation metrics"
633 |    ]
634 |   },
635 |   {
636 |    "cell_type": "markdown",
637 |    "metadata": {},
638 |    "source": [
639 |     "#### Cross-validation example"
640 |    ]
641 |   },
642 |   {
643 |    "cell_type": "code",
644 |    "execution_count": null,
645 |    "metadata": {
646 |     "collapsed": false
647 |    },
648 |    "outputs": [],
649 |    "source": [
650 |     "from sklearn.model_selection import cross_val_score\n",
651 |     "from sklearn.svm import SVC\n",
652 |     "\n",
653 |     "dataset = load_digits()\n",
654 |     "# again, making this a binary problem with 'digit 1' as positive class \n",
655 |     "# and 'not 1' as negative class\n",
656 |     "X, y = dataset.data, dataset.target == 1\n",
657 |     "clf = SVC(kernel='linear', C=1)\n",
658 |     "\n",
659 |     "# accuracy is the default scoring metric\n",
660 |     "print('Cross-validation (accuracy)', cross_val_score(clf, X, y, cv=5))\n",
661 |     "# use AUC as scoring metric\n",
662 |     "print('Cross-validation (AUC)', cross_val_score(clf, X, y, cv=5, scoring = 'roc_auc'))\n",
663 |     "# use recall as scoring metric\n",
664 |     "print('Cross-validation (recall)', cross_val_score(clf, X, y, cv=5, scoring = 'recall'))"
665 |    ]
666 |   },
667 |   {
668 |    "cell_type": "markdown",
669 |    "metadata": {},
670 |    "source": [
671 |     "#### Grid search example"
672 |    ]
673 |   },
674 |   {
675 |    "cell_type": "code",
676 |    "execution_count": null,
677 |    "metadata": {
678 |     "collapsed": false
679 |    },
680 |    "outputs": [],
681 |    "source": [
682 |     "from sklearn.svm import SVC\n",
683 |     "from sklearn.model_selection import GridSearchCV\n",
684 |     "from sklearn.metrics import roc_auc_score\n",
685 |     "\n",
686 |     "dataset = load_digits()\n",
687 |     "X, y = dataset.data, dataset.target == 1\n",
688 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
689 |     "\n",
690 |     "clf = SVC(kernel='rbf')\n",
691 |     "grid_values = {'gamma': [0.001, 0.01, 0.05, 0.1, 1, 10, 100]}\n",
692 |     "\n",
693 |     "# default metric to optimize over grid parameters: accuracy\n",
694 |     "grid_clf_acc = GridSearchCV(clf, param_grid = grid_values)\n",
695 |     "grid_clf_acc.fit(X_train, y_train)\n",
696 |     "y_decision_fn_scores_acc = grid_clf_acc.decision_function(X_test) \n",
697 |     "\n",
698 |     "print('Grid best parameter (max. accuracy): ', grid_clf_acc.best_params_)\n",
699 |     "print('Grid best score (accuracy): ', grid_clf_acc.best_score_)\n",
700 |     "\n",
701 |     "# alternative metric to optimize over grid parameters: AUC\n",
702 |     "grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')\n",
703 |     "grid_clf_auc.fit(X_train, y_train)\n",
704 |     "y_decision_fn_scores_auc = grid_clf_auc.decision_function(X_test) \n",
705 |     "\n",
706 |     "print('Test set AUC: ', roc_auc_score(y_test, y_decision_fn_scores_auc))\n",
707 |     "print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)\n",
708 |     "print('Grid best score (AUC): ', grid_clf_auc.best_score_)\n"
709 |    ]
710 |   },
711 |   {
712 |    "cell_type": "markdown",
713 |    "metadata": {},
714 |    "source": [
715 |     "#### Evaluation metrics supported for model selection"
716 |    ]
717 |   },
718 |   {
719 |    "cell_type": "code",
720 |    "execution_count": null,
721 |    "metadata": {
722 |     "collapsed": false
723 |    },
724 |    "outputs": [],
725 |    "source": [
726 |     "from sklearn.metrics.scorer import SCORERS\n",
727 |     "\n",
728 |     "print(sorted(list(SCORERS.keys())))"
729 |    ]
730 |   },
731 |   {
732 |    "cell_type": "markdown",
733 |    "metadata": {},
734 |    "source": [
735 |     "### Two-feature classification example using the digits dataset"
736 |    ]
737 |   },
738 |   {
739 |    "cell_type": "markdown",
740 |    "metadata": {},
741 |    "source": [
742 |     "#### Optimizing a classifier using different evaluation metrics"
743 |    ]
744 |   },
745 |   {
746 |    "cell_type": "code",
747 |    "execution_count": null,
748 |    "metadata": {
749 |     "collapsed": false,
750 |     "scrolled": false
751 |    },
752 |    "outputs": [],
753 |    "source": [
754 |     "from sklearn.datasets import load_digits\n",
755 |     "from sklearn.model_selection import train_test_split\n",
756 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
757 |     "from sklearn.svm import SVC\n",
758 |     "from sklearn.model_selection import GridSearchCV\n",
759 |     "\n",
760 |     "\n",
761 |     "dataset = load_digits()\n",
762 |     "X, y = dataset.data, dataset.target == 1\n",
763 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
764 |     "\n",
765 |     "# Create a two-feature input vector matching the example plot above\n",
766 |     "# We jitter the points (add a small amount of random noise) in case there are areas\n",
767 |     "# in feature space where many instances have the same features.\n",
768 |     "jitter_delta = 0.25\n",
769 |     "X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta\n",
770 |     "X_twovar_test  = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta\n",
771 |     "\n",
772 |     "clf = SVC(kernel = 'linear').fit(X_twovar_train, y_train)\n",
773 |     "grid_values = {'class_weight':['balanced', {1:2},{1:3},{1:4},{1:5},{1:10},{1:20},{1:50}]}\n",
774 |     "plt.figure(figsize=(9,6))\n",
775 |     "for i, eval_metric in enumerate(('precision','recall', 'f1','roc_auc')):\n",
776 |     "    grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric)\n",
777 |     "    grid_clf_custom.fit(X_twovar_train, y_train)\n",
778 |     "    print('Grid best parameter (max. {0}): {1}'\n",
779 |     "          .format(eval_metric, grid_clf_custom.best_params_))\n",
780 |     "    print('Grid best score ({0}): {1}'\n",
781 |     "          .format(eval_metric, grid_clf_custom.best_score_))\n",
782 |     "    plt.subplots_adjust(wspace=0.3, hspace=0.3)\n",
783 |     "    plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None,\n",
784 |     "                                             None, None,  plt.subplot(2, 2, i+1))\n",
785 |     "    \n",
786 |     "    plt.title(eval_metric+'-oriented SVC')\n",
787 |     "plt.tight_layout()\n",
788 |     "plt.show()"
789 |    ]
790 |   },
791 |   {
792 |    "cell_type": "markdown",
793 |    "metadata": {},
794 |    "source": [
795 |     "#### Precision-recall curve for the default SVC classifier (with balanced class weights)"
796 |    ]
797 |   },
798 |   {
799 |    "cell_type": "code",
800 |    "execution_count": null,
801 |    "metadata": {
802 |     "collapsed": false,
803 |     "scrolled": false
804 |    },
805 |    "outputs": [],
806 |    "source": [
807 |     "from sklearn.model_selection import train_test_split\n",
808 |     "from sklearn.metrics import precision_recall_curve\n",
809 |     "from adspy_shared_utilities import plot_class_regions_for_classifier\n",
810 |     "from sklearn.svm import SVC\n",
811 |     "\n",
812 |     "dataset = load_digits()\n",
813 |     "X, y = dataset.data, dataset.target == 1\n",
814 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
815 |     "\n",
816 |     "# create a two-feature input vector matching the example plot above\n",
817 |     "jitter_delta = 0.25\n",
818 |     "X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta\n",
819 |     "X_twovar_test  = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta\n",
820 |     "\n",
821 |     "clf = SVC(kernel='linear', class_weight='balanced').fit(X_twovar_train, y_train)\n",
822 |     "\n",
823 |     "y_scores = clf.decision_function(X_twovar_test)\n",
824 |     "\n",
825 |     "precision, recall, thresholds = precision_recall_curve(y_test, y_scores)\n",
826 |     "closest_zero = np.argmin(np.abs(thresholds))\n",
827 |     "closest_zero_p = precision[closest_zero]\n",
828 |     "closest_zero_r = recall[closest_zero]\n",
829 |     "\n",
830 |     "plot_class_regions_for_classifier(clf, X_twovar_test, y_test)\n",
831 |     "plt.title(\"SVC, class_weight = 'balanced', optimized for accuracy\")\n",
832 |     "plt.show()\n",
833 |     "\n",
834 |     "plt.figure()\n",
835 |     "plt.xlim([0.0, 1.01])\n",
836 |     "plt.ylim([0.0, 1.01])\n",
837 |     "plt.title (\"Precision-recall curve: SVC, class_weight = 'balanced'\")\n",
838 |     "plt.plot(precision, recall, label = 'Precision-Recall Curve')\n",
839 |     "plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3)\n",
840 |     "plt.xlabel('Precision', fontsize=16)\n",
841 |     "plt.ylabel('Recall', fontsize=16)\n",
842 |     "plt.axes().set_aspect('equal')\n",
843 |     "plt.show()\n",
844 |     "print('At zero threshold, precision: {:.2f}, recall: {:.2f}'\n",
845 |     "      .format(closest_zero_p, closest_zero_r))"
846 |    ]
847 |   },
848 |   {
849 |    "cell_type": "code",
850 |    "execution_count": null,
851 |    "metadata": {
852 |     "collapsed": true
853 |    },
854 |    "outputs": [],
855 |    "source": []
856 |   }
857 |  ],
858 |  "metadata": {
859 |   "anaconda-cloud": {},
860 |   "kernelspec": {
861 |    "display_name": "Python 3",
862 |    "language": "python",
863 |    "name": "python3"
864 |   },
865 |   "language_info": {
866 |    "codemirror_mode": {
867 |     "name": "ipython",
868 |     "version": 3
869 |    },
870 |    "file_extension": ".py",
871 |    "mimetype": "text/x-python",
872 |    "name": "python",
873 |    "nbconvert_exporter": "python",
874 |    "pygments_lexer": "ipython3",
875 |    "version": "3.5.2"
876 |   }
877 |  },
878 |  "nbformat": 4,
879 |  "nbformat_minor": 1
880 | }
881 | 


--------------------------------------------------------------------------------
/Week 2/Module+2.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "---\n",
   8 |     "\n",
   9 |     "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
  10 |     "\n",
  11 |     "---"
  12 |    ]
  13 |   },
  14 |   {
  15 |    "cell_type": "markdown",
  16 |    "metadata": {},
  17 |    "source": [
  18 |     "# Applied Machine Learning: Module 2 (Supervised Learning, Part I)"
  19 |    ]
  20 |   },
  21 |   {
  22 |    "cell_type": "markdown",
  23 |    "metadata": {},
  24 |    "source": [
  25 |     "## Preamble and Review"
  26 |    ]
  27 |   },
  28 |   {
  29 |    "cell_type": "code",
  30 |    "execution_count": null,
  31 |    "metadata": {
  32 |     "collapsed": false
  33 |    },
  34 |    "outputs": [],
  35 |    "source": [
  36 |     "%matplotlib notebook\n",
  37 |     "import numpy as np\n",
  38 |     "import pandas as pd\n",
  39 |     "import seaborn as sn\n",
  40 |     "import matplotlib.pyplot as plt\n",
  41 |     "\n",
  42 |     "from sklearn.model_selection import train_test_split\n",
  43 |     "from sklearn.neighbors import KNeighborsClassifier\n",
  44 |     "\n",
  45 |     "np.set_printoptions(precision=2)\n",
  46 |     "\n",
  47 |     "\n",
  48 |     "fruits = pd.read_table('fruit_data_with_colors.txt')\n",
  49 |     "\n",
  50 |     "feature_names_fruits = ['height', 'width', 'mass', 'color_score']\n",
  51 |     "X_fruits = fruits[feature_names_fruits]\n",
  52 |     "y_fruits = fruits['fruit_label']\n",
  53 |     "target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']\n",
  54 |     "\n",
  55 |     "X_fruits_2d = fruits[['height', 'width']]\n",
  56 |     "y_fruits_2d = fruits['fruit_label']\n",
  57 |     "\n",
  58 |     "X_train, X_test, y_train, y_test = train_test_split(X_fruits, y_fruits, random_state=0)\n",
  59 |     "\n",
  60 |     "from sklearn.preprocessing import MinMaxScaler\n",
  61 |     "scaler = MinMaxScaler()\n",
  62 |     "X_train_scaled = scaler.fit_transform(X_train)\n",
  63 |     "# we must apply to the test set the same scaling that was computed from the training set\n",
  64 |     "X_test_scaled = scaler.transform(X_test)\n",
  65 |     "\n",
  66 |     "knn = KNeighborsClassifier(n_neighbors = 5)\n",
  67 |     "knn.fit(X_train_scaled, y_train)\n",
  68 |     "print('Accuracy of K-NN classifier on training set: {:.2f}'\n",
  69 |     "     .format(knn.score(X_train_scaled, y_train)))\n",
  70 |     "print('Accuracy of K-NN classifier on test set: {:.2f}'\n",
  71 |     "     .format(knn.score(X_test_scaled, y_test)))\n",
  72 |     "\n",
  73 |     "example_fruit = [[5.5, 2.2, 10, 0.70]]\n",
  74 |     "print('Predicted fruit type for ', example_fruit, ' is ', \n",
  75 |     "      target_names_fruits[knn.predict(example_fruit)[0]-1])"
  76 |    ]
  77 |   },
  78 |   {
  79 |    "cell_type": "markdown",
  80 |    "metadata": {},
  81 |    "source": [
  82 |     "## Datasets"
  83 |    ]
  84 |   },
  85 |   {
  86 |    "cell_type": "code",
  87 |    "execution_count": null,
  88 |    "metadata": {
  89 |     "collapsed": false,
  90 |     "scrolled": false
  91 |    },
  92 |    "outputs": [],
  93 |    "source": [
  94 |     "from sklearn.datasets import make_classification, make_blobs\n",
  95 |     "from matplotlib.colors import ListedColormap\n",
  96 |     "from sklearn.datasets import load_breast_cancer\n",
  97 |     "from adspy_shared_utilities import load_crime_dataset\n",
  98 |     "\n",
  99 |     "cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])\n",
 100 |     "\n",
 101 |     "\n",
 102 |     "# synthetic dataset for simple regression\n",
 103 |     "from sklearn.datasets import make_regression\n",
 104 |     "plt.figure()\n",
 105 |     "plt.title('Sample regression problem with one input variable')\n",
 106 |     "X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,\n",
 107 |     "                            n_informative=1, bias = 150.0,\n",
 108 |     "                            noise = 30, random_state=0)\n",
 109 |     "plt.scatter(X_R1, y_R1, marker= 'o', s=50)\n",
 110 |     "plt.show()\n",
 111 |     "\n",
 112 |     "\n",
 113 |     "# synthetic dataset for more complex regression\n",
 114 |     "from sklearn.datasets import make_friedman1\n",
 115 |     "plt.figure()\n",
 116 |     "plt.title('Complex regression problem with one input variable')\n",
 117 |     "X_F1, y_F1 = make_friedman1(n_samples = 100,\n",
 118 |     "                           n_features = 7, random_state=0)\n",
 119 |     "\n",
 120 |     "plt.scatter(X_F1[:, 2], y_F1, marker= 'o', s=50)\n",
 121 |     "plt.show()\n",
 122 |     "\n",
 123 |     "# synthetic dataset for classification (binary) \n",
 124 |     "plt.figure()\n",
 125 |     "plt.title('Sample binary classification problem with two informative features')\n",
 126 |     "X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,\n",
 127 |     "                                n_redundant=0, n_informative=2,\n",
 128 |     "                                n_clusters_per_class=1, flip_y = 0.1,\n",
 129 |     "                                class_sep = 0.5, random_state=0)\n",
 130 |     "plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2,\n",
 131 |     "           marker= 'o', s=50, cmap=cmap_bold)\n",
 132 |     "plt.show()\n",
 133 |     "\n",
 134 |     "\n",
 135 |     "# more difficult synthetic dataset for classification (binary) \n",
 136 |     "# with classes that are not linearly separable\n",
 137 |     "X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,\n",
 138 |     "                       cluster_std = 1.3, random_state = 4)\n",
 139 |     "y_D2 = y_D2 % 2\n",
 140 |     "plt.figure()\n",
 141 |     "plt.title('Sample binary classification problem with non-linearly separable classes')\n",
 142 |     "plt.scatter(X_D2[:,0], X_D2[:,1], c=y_D2,\n",
 143 |     "           marker= 'o', s=50, cmap=cmap_bold)\n",
 144 |     "plt.show()\n",
 145 |     "\n",
 146 |     "\n",
 147 |     "# Breast cancer dataset for classification\n",
 148 |     "cancer = load_breast_cancer()\n",
 149 |     "(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)\n",
 150 |     "\n",
 151 |     "\n",
 152 |     "# Communities and Crime dataset\n",
 153 |     "(X_crime, y_crime) = load_crime_dataset()"
 154 |    ]
 155 |   },
 156 |   {
 157 |    "cell_type": "markdown",
 158 |    "metadata": {},
 159 |    "source": [
 160 |     "## K-Nearest Neighbors"
 161 |    ]
 162 |   },
 163 |   {
 164 |    "cell_type": "markdown",
 165 |    "metadata": {},
 166 |    "source": [
 167 |     "### Classification"
 168 |    ]
 169 |   },
 170 |   {
 171 |    "cell_type": "code",
 172 |    "execution_count": null,
 173 |    "metadata": {
 174 |     "collapsed": false,
 175 |     "scrolled": false
 176 |    },
 177 |    "outputs": [],
 178 |    "source": [
 179 |     "from adspy_shared_utilities import plot_two_class_knn\n",
 180 |     "\n",
 181 |     "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,\n",
 182 |     "                                                   random_state=0)\n",
 183 |     "\n",
 184 |     "plot_two_class_knn(X_train, y_train, 1, 'uniform', X_test, y_test)\n",
 185 |     "plot_two_class_knn(X_train, y_train, 3, 'uniform', X_test, y_test)\n",
 186 |     "plot_two_class_knn(X_train, y_train, 11, 'uniform', X_test, y_test)"
 187 |    ]
 188 |   },
 189 |   {
 190 |    "cell_type": "markdown",
 191 |    "metadata": {},
 192 |    "source": [
 193 |     "### Regression"
 194 |    ]
 195 |   },
 196 |   {
 197 |    "cell_type": "code",
 198 |    "execution_count": null,
 199 |    "metadata": {
 200 |     "collapsed": false
 201 |    },
 202 |    "outputs": [],
 203 |    "source": [
 204 |     "from sklearn.neighbors import KNeighborsRegressor\n",
 205 |     "\n",
 206 |     "X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state = 0)\n",
 207 |     "\n",
 208 |     "knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)\n",
 209 |     "\n",
 210 |     "print(knnreg.predict(X_test))\n",
 211 |     "print('R-squared test score: {:.3f}'\n",
 212 |     "     .format(knnreg.score(X_test, y_test)))"
 213 |    ]
 214 |   },
 215 |   {
 216 |    "cell_type": "code",
 217 |    "execution_count": null,
 218 |    "metadata": {
 219 |     "collapsed": false
 220 |    },
 221 |    "outputs": [],
 222 |    "source": [
 223 |     "fig, subaxes = plt.subplots(1, 2, figsize=(8,4))\n",
 224 |     "X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)\n",
 225 |     "X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)\n",
 226 |     "\n",
 227 |     "for thisaxis, K in zip(subaxes, [1, 3]):\n",
 228 |     "    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)\n",
 229 |     "    y_predict_output = knnreg.predict(X_predict_input)\n",
 230 |     "    thisaxis.set_xlim([-2.5, 0.75])\n",
 231 |     "    thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10,\n",
 232 |     "                 label='Predicted', alpha=0.8)\n",
 233 |     "    thisaxis.plot(X_train, y_train, 'o', label='True Value', alpha=0.8)\n",
 234 |     "    thisaxis.set_xlabel('Input feature')\n",
 235 |     "    thisaxis.set_ylabel('Target value')\n",
 236 |     "    thisaxis.set_title('KNN regression (K={})'.format(K))\n",
 237 |     "    thisaxis.legend()\n",
 238 |     "plt.tight_layout()"
 239 |    ]
 240 |   },
 241 |   {
 242 |    "cell_type": "markdown",
 243 |    "metadata": {},
 244 |    "source": [
 245 |     "### Regression model complexity as a function of K"
 246 |    ]
 247 |   },
 248 |   {
 249 |    "cell_type": "code",
 250 |    "execution_count": null,
 251 |    "metadata": {
 252 |     "collapsed": false,
 253 |     "scrolled": false
 254 |    },
 255 |    "outputs": [],
 256 |    "source": [
 257 |     "# plot k-NN regression on sample dataset for different values of K\n",
 258 |     "fig, subaxes = plt.subplots(5, 1, figsize=(5,20))\n",
 259 |     "X_predict_input = np.linspace(-3, 3, 500).reshape(-1,1)\n",
 260 |     "X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,\n",
 261 |     "                                                   random_state = 0)\n",
 262 |     "\n",
 263 |     "for thisaxis, K in zip(subaxes, [1, 3, 7, 15, 55]):\n",
 264 |     "    knnreg = KNeighborsRegressor(n_neighbors = K).fit(X_train, y_train)\n",
 265 |     "    y_predict_output = knnreg.predict(X_predict_input)\n",
 266 |     "    train_score = knnreg.score(X_train, y_train)\n",
 267 |     "    test_score = knnreg.score(X_test, y_test)\n",
 268 |     "    thisaxis.plot(X_predict_input, y_predict_output)\n",
 269 |     "    thisaxis.plot(X_train, y_train, 'o', alpha=0.9, label='Train')\n",
 270 |     "    thisaxis.plot(X_test, y_test, '^', alpha=0.9, label='Test')\n",
 271 |     "    thisaxis.set_xlabel('Input feature')\n",
 272 |     "    thisaxis.set_ylabel('Target value')\n",
 273 |     "    thisaxis.set_title('KNN Regression (K={})\\n\\\n",
 274 |     "Train $R^2 = {:.3f}$,  Test $R^2 = {:.3f}$'\n",
 275 |     "                      .format(K, train_score, test_score))\n",
 276 |     "    thisaxis.legend()\n",
 277 |     "plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)\n"
 278 |    ]
 279 |   },
 280 |   {
 281 |    "cell_type": "markdown",
 282 |    "metadata": {},
 283 |    "source": [
 284 |     "## Linear models for regression"
 285 |    ]
 286 |   },
 287 |   {
 288 |    "cell_type": "markdown",
 289 |    "metadata": {},
 290 |    "source": [
 291 |     "### Linear regression"
 292 |    ]
 293 |   },
 294 |   {
 295 |    "cell_type": "code",
 296 |    "execution_count": null,
 297 |    "metadata": {
 298 |     "collapsed": false
 299 |    },
 300 |    "outputs": [],
 301 |    "source": [
 302 |     "from sklearn.linear_model import LinearRegression\n",
 303 |     "\n",
 304 |     "X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1,\n",
 305 |     "                                                   random_state = 0)\n",
 306 |     "linreg = LinearRegression().fit(X_train, y_train)\n",
 307 |     "\n",
 308 |     "print('linear model coeff (w): {}'\n",
 309 |     "     .format(linreg.coef_))\n",
 310 |     "print('linear model intercept (b): {:.3f}'\n",
 311 |     "     .format(linreg.intercept_))\n",
 312 |     "print('R-squared score (training): {:.3f}'\n",
 313 |     "     .format(linreg.score(X_train, y_train)))\n",
 314 |     "print('R-squared score (test): {:.3f}'\n",
 315 |     "     .format(linreg.score(X_test, y_test)))"
 316 |    ]
 317 |   },
 318 |   {
 319 |    "cell_type": "markdown",
 320 |    "metadata": {},
 321 |    "source": [
 322 |     "### Linear regression: example plot "
 323 |    ]
 324 |   },
 325 |   {
 326 |    "cell_type": "code",
 327 |    "execution_count": null,
 328 |    "metadata": {
 329 |     "collapsed": false
 330 |    },
 331 |    "outputs": [],
 332 |    "source": [
 333 |     "plt.figure(figsize=(5,4))\n",
 334 |     "plt.scatter(X_R1, y_R1, marker= 'o', s=50, alpha=0.8)\n",
 335 |     "plt.plot(X_R1, linreg.coef_ * X_R1 + linreg.intercept_, 'r-')\n",
 336 |     "plt.title('Least-squares linear regression')\n",
 337 |     "plt.xlabel('Feature value (x)')\n",
 338 |     "plt.ylabel('Target value (y)')\n",
 339 |     "plt.show()"
 340 |    ]
 341 |   },
 342 |   {
 343 |    "cell_type": "code",
 344 |    "execution_count": null,
 345 |    "metadata": {
 346 |     "collapsed": false
 347 |    },
 348 |    "outputs": [],
 349 |    "source": [
 350 |     "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n",
 351 |     "                                                   random_state = 0)\n",
 352 |     "linreg = LinearRegression().fit(X_train, y_train)\n",
 353 |     "\n",
 354 |     "print('Crime dataset')\n",
 355 |     "print('linear model intercept: {}'\n",
 356 |     "     .format(linreg.intercept_))\n",
 357 |     "print('linear model coeff:\\n{}'\n",
 358 |     "     .format(linreg.coef_))\n",
 359 |     "print('R-squared score (training): {:.3f}'\n",
 360 |     "     .format(linreg.score(X_train, y_train)))\n",
 361 |     "print('R-squared score (test): {:.3f}'\n",
 362 |     "     .format(linreg.score(X_test, y_test)))"
 363 |    ]
 364 |   },
 365 |   {
 366 |    "cell_type": "markdown",
 367 |    "metadata": {},
 368 |    "source": [
 369 |     "### Ridge regression"
 370 |    ]
 371 |   },
 372 |   {
 373 |    "cell_type": "code",
 374 |    "execution_count": null,
 375 |    "metadata": {
 376 |     "collapsed": false
 377 |    },
 378 |    "outputs": [],
 379 |    "source": [
 380 |     "from sklearn.linear_model import Ridge\n",
 381 |     "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n",
 382 |     "                                                   random_state = 0)\n",
 383 |     "\n",
 384 |     "linridge = Ridge(alpha=20.0).fit(X_train, y_train)\n",
 385 |     "\n",
 386 |     "print('Crime dataset')\n",
 387 |     "print('ridge regression linear model intercept: {}'\n",
 388 |     "     .format(linridge.intercept_))\n",
 389 |     "print('ridge regression linear model coeff:\\n{}'\n",
 390 |     "     .format(linridge.coef_))\n",
 391 |     "print('R-squared score (training): {:.3f}'\n",
 392 |     "     .format(linridge.score(X_train, y_train)))\n",
 393 |     "print('R-squared score (test): {:.3f}'\n",
 394 |     "     .format(linridge.score(X_test, y_test)))\n",
 395 |     "print('Number of non-zero features: {}'\n",
 396 |     "     .format(np.sum(linridge.coef_ != 0)))"
 397 |    ]
 398 |   },
 399 |   {
 400 |    "cell_type": "markdown",
 401 |    "metadata": {},
 402 |    "source": [
 403 |     "#### Ridge regression with feature normalization"
 404 |    ]
 405 |   },
 406 |   {
 407 |    "cell_type": "code",
 408 |    "execution_count": null,
 409 |    "metadata": {
 410 |     "collapsed": false
 411 |    },
 412 |    "outputs": [],
 413 |    "source": [
 414 |     "from sklearn.preprocessing import MinMaxScaler\n",
 415 |     "scaler = MinMaxScaler()\n",
 416 |     "\n",
 417 |     "from sklearn.linear_model import Ridge\n",
 418 |     "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n",
 419 |     "                                                   random_state = 0)\n",
 420 |     "\n",
 421 |     "X_train_scaled = scaler.fit_transform(X_train)\n",
 422 |     "X_test_scaled = scaler.transform(X_test)\n",
 423 |     "\n",
 424 |     "linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)\n",
 425 |     "\n",
 426 |     "print('Crime dataset')\n",
 427 |     "print('ridge regression linear model intercept: {}'\n",
 428 |     "     .format(linridge.intercept_))\n",
 429 |     "print('ridge regression linear model coeff:\\n{}'\n",
 430 |     "     .format(linridge.coef_))\n",
 431 |     "print('R-squared score (training): {:.3f}'\n",
 432 |     "     .format(linridge.score(X_train_scaled, y_train)))\n",
 433 |     "print('R-squared score (test): {:.3f}'\n",
 434 |     "     .format(linridge.score(X_test_scaled, y_test)))\n",
 435 |     "print('Number of non-zero features: {}'\n",
 436 |     "     .format(np.sum(linridge.coef_ != 0)))"
 437 |    ]
 438 |   },
 439 |   {
 440 |    "cell_type": "markdown",
 441 |    "metadata": {},
 442 |    "source": [
 443 |     "#### Ridge regression with regularization parameter: alpha"
 444 |    ]
 445 |   },
 446 |   {
 447 |    "cell_type": "code",
 448 |    "execution_count": null,
 449 |    "metadata": {
 450 |     "collapsed": false
 451 |    },
 452 |    "outputs": [],
 453 |    "source": [
 454 |     "print('Ridge regression: effect of alpha regularization parameter\\n')\n",
 455 |     "for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:\n",
 456 |     "    linridge = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)\n",
 457 |     "    r2_train = linridge.score(X_train_scaled, y_train)\n",
 458 |     "    r2_test = linridge.score(X_test_scaled, y_test)\n",
 459 |     "    num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)\n",
 460 |     "    print('Alpha = {:.2f}\\nnum abs(coeff) > 1.0: {}, \\\n",
 461 |     "r-squared training: {:.2f}, r-squared test: {:.2f}\\n'\n",
 462 |     "         .format(this_alpha, num_coeff_bigger, r2_train, r2_test))"
 463 |    ]
 464 |   },
 465 |   {
 466 |    "cell_type": "markdown",
 467 |    "metadata": {},
 468 |    "source": [
 469 |     "### Lasso regression"
 470 |    ]
 471 |   },
 472 |   {
 473 |    "cell_type": "code",
 474 |    "execution_count": null,
 475 |    "metadata": {
 476 |     "collapsed": false
 477 |    },
 478 |    "outputs": [],
 479 |    "source": [
 480 |     "from sklearn.linear_model import Lasso\n",
 481 |     "from sklearn.preprocessing import MinMaxScaler\n",
 482 |     "scaler = MinMaxScaler()\n",
 483 |     "\n",
 484 |     "X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,\n",
 485 |     "                                                   random_state = 0)\n",
 486 |     "\n",
 487 |     "X_train_scaled = scaler.fit_transform(X_train)\n",
 488 |     "X_test_scaled = scaler.transform(X_test)\n",
 489 |     "\n",
 490 |     "linlasso = Lasso(alpha=2.0, max_iter = 10000).fit(X_train_scaled, y_train)\n",
 491 |     "\n",
 492 |     "print('Crime dataset')\n",
 493 |     "print('lasso regression linear model intercept: {}'\n",
 494 |     "     .format(linlasso.intercept_))\n",
 495 |     "print('lasso regression linear model coeff:\\n{}'\n",
 496 |     "     .format(linlasso.coef_))\n",
 497 |     "print('Non-zero features: {}'\n",
 498 |     "     .format(np.sum(linlasso.coef_ != 0)))\n",
 499 |     "print('R-squared score (training): {:.3f}'\n",
 500 |     "     .format(linlasso.score(X_train_scaled, y_train)))\n",
 501 |     "print('R-squared score (test): {:.3f}\\n'\n",
 502 |     "     .format(linlasso.score(X_test_scaled, y_test)))\n",
 503 |     "print('Features with non-zero weight (sorted by absolute magnitude):')\n",
 504 |     "\n",
 505 |     "for e in sorted (list(zip(list(X_crime), linlasso.coef_)),\n",
 506 |     "                key = lambda e: -abs(e[1])):\n",
 507 |     "    if e[1] != 0:\n",
 508 |     "        print('\\t{}, {:.3f}'.format(e[0], e[1]))"
 509 |    ]
 510 |   },
 511 |   {
 512 |    "cell_type": "markdown",
 513 |    "metadata": {},
 514 |    "source": [
 515 |     "#### Lasso regression with regularization parameter: alpha"
 516 |    ]
 517 |   },
 518 |   {
 519 |    "cell_type": "code",
 520 |    "execution_count": null,
 521 |    "metadata": {
 522 |     "collapsed": false
 523 |    },
 524 |    "outputs": [],
 525 |    "source": [
 526 |     "print('Lasso regression: effect of alpha regularization\\n\\\n",
 527 |     "parameter on number of features kept in final model\\n')\n",
 528 |     "\n",
 529 |     "for alpha in [0.5, 1, 2, 3, 5, 10, 20, 50]:\n",
 530 |     "    linlasso = Lasso(alpha, max_iter = 10000).fit(X_train_scaled, y_train)\n",
 531 |     "    r2_train = linlasso.score(X_train_scaled, y_train)\n",
 532 |     "    r2_test = linlasso.score(X_test_scaled, y_test)\n",
 533 |     "    \n",
 534 |     "    print('Alpha = {:.2f}\\nFeatures kept: {}, r-squared training: {:.2f}, \\\n",
 535 |     "r-squared test: {:.2f}\\n'\n",
 536 |     "         .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))"
 537 |    ]
 538 |   },
 539 |   {
 540 |    "cell_type": "markdown",
 541 |    "metadata": {},
 542 |    "source": [
 543 |     "### Polynomial regression"
 544 |    ]
 545 |   },
 546 |   {
 547 |    "cell_type": "code",
 548 |    "execution_count": null,
 549 |    "metadata": {
 550 |     "collapsed": false
 551 |    },
 552 |    "outputs": [],
 553 |    "source": [
 554 |     "from sklearn.linear_model import LinearRegression\n",
 555 |     "from sklearn.linear_model import Ridge\n",
 556 |     "from sklearn.preprocessing import PolynomialFeatures\n",
 557 |     "\n",
 558 |     "\n",
 559 |     "X_train, X_test, y_train, y_test = train_test_split(X_F1, y_F1,\n",
 560 |     "                                                   random_state = 0)\n",
 561 |     "linreg = LinearRegression().fit(X_train, y_train)\n",
 562 |     "\n",
 563 |     "print('linear model coeff (w): {}'\n",
 564 |     "     .format(linreg.coef_))\n",
 565 |     "print('linear model intercept (b): {:.3f}'\n",
 566 |     "     .format(linreg.intercept_))\n",
 567 |     "print('R-squared score (training): {:.3f}'\n",
 568 |     "     .format(linreg.score(X_train, y_train)))\n",
 569 |     "print('R-squared score (test): {:.3f}'\n",
 570 |     "     .format(linreg.score(X_test, y_test)))\n",
 571 |     "\n",
 572 |     "print('\\nNow we transform the original input data to add\\n\\\n",
 573 |     "polynomial features up to degree 2 (quadratic)\\n')\n",
 574 |     "poly = PolynomialFeatures(degree=2)\n",
 575 |     "X_F1_poly = poly.fit_transform(X_F1)\n",
 576 |     "\n",
 577 |     "X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,\n",
 578 |     "                                                   random_state = 0)\n",
 579 |     "linreg = LinearRegression().fit(X_train, y_train)\n",
 580 |     "\n",
 581 |     "print('(poly deg 2) linear model coeff (w):\\n{}'\n",
 582 |     "     .format(linreg.coef_))\n",
 583 |     "print('(poly deg 2) linear model intercept (b): {:.3f}'\n",
 584 |     "     .format(linreg.intercept_))\n",
 585 |     "print('(poly deg 2) R-squared score (training): {:.3f}'\n",
 586 |     "     .format(linreg.score(X_train, y_train)))\n",
 587 |     "print('(poly deg 2) R-squared score (test): {:.3f}\\n'\n",
 588 |     "     .format(linreg.score(X_test, y_test)))\n",
 589 |     "\n",
 590 |     "print('\\nAddition of many polynomial features often leads to\\n\\\n",
 591 |     "overfitting, so we often use polynomial features in combination\\n\\\n",
 592 |     "with regression that has a regularization penalty, like ridge\\n\\\n",
 593 |     "regression.\\n')\n",
 594 |     "\n",
 595 |     "X_train, X_test, y_train, y_test = train_test_split(X_F1_poly, y_F1,\n",
 596 |     "                                                   random_state = 0)\n",
 597 |     "linreg = Ridge().fit(X_train, y_train)\n",
 598 |     "\n",
 599 |     "print('(poly deg 2 + ridge) linear model coeff (w):\\n{}'\n",
 600 |     "     .format(linreg.coef_))\n",
 601 |     "print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'\n",
 602 |     "     .format(linreg.intercept_))\n",
 603 |     "print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'\n",
 604 |     "     .format(linreg.score(X_train, y_train)))\n",
 605 |     "print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'\n",
 606 |     "     .format(linreg.score(X_test, y_test)))"
 607 |    ]
 608 |   },
 609 |   {
 610 |    "cell_type": "markdown",
 611 |    "metadata": {},
 612 |    "source": [
 613 |     "## Linear models for classification"
 614 |    ]
 615 |   },
 616 |   {
 617 |    "cell_type": "markdown",
 618 |    "metadata": {},
 619 |    "source": [
 620 |     "### Logistic regression"
 621 |    ]
 622 |   },
 623 |   {
 624 |    "cell_type": "markdown",
 625 |    "metadata": {},
 626 |    "source": [
 627 |     "#### Logistic regression for binary classification on fruits dataset using height, width features (positive class: apple, negative class: others)"
 628 |    ]
 629 |   },
 630 |   {
 631 |    "cell_type": "code",
 632 |    "execution_count": null,
 633 |    "metadata": {
 634 |     "collapsed": false
 635 |    },
 636 |    "outputs": [],
 637 |    "source": [
 638 |     "from sklearn.linear_model import LogisticRegression\n",
 639 |     "from adspy_shared_utilities import (\n",
 640 |     "plot_class_regions_for_classifier_subplot)\n",
 641 |     "\n",
 642 |     "fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))\n",
 643 |     "y_fruits_apple = y_fruits_2d == 1   # make into a binary problem: apples vs everything else\n",
 644 |     "X_train, X_test, y_train, y_test = (\n",
 645 |     "train_test_split(X_fruits_2d.values,\n",
 646 |     "                y_fruits_apple.values,\n",
 647 |     "                random_state = 0))\n",
 648 |     "\n",
 649 |     "clf = LogisticRegression(C=100).fit(X_train, y_train)\n",
 650 |     "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,\n",
 651 |     "                                         None, 'Logistic regression \\\n",
 652 |     "for binary classification\\nFruit dataset: Apple vs others',\n",
 653 |     "                                         subaxes)\n",
 654 |     "\n",
 655 |     "h = 6\n",
 656 |     "w = 8\n",
 657 |     "print('A fruit with height {} and width {} is predicted to be: {}'\n",
 658 |     "     .format(h,w, ['not an apple', 'an apple'][int(clf.predict([[h,w]])[0])]))\n",
 659 |     "\n",
 660 |     "h = 10\n",
 661 |     "w = 7\n",
 662 |     "print('A fruit with height {} and width {} is predicted to be: {}'\n",
 663 |     "     .format(h,w, ['not an apple', 'an apple'][int(clf.predict([[h,w]])[0])]))\n",
 664 |     "subaxes.set_xlabel('height')\n",
 665 |     "subaxes.set_ylabel('width')\n",
 666 |     "\n",
 667 |     "print('Accuracy of Logistic regression classifier on training set: {:.2f}'\n",
 668 |     "     .format(clf.score(X_train, y_train)))\n",
 669 |     "print('Accuracy of Logistic regression classifier on test set: {:.2f}'\n",
 670 |     "     .format(clf.score(X_test, y_test)))"
 671 |    ]
 672 |   },
 673 |   {
 674 |    "cell_type": "markdown",
 675 |    "metadata": {},
 676 |    "source": [
 677 |     "#### Logistic regression on simple synthetic dataset"
 678 |    ]
 679 |   },
 680 |   {
 681 |    "cell_type": "code",
 682 |    "execution_count": null,
 683 |    "metadata": {
 684 |     "collapsed": false,
 685 |     "scrolled": false
 686 |    },
 687 |    "outputs": [],
 688 |    "source": [
 689 |     "from sklearn.linear_model import LogisticRegression\n",
 690 |     "from adspy_shared_utilities import (\n",
 691 |     "plot_class_regions_for_classifier_subplot)\n",
 692 |     "\n",
 693 |     "\n",
 694 |     "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2,\n",
 695 |     "                                                   random_state = 0)\n",
 696 |     "\n",
 697 |     "fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))\n",
 698 |     "clf = LogisticRegression().fit(X_train, y_train)\n",
 699 |     "title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(1.0)\n",
 700 |     "plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
 701 |     "                                         None, None, title, subaxes)\n",
 702 |     "\n",
 703 |     "print('Accuracy of Logistic regression classifier on training set: {:.2f}'\n",
 704 |     "     .format(clf.score(X_train, y_train)))\n",
 705 |     "print('Accuracy of Logistic regression classifier on test set: {:.2f}'\n",
 706 |     "     .format(clf.score(X_test, y_test)))\n",
 707 |     "     "
 708 |    ]
 709 |   },
 710 |   {
 711 |    "cell_type": "markdown",
 712 |    "metadata": {},
 713 |    "source": [
 714 |     "#### Logistic regression regularization: C parameter"
 715 |    ]
 716 |   },
 717 |   {
 718 |    "cell_type": "code",
 719 |    "execution_count": null,
 720 |    "metadata": {
 721 |     "collapsed": false,
 722 |     "scrolled": false
 723 |    },
 724 |    "outputs": [],
 725 |    "source": [
 726 |     "X_train, X_test, y_train, y_test = (\n",
 727 |     "train_test_split(X_fruits_2d.values,\n",
 728 |     "                y_fruits_apple.values,\n",
 729 |     "                random_state=0))\n",
 730 |     "\n",
 731 |     "fig, subaxes = plt.subplots(3, 1, figsize=(4, 10))\n",
 732 |     "\n",
 733 |     "for this_C, subplot in zip([0.1, 1, 100], subaxes):\n",
 734 |     "    clf = LogisticRegression(C=this_C).fit(X_train, y_train)\n",
 735 |     "    title ='Logistic regression (apple vs rest), C = {:.3f}'.format(this_C)\n",
 736 |     "    \n",
 737 |     "    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
 738 |     "                                             X_test, y_test, title,\n",
 739 |     "                                             subplot)\n",
 740 |     "plt.tight_layout()"
 741 |    ]
 742 |   },
 743 |   {
 744 |    "cell_type": "markdown",
 745 |    "metadata": {},
 746 |    "source": [
 747 |     "#### Application to real dataset"
 748 |    ]
 749 |   },
 750 |   {
 751 |    "cell_type": "code",
 752 |    "execution_count": null,
 753 |    "metadata": {
 754 |     "collapsed": false
 755 |    },
 756 |    "outputs": [],
 757 |    "source": [
 758 |     "from sklearn.linear_model import LogisticRegression\n",
 759 |     "\n",
 760 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
 761 |     "\n",
 762 |     "clf = LogisticRegression().fit(X_train, y_train)\n",
 763 |     "print('Breast cancer dataset')\n",
 764 |     "print('Accuracy of Logistic regression classifier on training set: {:.2f}'\n",
 765 |     "     .format(clf.score(X_train, y_train)))\n",
 766 |     "print('Accuracy of Logistic regression classifier on test set: {:.2f}'\n",
 767 |     "     .format(clf.score(X_test, y_test)))"
 768 |    ]
 769 |   },
 770 |   {
 771 |    "cell_type": "markdown",
 772 |    "metadata": {},
 773 |    "source": [
 774 |     "### Support Vector Machines"
 775 |    ]
 776 |   },
 777 |   {
 778 |    "cell_type": "markdown",
 779 |    "metadata": {},
 780 |    "source": [
 781 |     "#### Linear Support Vector Machine"
 782 |    ]
 783 |   },
 784 |   {
 785 |    "cell_type": "code",
 786 |    "execution_count": null,
 787 |    "metadata": {
 788 |     "collapsed": false
 789 |    },
 790 |    "outputs": [],
 791 |    "source": [
 792 |     "from sklearn.svm import SVC\n",
 793 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
 794 |     "\n",
 795 |     "\n",
 796 |     "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)\n",
 797 |     "\n",
 798 |     "fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))\n",
 799 |     "this_C = 1.0\n",
 800 |     "clf = SVC(kernel = 'linear', C=this_C).fit(X_train, y_train)\n",
 801 |     "title = 'Linear SVC, C = {:.3f}'.format(this_C)\n",
 802 |     "plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None, title, subaxes)"
 803 |    ]
 804 |   },
 805 |   {
 806 |    "cell_type": "markdown",
 807 |    "metadata": {},
 808 |    "source": [
 809 |     "#### Linear Support Vector Machine: C parameter"
 810 |    ]
 811 |   },
 812 |   {
 813 |    "cell_type": "code",
 814 |    "execution_count": null,
 815 |    "metadata": {
 816 |     "collapsed": false
 817 |    },
 818 |    "outputs": [],
 819 |    "source": [
 820 |     "from sklearn.svm import LinearSVC\n",
 821 |     "from adspy_shared_utilities import plot_class_regions_for_classifier, plot_class_regions_for_classifier_subplot\n",
 822 |     "\n",
 823 |     "X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state = 0)\n",
 824 |     "fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))\n",
 825 |     "\n",
 826 |     "for this_C, subplot in zip([0.00001, 100], subaxes):\n",
 827 |     "    clf = LinearSVC(C=this_C).fit(X_train, y_train)\n",
 828 |     "    title = 'Linear SVC, C = {:.5f}'.format(this_C)\n",
 829 |     "    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
 830 |     "                                             None, None, title, subplot)\n",
 831 |     "plt.tight_layout()"
 832 |    ]
 833 |   },
 834 |   {
 835 |    "cell_type": "markdown",
 836 |    "metadata": {},
 837 |    "source": [
 838 |     "#### Application to real dataset"
 839 |    ]
 840 |   },
 841 |   {
 842 |    "cell_type": "code",
 843 |    "execution_count": null,
 844 |    "metadata": {
 845 |     "collapsed": false
 846 |    },
 847 |    "outputs": [],
 848 |    "source": [
 849 |     "from sklearn.svm import LinearSVC\n",
 850 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
 851 |     "\n",
 852 |     "clf = LinearSVC().fit(X_train, y_train)\n",
 853 |     "print('Breast cancer dataset')\n",
 854 |     "print('Accuracy of Linear SVC classifier on training set: {:.2f}'\n",
 855 |     "     .format(clf.score(X_train, y_train)))\n",
 856 |     "print('Accuracy of Linear SVC classifier on test set: {:.2f}'\n",
 857 |     "     .format(clf.score(X_test, y_test)))"
 858 |    ]
 859 |   },
 860 |   {
 861 |    "cell_type": "markdown",
 862 |    "metadata": {},
 863 |    "source": [
 864 |     "### Multi-class classification with linear models"
 865 |    ]
 866 |   },
 867 |   {
 868 |    "cell_type": "markdown",
 869 |    "metadata": {},
 870 |    "source": [
 871 |     "#### LinearSVC with M classes generates M one vs rest classifiers."
 872 |    ]
 873 |   },
 874 |   {
 875 |    "cell_type": "code",
 876 |    "execution_count": null,
 877 |    "metadata": {
 878 |     "collapsed": false
 879 |    },
 880 |    "outputs": [],
 881 |    "source": [
 882 |     "from sklearn.svm import LinearSVC\n",
 883 |     "\n",
 884 |     "X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d, y_fruits_2d, random_state = 0)\n",
 885 |     "\n",
 886 |     "clf = LinearSVC(C=5, random_state = 67).fit(X_train, y_train)\n",
 887 |     "print('Coefficients:\\n', clf.coef_)\n",
 888 |     "print('Intercepts:\\n', clf.intercept_)"
 889 |    ]
 890 |   },
 891 |   {
 892 |    "cell_type": "markdown",
 893 |    "metadata": {},
 894 |    "source": [
 895 |     "#### Multi-class results on the fruit dataset"
 896 |    ]
 897 |   },
 898 |   {
 899 |    "cell_type": "code",
 900 |    "execution_count": null,
 901 |    "metadata": {
 902 |     "collapsed": false
 903 |    },
 904 |    "outputs": [],
 905 |    "source": [
 906 |     "plt.figure(figsize=(6,6))\n",
 907 |     "colors = ['r', 'g', 'b', 'y']\n",
 908 |     "cmap_fruits = ListedColormap(['#FF0000', '#00FF00', '#0000FF','#FFFF00'])\n",
 909 |     "\n",
 910 |     "plt.scatter(X_fruits_2d[['height']], X_fruits_2d[['width']],\n",
 911 |     "           c=y_fruits_2d, cmap=cmap_fruits, edgecolor = 'black', alpha=.7)\n",
 912 |     "\n",
 913 |     "x_0_range = np.linspace(-10, 15)\n",
 914 |     "\n",
 915 |     "for w, b, color in zip(clf.coef_, clf.intercept_, ['r', 'g', 'b', 'y']):\n",
 916 |     "    # Since class prediction with a linear model uses the formula y = w_0 x_0 + w_1 x_1 + b, \n",
 917 |     "    # and the decision boundary is defined as being all points with y = 0, to plot x_1 as a \n",
 918 |     "    # function of x_0 we just solve w_0 x_0 + w_1 x_1 + b = 0 for x_1:\n",
 919 |     "    plt.plot(x_0_range, -(x_0_range * w[0] + b) / w[1], c=color, alpha=.8)\n",
 920 |     "    \n",
 921 |     "plt.legend(target_names_fruits)\n",
 922 |     "plt.xlabel('height')\n",
 923 |     "plt.ylabel('width')\n",
 924 |     "plt.xlim(-2, 12)\n",
 925 |     "plt.ylim(-2, 15)\n",
 926 |     "plt.show()"
 927 |    ]
 928 |   },
 929 |   {
 930 |    "cell_type": "markdown",
 931 |    "metadata": {},
 932 |    "source": [
 933 |     "## Kernelized Support Vector Machines"
 934 |    ]
 935 |   },
 936 |   {
 937 |    "cell_type": "markdown",
 938 |    "metadata": {},
 939 |    "source": [
 940 |     "### Classification"
 941 |    ]
 942 |   },
 943 |   {
 944 |    "cell_type": "code",
 945 |    "execution_count": null,
 946 |    "metadata": {
 947 |     "collapsed": false,
 948 |     "scrolled": false
 949 |    },
 950 |    "outputs": [],
 951 |    "source": [
 952 |     "from sklearn.svm import SVC\n",
 953 |     "from adspy_shared_utilities import plot_class_regions_for_classifier\n",
 954 |     "\n",
 955 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
 956 |     "\n",
 957 |     "# The default SVC kernel is radial basis function (RBF)\n",
 958 |     "plot_class_regions_for_classifier(SVC().fit(X_train, y_train),\n",
 959 |     "                                 X_train, y_train, None, None,\n",
 960 |     "                                 'Support Vector Classifier: RBF kernel')\n",
 961 |     "\n",
 962 |     "# Compare decision boundaries with polynomial kernel, degree = 3\n",
 963 |     "plot_class_regions_for_classifier(SVC(kernel = 'poly', degree = 3)\n",
 964 |     "                                 .fit(X_train, y_train), X_train,\n",
 965 |     "                                 y_train, None, None,\n",
 966 |     "                                 'Support Vector Classifier: Polynomial kernel, degree = 3')"
 967 |    ]
 968 |   },
 969 |   {
 970 |    "cell_type": "markdown",
 971 |    "metadata": {},
 972 |    "source": [
 973 |     "#### Support Vector Machine with RBF kernel: gamma parameter"
 974 |    ]
 975 |   },
 976 |   {
 977 |    "cell_type": "code",
 978 |    "execution_count": null,
 979 |    "metadata": {
 980 |     "collapsed": false
 981 |    },
 982 |    "outputs": [],
 983 |    "source": [
 984 |     "from adspy_shared_utilities import plot_class_regions_for_classifier\n",
 985 |     "\n",
 986 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
 987 |     "fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))\n",
 988 |     "\n",
 989 |     "for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):\n",
 990 |     "    clf = SVC(kernel = 'rbf', gamma=this_gamma).fit(X_train, y_train)\n",
 991 |     "    title = 'Support Vector Classifier: \\nRBF kernel, gamma = {:.2f}'.format(this_gamma)\n",
 992 |     "    plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
 993 |     "                                             None, None, title, subplot)\n",
 994 |     "    plt.tight_layout()"
 995 |    ]
 996 |   },
 997 |   {
 998 |    "cell_type": "markdown",
 999 |    "metadata": {},
1000 |    "source": [
1001 |     "#### Support Vector Machine with RBF kernel: using both C and gamma parameters"
1002 |    ]
1003 |   },
1004 |   {
1005 |    "cell_type": "code",
1006 |    "execution_count": null,
1007 |    "metadata": {
1008 |     "collapsed": false
1009 |    },
1010 |    "outputs": [],
1011 |    "source": [
1012 |     "from sklearn.svm import SVC\n",
1013 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
1014 |     "\n",
1015 |     "from sklearn.model_selection import train_test_split\n",
1016 |     "\n",
1017 |     "\n",
1018 |     "X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)\n",
1019 |     "fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)\n",
1020 |     "\n",
1021 |     "for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):\n",
1022 |     "    \n",
1023 |     "    for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):\n",
1024 |     "        title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)\n",
1025 |     "        clf = SVC(kernel = 'rbf', gamma = this_gamma,\n",
1026 |     "                 C = this_C).fit(X_train, y_train)\n",
1027 |     "        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,\n",
1028 |     "                                                 X_test, y_test, title,\n",
1029 |     "                                                 subplot)\n",
1030 |     "        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)"
1031 |    ]
1032 |   },
1033 |   {
1034 |    "cell_type": "markdown",
1035 |    "metadata": {},
1036 |    "source": [
1037 |     "### Application of SVMs to a real dataset: unnormalized data"
1038 |    ]
1039 |   },
1040 |   {
1041 |    "cell_type": "code",
1042 |    "execution_count": null,
1043 |    "metadata": {
1044 |     "collapsed": false
1045 |    },
1046 |    "outputs": [],
1047 |    "source": [
1048 |     "from sklearn.svm import SVC\n",
1049 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,\n",
1050 |     "                                                   random_state = 0)\n",
1051 |     "\n",
1052 |     "clf = SVC(C=10).fit(X_train, y_train)\n",
1053 |     "print('Breast cancer dataset (unnormalized features)')\n",
1054 |     "print('Accuracy of RBF-kernel SVC on training set: {:.2f}'\n",
1055 |     "     .format(clf.score(X_train, y_train)))\n",
1056 |     "print('Accuracy of RBF-kernel SVC on test set: {:.2f}'\n",
1057 |     "     .format(clf.score(X_test, y_test)))"
1058 |    ]
1059 |   },
1060 |   {
1061 |    "cell_type": "markdown",
1062 |    "metadata": {},
1063 |    "source": [
1064 |     "### Application of SVMs to a real dataset: normalized data with feature preprocessing using minmax scaling"
1065 |    ]
1066 |   },
1067 |   {
1068 |    "cell_type": "code",
1069 |    "execution_count": null,
1070 |    "metadata": {
1071 |     "collapsed": false
1072 |    },
1073 |    "outputs": [],
1074 |    "source": [
1075 |     "from sklearn.preprocessing import MinMaxScaler\n",
1076 |     "scaler = MinMaxScaler()\n",
1077 |     "X_train_scaled = scaler.fit_transform(X_train)\n",
1078 |     "X_test_scaled = scaler.transform(X_test)\n",
1079 |     "\n",
1080 |     "clf = SVC(C=10).fit(X_train_scaled, y_train)\n",
1081 |     "print('Breast cancer dataset (normalized with MinMax scaling)')\n",
1082 |     "print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'\n",
1083 |     "     .format(clf.score(X_train_scaled, y_train)))\n",
1084 |     "print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'\n",
1085 |     "     .format(clf.score(X_test_scaled, y_test)))"
1086 |    ]
1087 |   },
1088 |   {
1089 |    "cell_type": "markdown",
1090 |    "metadata": {
1091 |     "collapsed": true
1092 |    },
1093 |    "source": [
1094 |     "## Cross-validation"
1095 |    ]
1096 |   },
1097 |   {
1098 |    "cell_type": "markdown",
1099 |    "metadata": {},
1100 |    "source": [
1101 |     "### Example based on k-NN classifier with fruit dataset (2 features)"
1102 |    ]
1103 |   },
1104 |   {
1105 |    "cell_type": "code",
1106 |    "execution_count": null,
1107 |    "metadata": {
1108 |     "collapsed": false
1109 |    },
1110 |    "outputs": [],
1111 |    "source": [
1112 |     "from sklearn.model_selection import cross_val_score\n",
1113 |     "\n",
1114 |     "clf = KNeighborsClassifier(n_neighbors = 5)\n",
1115 |     "X = X_fruits_2d.as_matrix()\n",
1116 |     "y = y_fruits_2d.as_matrix()\n",
1117 |     "cv_scores = cross_val_score(clf, X, y)\n",
1118 |     "\n",
1119 |     "print('Cross-validation scores (3-fold):', cv_scores)\n",
1120 |     "print('Mean cross-validation score (3-fold): {:.3f}'\n",
1121 |     "     .format(np.mean(cv_scores)))"
1122 |    ]
1123 |   },
1124 |   {
1125 |    "cell_type": "markdown",
1126 |    "metadata": {},
1127 |    "source": [
1128 |     "### A note on performing cross-validation for more advanced scenarios.\n",
1129 |     "\n",
1130 |     "In some cases (e.g. when feature values have very different ranges), we've seen the need to scale or normalize the training and test sets before use with a classifier. The proper way to do cross-validation when you need to scale the data is *not* to scale the entire dataset with a single transform, since this will indirectly leak information into the training data about the whole dataset, including the test data (see the lecture on data leakage later in the course).  Instead, scaling/normalizing must be computed and applied for each cross-validation fold separately.  To do this, the easiest way in scikit-learn is to use *pipelines*.  While these are beyond the scope of this course, further information is available in the scikit-learn documentation here:\n",
1131 |     "\n",
1132 |     "http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html\n",
1133 |     "\n",
1134 |     "or the Pipeline section in the recommended textbook: Introduction to Machine Learning with Python by Andreas C. Müller and Sarah Guido (O'Reilly Media)."
1135 |    ]
1136 |   },
1137 |   {
1138 |    "cell_type": "markdown",
1139 |    "metadata": {},
1140 |    "source": [
1141 |     "## Validation curve example"
1142 |    ]
1143 |   },
1144 |   {
1145 |    "cell_type": "code",
1146 |    "execution_count": null,
1147 |    "metadata": {
1148 |     "collapsed": true
1149 |    },
1150 |    "outputs": [],
1151 |    "source": [
1152 |     "from sklearn.svm import SVC\n",
1153 |     "from sklearn.model_selection import validation_curve\n",
1154 |     "\n",
1155 |     "param_range = np.logspace(-3, 3, 4)\n",
1156 |     "train_scores, test_scores = validation_curve(SVC(), X, y,\n",
1157 |     "                                            param_name='gamma',\n",
1158 |     "                                            param_range=param_range, cv=3)"
1159 |    ]
1160 |   },
1161 |   {
1162 |    "cell_type": "code",
1163 |    "execution_count": null,
1164 |    "metadata": {
1165 |     "collapsed": false
1166 |    },
1167 |    "outputs": [],
1168 |    "source": [
1169 |     "print(train_scores)"
1170 |    ]
1171 |   },
1172 |   {
1173 |    "cell_type": "code",
1174 |    "execution_count": null,
1175 |    "metadata": {
1176 |     "collapsed": false
1177 |    },
1178 |    "outputs": [],
1179 |    "source": [
1180 |     "print(test_scores)"
1181 |    ]
1182 |   },
1183 |   {
1184 |    "cell_type": "code",
1185 |    "execution_count": null,
1186 |    "metadata": {
1187 |     "collapsed": false
1188 |    },
1189 |    "outputs": [],
1190 |    "source": [
1191 |     "# This code is based on the scikit-learn validation_plot example\n",
1192 |     "#  See:  http://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html\n",
1193 |     "plt.figure()\n",
1194 |     "\n",
1195 |     "train_scores_mean = np.mean(train_scores, axis=1)\n",
1196 |     "train_scores_std = np.std(train_scores, axis=1)\n",
1197 |     "test_scores_mean = np.mean(test_scores, axis=1)\n",
1198 |     "test_scores_std = np.std(test_scores, axis=1)\n",
1199 |     "\n",
1200 |     "plt.title('Validation Curve with SVM')\n",
1201 |     "plt.xlabel('$\\gamma$ (gamma)')\n",
1202 |     "plt.ylabel('Score')\n",
1203 |     "plt.ylim(0.0, 1.1)\n",
1204 |     "lw = 2\n",
1205 |     "\n",
1206 |     "plt.semilogx(param_range, train_scores_mean, label='Training score',\n",
1207 |     "            color='darkorange', lw=lw)\n",
1208 |     "\n",
1209 |     "plt.fill_between(param_range, train_scores_mean - train_scores_std,\n",
1210 |     "                train_scores_mean + train_scores_std, alpha=0.2,\n",
1211 |     "                color='darkorange', lw=lw)\n",
1212 |     "\n",
1213 |     "plt.semilogx(param_range, test_scores_mean, label='Cross-validation score',\n",
1214 |     "            color='navy', lw=lw)\n",
1215 |     "\n",
1216 |     "plt.fill_between(param_range, test_scores_mean - test_scores_std,\n",
1217 |     "                test_scores_mean + test_scores_std, alpha=0.2,\n",
1218 |     "                color='navy', lw=lw)\n",
1219 |     "\n",
1220 |     "plt.legend(loc='best')\n",
1221 |     "plt.show()"
1222 |    ]
1223 |   },
1224 |   {
1225 |    "cell_type": "markdown",
1226 |    "metadata": {
1227 |     "collapsed": true
1228 |    },
1229 |    "source": [
1230 |     "## Decision Trees"
1231 |    ]
1232 |   },
1233 |   {
1234 |    "cell_type": "code",
1235 |    "execution_count": null,
1236 |    "metadata": {
1237 |     "collapsed": false
1238 |    },
1239 |    "outputs": [],
1240 |    "source": [
1241 |     "from sklearn.datasets import load_iris\n",
1242 |     "from sklearn.tree import DecisionTreeClassifier\n",
1243 |     "from adspy_shared_utilities import plot_decision_tree\n",
1244 |     "from sklearn.model_selection import train_test_split\n",
1245 |     "\n",
1246 |     "\n",
1247 |     "iris = load_iris()\n",
1248 |     "\n",
1249 |     "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 3)\n",
1250 |     "clf = DecisionTreeClassifier().fit(X_train, y_train)\n",
1251 |     "\n",
1252 |     "print('Accuracy of Decision Tree classifier on training set: {:.2f}'\n",
1253 |     "     .format(clf.score(X_train, y_train)))\n",
1254 |     "print('Accuracy of Decision Tree classifier on test set: {:.2f}'\n",
1255 |     "     .format(clf.score(X_test, y_test)))"
1256 |    ]
1257 |   },
1258 |   {
1259 |    "cell_type": "markdown",
1260 |    "metadata": {},
1261 |    "source": [
1262 |     "#### Setting max decision tree depth to help avoid overfitting"
1263 |    ]
1264 |   },
1265 |   {
1266 |    "cell_type": "code",
1267 |    "execution_count": null,
1268 |    "metadata": {
1269 |     "collapsed": false
1270 |    },
1271 |    "outputs": [],
1272 |    "source": [
1273 |     "clf2 = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)\n",
1274 |     "\n",
1275 |     "print('Accuracy of Decision Tree classifier on training set: {:.2f}'\n",
1276 |     "     .format(clf2.score(X_train, y_train)))\n",
1277 |     "print('Accuracy of Decision Tree classifier on test set: {:.2f}'\n",
1278 |     "     .format(clf2.score(X_test, y_test)))"
1279 |    ]
1280 |   },
1281 |   {
1282 |    "cell_type": "markdown",
1283 |    "metadata": {},
1284 |    "source": [
1285 |     "#### Visualizing decision trees"
1286 |    ]
1287 |   },
1288 |   {
1289 |    "cell_type": "code",
1290 |    "execution_count": null,
1291 |    "metadata": {
1292 |     "collapsed": false
1293 |    },
1294 |    "outputs": [],
1295 |    "source": [
1296 |     "plot_decision_tree(clf, iris.feature_names, iris.target_names)"
1297 |    ]
1298 |   },
1299 |   {
1300 |    "cell_type": "markdown",
1301 |    "metadata": {},
1302 |    "source": [
1303 |     "#### Pre-pruned version (max_depth = 3)"
1304 |    ]
1305 |   },
1306 |   {
1307 |    "cell_type": "code",
1308 |    "execution_count": null,
1309 |    "metadata": {
1310 |     "collapsed": false
1311 |    },
1312 |    "outputs": [],
1313 |    "source": [
1314 |     "plot_decision_tree(clf2, iris.feature_names, iris.target_names)"
1315 |    ]
1316 |   },
1317 |   {
1318 |    "cell_type": "markdown",
1319 |    "metadata": {},
1320 |    "source": [
1321 |     "#### Feature importance"
1322 |    ]
1323 |   },
1324 |   {
1325 |    "cell_type": "code",
1326 |    "execution_count": null,
1327 |    "metadata": {
1328 |     "collapsed": false
1329 |    },
1330 |    "outputs": [],
1331 |    "source": [
1332 |     "from adspy_shared_utilities import plot_feature_importances\n",
1333 |     "\n",
1334 |     "plt.figure(figsize=(10,4), dpi=80)\n",
1335 |     "plot_feature_importances(clf, iris.feature_names)\n",
1336 |     "plt.show()\n",
1337 |     "\n",
1338 |     "print('Feature importances: {}'.format(clf.feature_importances_))"
1339 |    ]
1340 |   },
1341 |   {
1342 |    "cell_type": "code",
1343 |    "execution_count": null,
1344 |    "metadata": {
1345 |     "collapsed": false
1346 |    },
1347 |    "outputs": [],
1348 |    "source": [
1349 |     "from sklearn.tree import DecisionTreeClassifier\n",
1350 |     "from adspy_shared_utilities import plot_class_regions_for_classifier_subplot\n",
1351 |     "\n",
1352 |     "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 0)\n",
1353 |     "fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))\n",
1354 |     "\n",
1355 |     "pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]\n",
1356 |     "tree_max_depth = 4\n",
1357 |     "\n",
1358 |     "for pair, axis in zip(pair_list, subaxes):\n",
1359 |     "    X = X_train[:, pair]\n",
1360 |     "    y = y_train\n",
1361 |     "    \n",
1362 |     "    clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)\n",
1363 |     "    title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)\n",
1364 |     "    plot_class_regions_for_classifier_subplot(clf, X, y, None,\n",
1365 |     "                                             None, title, axis,\n",
1366 |     "                                             iris.target_names)\n",
1367 |     "    \n",
1368 |     "    axis.set_xlabel(iris.feature_names[pair[0]])\n",
1369 |     "    axis.set_ylabel(iris.feature_names[pair[1]])\n",
1370 |     "    \n",
1371 |     "plt.tight_layout()\n",
1372 |     "plt.show()"
1373 |    ]
1374 |   },
1375 |   {
1376 |    "cell_type": "markdown",
1377 |    "metadata": {},
1378 |    "source": [
1379 |     "#### Decision Trees on a real-world dataset"
1380 |    ]
1381 |   },
1382 |   {
1383 |    "cell_type": "code",
1384 |    "execution_count": null,
1385 |    "metadata": {
1386 |     "collapsed": false
1387 |    },
1388 |    "outputs": [],
1389 |    "source": [
1390 |     "from sklearn.tree import DecisionTreeClassifier\n",
1391 |     "from adspy_shared_utilities import plot_decision_tree\n",
1392 |     "from adspy_shared_utilities import plot_feature_importances\n",
1393 |     "\n",
1394 |     "X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)\n",
1395 |     "\n",
1396 |     "clf = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 8,\n",
1397 |     "                            random_state = 0).fit(X_train, y_train)\n",
1398 |     "\n",
1399 |     "plot_decision_tree(clf, cancer.feature_names, cancer.target_names)"
1400 |    ]
1401 |   },
1402 |   {
1403 |    "cell_type": "code",
1404 |    "execution_count": null,
1405 |    "metadata": {
1406 |     "collapsed": false
1407 |    },
1408 |    "outputs": [],
1409 |    "source": [
1410 |     "print('Breast cancer dataset: decision tree')\n",
1411 |     "print('Accuracy of DT classifier on training set: {:.2f}'\n",
1412 |     "     .format(clf.score(X_train, y_train)))\n",
1413 |     "print('Accuracy of DT classifier on test set: {:.2f}'\n",
1414 |     "     .format(clf.score(X_test, y_test)))\n",
1415 |     "\n",
1416 |     "plt.figure(figsize=(10,6),dpi=80)\n",
1417 |     "plot_feature_importances(clf, cancer.feature_names)\n",
1418 |     "plt.tight_layout()\n",
1419 |     "\n",
1420 |     "plt.show()"
1421 |    ]
1422 |   }
1423 |  ],
1424 |  "metadata": {
1425 |   "anaconda-cloud": {},
1426 |   "kernelspec": {
1427 |    "display_name": "Python 3",
1428 |    "language": "python",
1429 |    "name": "python3"
1430 |   },
1431 |   "language_info": {
1432 |    "codemirror_mode": {
1433 |     "name": "ipython",
1434 |     "version": 3
1435 |    },
1436 |    "file_extension": ".py",
1437 |    "mimetype": "text/x-python",
1438 |    "name": "python",
1439 |    "nbconvert_exporter": "python",
1440 |    "pygments_lexer": "ipython3",
1441 |    "version": "3.5.2"
1442 |   }
1443 |  },
1444 |  "nbformat": 4,
1445 |  "nbformat_minor": 2
1446 | }
1447 | 


--------------------------------------------------------------------------------
/Week 4/Assignment4.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "---\n",
   8 |     "\n",
   9 |     "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-machine-learning/resources/bANLa) course resource._\n",
  10 |     "\n",
  11 |     "---"
  12 |    ]
  13 |   },
  14 |   {
  15 |    "cell_type": "markdown",
  16 |    "metadata": {},
  17 |    "source": [
  18 |     "## Assignment 4 - Understanding and Predicting Property Maintenance Fines\n",
  19 |     "\n",
  20 |     "This assignment is based on a data challenge from the Michigan Data Science Team ([MDST](http://midas.umich.edu/mdst/)). \n",
  21 |     "\n",
  22 |     "The Michigan Data Science Team ([MDST](http://midas.umich.edu/mdst/)) and the Michigan Student Symposium for Interdisciplinary Statistical Sciences ([MSSISS](https://sites.lsa.umich.edu/mssiss/)) have partnered with the City of Detroit to help solve one of the most pressing problems facing Detroit - blight. [Blight violations](http://www.detroitmi.gov/How-Do-I/Report/Blight-Complaint-FAQs) are issued by the city to individuals who allow their properties to remain in a deteriorated condition. Every year, the city of Detroit issues millions of dollars in fines to residents and every year, many of these fines remain unpaid. Enforcing unpaid blight fines is a costly and tedious process, so the city wants to know: how can we increase blight ticket compliance?\n",
  23 |     "\n",
  24 |     "The first step in answering this question is understanding when and why a resident might fail to comply with a blight ticket. This is where predictive modeling comes in. For this assignment, your task is to predict whether a given blight ticket will be paid on time.\n",
  25 |     "\n",
  26 |     "All data for this assignment has been provided to us through the [Detroit Open Data Portal](https://data.detroitmi.gov/). **Only the data already included in your Coursera directory can be used for training the model for this assignment.** Nonetheless, we encourage you to look into data from other Detroit datasets to help inform feature creation and model selection. We recommend taking a look at the following related datasets:\n",
  27 |     "\n",
  28 |     "* [Building Permits](https://data.detroitmi.gov/Property-Parcels/Building-Permits/xw2a-a7tf)\n",
  29 |     "* [Trades Permits](https://data.detroitmi.gov/Property-Parcels/Trades-Permits/635b-dsgv)\n",
  30 |     "* [Improve Detroit: Submitted Issues](https://data.detroitmi.gov/Government/Improve-Detroit-Submitted-Issues/fwz3-w3yn)\n",
  31 |     "* [DPD: Citizen Complaints](https://data.detroitmi.gov/Public-Safety/DPD-Citizen-Complaints-2016/kahe-efs3)\n",
  32 |     "* [Parcel Map](https://data.detroitmi.gov/Property-Parcels/Parcel-Map/fxkw-udwf)\n",
  33 |     "\n",
  34 |     "___\n",
  35 |     "\n",
  36 |     "We provide you with two data files for use in training and validating your models: train.csv and test.csv. Each row in these two files corresponds to a single blight ticket, and includes information about when, why, and to whom each ticket was issued. The target variable is compliance, which is True if the ticket was paid early, on time, or within one month of the hearing date, False if the ticket was paid after the hearing date or not at all, and Null if the violator was found not responsible. Compliance, as well as a handful of other variables that will not be available at test-time, are only included in train.csv.\n",
  37 |     "\n",
  38 |     "Note: All tickets where the violators were found not responsible are not considered during evaluation. They are included in the training set as an additional source of data for visualization, and to enable unsupervised and semi-supervised approaches. However, they are not included in the test set.\n",
  39 |     "\n",
  40 |     "<br>\n",
  41 |     "\n",
  42 |     "**File descriptions** (Use only this data for training your model!)\n",
  43 |     "\n",
  44 |     "    train.csv - the training set (all tickets issued 2004-2011)\n",
  45 |     "    test.csv - the test set (all tickets issued 2012-2016)\n",
  46 |     "    addresses.csv & latlons.csv - mapping from ticket id to addresses, and from addresses to lat/lon coordinates. \n",
  47 |     "     Note: misspelled addresses may be incorrectly geolocated.\n",
  48 |     "\n",
  49 |     "<br>\n",
  50 |     "\n",
  51 |     "**Data fields**\n",
  52 |     "\n",
  53 |     "train.csv & test.csv\n",
  54 |     "\n",
  55 |     "    ticket_id - unique identifier for tickets\n",
  56 |     "    agency_name - Agency that issued the ticket\n",
  57 |     "    inspector_name - Name of inspector that issued the ticket\n",
  58 |     "    violator_name - Name of the person/organization that the ticket was issued to\n",
  59 |     "    violation_street_number, violation_street_name, violation_zip_code - Address where the violation occurred\n",
  60 |     "    mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country - Mailing address of the violator\n",
  61 |     "    ticket_issued_date - Date and time the ticket was issued\n",
  62 |     "    hearing_date - Date and time the violator's hearing was scheduled\n",
  63 |     "    violation_code, violation_description - Type of violation\n",
  64 |     "    disposition - Judgment and judgment type\n",
  65 |     "    fine_amount - Violation fine amount, excluding fees\n",
  66 |     "    admin_fee - $20 fee assigned to responsible judgments\n",
  67 |     "    state_fee - $10 fee assigned to responsible judgments\n",
  68 |     "    late_fee - 10% fee assigned to responsible judgments\n",
  69 |     "    discount_amount - discount applied, if any\n",
  70 |     "    clean_up_cost - DPW clean-up or graffiti removal cost\n",
  71 |     "    judgment_amount - Sum of all fines and fees\n",
  72 |     "    grafitti_status - Flag for graffiti violations\n",
  73 |     "    \n",
  74 |     "train.csv only\n",
  75 |     "\n",
  76 |     "    payment_amount - Amount paid, if any\n",
  77 |     "    payment_date - Date payment was made, if it was received\n",
  78 |     "    payment_status - Current payment status as of Feb 1 2017\n",
  79 |     "    balance_due - Fines and fees still owed\n",
  80 |     "    collection_status - Flag for payments in collections\n",
  81 |     "    compliance [target variable for prediction] \n",
  82 |     "     Null = Not responsible\n",
  83 |     "     0 = Responsible, non-compliant\n",
  84 |     "     1 = Responsible, compliant\n",
  85 |     "    compliance_detail - More information on why each ticket was marked compliant or non-compliant\n",
  86 |     "\n",
  87 |     "\n",
  88 |     "___\n",
  89 |     "\n",
  90 |     "## Evaluation\n",
  91 |     "\n",
  92 |     "Your predictions will be given as the probability that the corresponding blight ticket will be paid on time.\n",
  93 |     "\n",
  94 |     "The evaluation metric for this assignment is the Area Under the ROC Curve (AUC). \n",
  95 |     "\n",
  96 |     "Your grade will be based on the AUC score computed for your classifier. A model with an AUROC of 0.7 passes this assignment; over 0.75 will receive full points.\n",
  97 |     "___\n",
  98 |     "\n",
  99 |     "For this assignment, create a function that trains a model to predict blight ticket compliance in Detroit using `train.csv`. Using this model, return a series of length 61001 with the data being the probability that each corresponding ticket from `test.csv` will be paid, and the index being the ticket_id.\n",
 100 |     "\n",
 101 |     "Example:\n",
 102 |     "\n",
 103 |     "    ticket_id\n",
 104 |     "       284932    0.531842\n",
 105 |     "       285362    0.401958\n",
 106 |     "       285361    0.105928\n",
 107 |     "       285338    0.018572\n",
 108 |     "                 ...\n",
 109 |     "       376499    0.208567\n",
 110 |     "       376500    0.818759\n",
 111 |     "       369851    0.018528\n",
 112 |     "       Name: compliance, dtype: float32"
 113 |    ]
 114 |   },
 115 |   {
 116 |    "cell_type": "code",
 117 |    "execution_count": 1,
 118 |    "metadata": {
 119 |     "collapsed": true
 120 |    },
 121 |    "outputs": [],
 122 |    "source": [
 123 |     "import pandas as pd\n",
 124 |     "import numpy as np"
 125 |    ]
 126 |   },
 127 |   {
 128 |    "cell_type": "code",
 129 |    "execution_count": 13,
 130 |    "metadata": {
 131 |     "collapsed": false
 132 |    },
 133 |    "outputs": [
 134 |     {
 135 |      "name": "stdout",
 136 |      "output_type": "stream",
 137 |      "text": [
 138 |       "(250306, 34)\n"
 139 |      ]
 140 |     },
 141 |     {
 142 |      "name": "stderr",
 143 |      "output_type": "stream",
 144 |      "text": [
 145 |       "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (11,12,31) have mixed types. Specify dtype option on import or set low_memory=False.\n",
 146 |       "  interactivity=interactivity, compiler=compiler, result=result)\n"
 147 |      ]
 148 |     },
 149 |     {
 150 |      "data": {
 151 |       "text/html": [
 152 |        "<div>\n",
 153 |        "<table border=\"1\" class=\"dataframe\">\n",
 154 |        "  <thead>\n",
 155 |        "    <tr style=\"text-align: right;\">\n",
 156 |        "      <th></th>\n",
 157 |        "      <th>ticket_id</th>\n",
 158 |        "      <th>agency_name</th>\n",
 159 |        "      <th>inspector_name</th>\n",
 160 |        "      <th>violator_name</th>\n",
 161 |        "      <th>violation_street_number</th>\n",
 162 |        "      <th>violation_street_name</th>\n",
 163 |        "      <th>violation_zip_code</th>\n",
 164 |        "      <th>mailing_address_str_number</th>\n",
 165 |        "      <th>mailing_address_str_name</th>\n",
 166 |        "      <th>city</th>\n",
 167 |        "      <th>...</th>\n",
 168 |        "      <th>clean_up_cost</th>\n",
 169 |        "      <th>judgment_amount</th>\n",
 170 |        "      <th>payment_amount</th>\n",
 171 |        "      <th>balance_due</th>\n",
 172 |        "      <th>payment_date</th>\n",
 173 |        "      <th>payment_status</th>\n",
 174 |        "      <th>collection_status</th>\n",
 175 |        "      <th>grafitti_status</th>\n",
 176 |        "      <th>compliance_detail</th>\n",
 177 |        "      <th>compliance</th>\n",
 178 |        "    </tr>\n",
 179 |        "  </thead>\n",
 180 |        "  <tbody>\n",
 181 |        "    <tr>\n",
 182 |        "      <th>0</th>\n",
 183 |        "      <td>22056</td>\n",
 184 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 185 |        "      <td>Sims, Martinzie</td>\n",
 186 |        "      <td>INVESTMENT INC., MIDWEST MORTGAGE</td>\n",
 187 |        "      <td>2900.0</td>\n",
 188 |        "      <td>TYLER</td>\n",
 189 |        "      <td>NaN</td>\n",
 190 |        "      <td>3.0</td>\n",
 191 |        "      <td>S. WICKER</td>\n",
 192 |        "      <td>CHICAGO</td>\n",
 193 |        "      <td>...</td>\n",
 194 |        "      <td>0.0</td>\n",
 195 |        "      <td>305.0</td>\n",
 196 |        "      <td>0.0</td>\n",
 197 |        "      <td>305.0</td>\n",
 198 |        "      <td>NaN</td>\n",
 199 |        "      <td>NO PAYMENT APPLIED</td>\n",
 200 |        "      <td>NaN</td>\n",
 201 |        "      <td>NaN</td>\n",
 202 |        "      <td>non-compliant by no payment</td>\n",
 203 |        "      <td>0.0</td>\n",
 204 |        "    </tr>\n",
 205 |        "    <tr>\n",
 206 |        "      <th>1</th>\n",
 207 |        "      <td>27586</td>\n",
 208 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 209 |        "      <td>Williams, Darrin</td>\n",
 210 |        "      <td>Michigan, Covenant House</td>\n",
 211 |        "      <td>4311.0</td>\n",
 212 |        "      <td>CENTRAL</td>\n",
 213 |        "      <td>NaN</td>\n",
 214 |        "      <td>2959.0</td>\n",
 215 |        "      <td>Martin Luther King</td>\n",
 216 |        "      <td>Detroit</td>\n",
 217 |        "      <td>...</td>\n",
 218 |        "      <td>0.0</td>\n",
 219 |        "      <td>855.0</td>\n",
 220 |        "      <td>780.0</td>\n",
 221 |        "      <td>75.0</td>\n",
 222 |        "      <td>2005-06-02 00:00:00</td>\n",
 223 |        "      <td>PAID IN FULL</td>\n",
 224 |        "      <td>NaN</td>\n",
 225 |        "      <td>NaN</td>\n",
 226 |        "      <td>compliant by late payment within 1 month</td>\n",
 227 |        "      <td>1.0</td>\n",
 228 |        "    </tr>\n",
 229 |        "    <tr>\n",
 230 |        "      <th>2</th>\n",
 231 |        "      <td>22062</td>\n",
 232 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 233 |        "      <td>Sims, Martinzie</td>\n",
 234 |        "      <td>SANDERS, DERRON</td>\n",
 235 |        "      <td>1449.0</td>\n",
 236 |        "      <td>LONGFELLOW</td>\n",
 237 |        "      <td>NaN</td>\n",
 238 |        "      <td>23658.0</td>\n",
 239 |        "      <td>P.O. BOX</td>\n",
 240 |        "      <td>DETROIT</td>\n",
 241 |        "      <td>...</td>\n",
 242 |        "      <td>0.0</td>\n",
 243 |        "      <td>0.0</td>\n",
 244 |        "      <td>0.0</td>\n",
 245 |        "      <td>0.0</td>\n",
 246 |        "      <td>NaN</td>\n",
 247 |        "      <td>NO PAYMENT APPLIED</td>\n",
 248 |        "      <td>NaN</td>\n",
 249 |        "      <td>NaN</td>\n",
 250 |        "      <td>not responsible by disposition</td>\n",
 251 |        "      <td>NaN</td>\n",
 252 |        "    </tr>\n",
 253 |        "    <tr>\n",
 254 |        "      <th>3</th>\n",
 255 |        "      <td>22084</td>\n",
 256 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 257 |        "      <td>Sims, Martinzie</td>\n",
 258 |        "      <td>MOROSI, MIKE</td>\n",
 259 |        "      <td>1441.0</td>\n",
 260 |        "      <td>LONGFELLOW</td>\n",
 261 |        "      <td>NaN</td>\n",
 262 |        "      <td>5.0</td>\n",
 263 |        "      <td>ST. CLAIR</td>\n",
 264 |        "      <td>DETROIT</td>\n",
 265 |        "      <td>...</td>\n",
 266 |        "      <td>0.0</td>\n",
 267 |        "      <td>0.0</td>\n",
 268 |        "      <td>0.0</td>\n",
 269 |        "      <td>0.0</td>\n",
 270 |        "      <td>NaN</td>\n",
 271 |        "      <td>NO PAYMENT APPLIED</td>\n",
 272 |        "      <td>NaN</td>\n",
 273 |        "      <td>NaN</td>\n",
 274 |        "      <td>not responsible by disposition</td>\n",
 275 |        "      <td>NaN</td>\n",
 276 |        "    </tr>\n",
 277 |        "    <tr>\n",
 278 |        "      <th>4</th>\n",
 279 |        "      <td>22093</td>\n",
 280 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 281 |        "      <td>Sims, Martinzie</td>\n",
 282 |        "      <td>NATHANIEL, NEAL</td>\n",
 283 |        "      <td>2449.0</td>\n",
 284 |        "      <td>CHURCHILL</td>\n",
 285 |        "      <td>NaN</td>\n",
 286 |        "      <td>7449.0</td>\n",
 287 |        "      <td>CHURCHILL</td>\n",
 288 |        "      <td>DETROIT</td>\n",
 289 |        "      <td>...</td>\n",
 290 |        "      <td>0.0</td>\n",
 291 |        "      <td>0.0</td>\n",
 292 |        "      <td>0.0</td>\n",
 293 |        "      <td>0.0</td>\n",
 294 |        "      <td>NaN</td>\n",
 295 |        "      <td>NO PAYMENT APPLIED</td>\n",
 296 |        "      <td>NaN</td>\n",
 297 |        "      <td>NaN</td>\n",
 298 |        "      <td>not responsible by disposition</td>\n",
 299 |        "      <td>NaN</td>\n",
 300 |        "    </tr>\n",
 301 |        "  </tbody>\n",
 302 |        "</table>\n",
 303 |        "<p>5 rows × 34 columns</p>\n",
 304 |        "</div>"
 305 |       ],
 306 |       "text/plain": [
 307 |        "   ticket_id                                     agency_name  \\\n",
 308 |        "0      22056  Buildings, Safety Engineering & Env Department   \n",
 309 |        "1      27586  Buildings, Safety Engineering & Env Department   \n",
 310 |        "2      22062  Buildings, Safety Engineering & Env Department   \n",
 311 |        "3      22084  Buildings, Safety Engineering & Env Department   \n",
 312 |        "4      22093  Buildings, Safety Engineering & Env Department   \n",
 313 |        "\n",
 314 |        "     inspector_name                      violator_name  \\\n",
 315 |        "0   Sims, Martinzie  INVESTMENT INC., MIDWEST MORTGAGE   \n",
 316 |        "1  Williams, Darrin           Michigan, Covenant House   \n",
 317 |        "2   Sims, Martinzie                    SANDERS, DERRON   \n",
 318 |        "3   Sims, Martinzie                       MOROSI, MIKE   \n",
 319 |        "4   Sims, Martinzie                    NATHANIEL, NEAL   \n",
 320 |        "\n",
 321 |        "   violation_street_number violation_street_name  violation_zip_code  \\\n",
 322 |        "0                   2900.0                 TYLER                 NaN   \n",
 323 |        "1                   4311.0               CENTRAL                 NaN   \n",
 324 |        "2                   1449.0            LONGFELLOW                 NaN   \n",
 325 |        "3                   1441.0            LONGFELLOW                 NaN   \n",
 326 |        "4                   2449.0             CHURCHILL                 NaN   \n",
 327 |        "\n",
 328 |        "   mailing_address_str_number mailing_address_str_name     city     ...      \\\n",
 329 |        "0                         3.0                S. WICKER  CHICAGO     ...       \n",
 330 |        "1                      2959.0       Martin Luther King  Detroit     ...       \n",
 331 |        "2                     23658.0                 P.O. BOX  DETROIT     ...       \n",
 332 |        "3                         5.0                ST. CLAIR  DETROIT     ...       \n",
 333 |        "4                      7449.0                CHURCHILL  DETROIT     ...       \n",
 334 |        "\n",
 335 |        "  clean_up_cost judgment_amount payment_amount balance_due  \\\n",
 336 |        "0           0.0           305.0            0.0       305.0   \n",
 337 |        "1           0.0           855.0          780.0        75.0   \n",
 338 |        "2           0.0             0.0            0.0         0.0   \n",
 339 |        "3           0.0             0.0            0.0         0.0   \n",
 340 |        "4           0.0             0.0            0.0         0.0   \n",
 341 |        "\n",
 342 |        "          payment_date      payment_status collection_status grafitti_status  \\\n",
 343 |        "0                  NaN  NO PAYMENT APPLIED               NaN             NaN   \n",
 344 |        "1  2005-06-02 00:00:00        PAID IN FULL               NaN             NaN   \n",
 345 |        "2                  NaN  NO PAYMENT APPLIED               NaN             NaN   \n",
 346 |        "3                  NaN  NO PAYMENT APPLIED               NaN             NaN   \n",
 347 |        "4                  NaN  NO PAYMENT APPLIED               NaN             NaN   \n",
 348 |        "\n",
 349 |        "                          compliance_detail  compliance  \n",
 350 |        "0               non-compliant by no payment         0.0  \n",
 351 |        "1  compliant by late payment within 1 month         1.0  \n",
 352 |        "2            not responsible by disposition         NaN  \n",
 353 |        "3            not responsible by disposition         NaN  \n",
 354 |        "4            not responsible by disposition         NaN  \n",
 355 |        "\n",
 356 |        "[5 rows x 34 columns]"
 357 |       ]
 358 |      },
 359 |      "execution_count": 13,
 360 |      "metadata": {},
 361 |      "output_type": "execute_result"
 362 |     }
 363 |    ],
 364 |    "source": [
 365 |     "train_data = pd.read_csv('train.csv', encoding='ISO-8859-1', low_memory=False)  # single-pass parse: one inferred dtype per column, avoids the mixed-type DtypeWarning\n",
 366 |     "print(train_data.shape)  # (rows, columns) of the frame just loaded\n",
 367 |     "train_data.head()"
 368 |    ]
 369 |   },
 370 |   {
 371 |    "cell_type": "code",
 372 |    "execution_count": 38,
 373 |    "metadata": {
 374 |     "collapsed": false
 375 |    },
 376 |    "outputs": [
 377 |     {
 378 |      "data": {
 379 |       "text/plain": [
 380 |        "(159880, 35)"
 381 |       ]
 382 |      },
 383 |      "execution_count": 38,
 384 |      "metadata": {},
 385 |      "output_type": "execute_result"
 386 |     }
 387 |    ],
 388 |    "source": [
 389 |     "train_data.loc[train_data['compliance'].isin([0, 1])].shape  # shape of the rows whose compliance value is 0 or 1"
 390 |    ]
 391 |   },
 392 |   {
 393 |    "cell_type": "code",
 394 |    "execution_count": 26,
 395 |    "metadata": {
 396 |     "collapsed": false
 397 |    },
 398 |    "outputs": [
 399 |     {
 400 |      "data": {
 401 |       "text/html": [
 402 |        "<div>\n",
 403 |        "<table border=\"1\" class=\"dataframe\">\n",
 404 |        "  <thead>\n",
 405 |        "    <tr style=\"text-align: right;\">\n",
 406 |        "      <th></th>\n",
 407 |        "      <th>ticket_id</th>\n",
 408 |        "      <th>agency_name</th>\n",
 409 |        "      <th>inspector_name</th>\n",
 410 |        "      <th>violator_name</th>\n",
 411 |        "      <th>violation_street_number</th>\n",
 412 |        "      <th>violation_street_name</th>\n",
 413 |        "      <th>violation_zip_code</th>\n",
 414 |        "      <th>mailing_address_str_number</th>\n",
 415 |        "      <th>mailing_address_str_name</th>\n",
 416 |        "      <th>city</th>\n",
 417 |        "      <th>...</th>\n",
 418 |        "      <th>violation_description</th>\n",
 419 |        "      <th>disposition</th>\n",
 420 |        "      <th>fine_amount</th>\n",
 421 |        "      <th>admin_fee</th>\n",
 422 |        "      <th>state_fee</th>\n",
 423 |        "      <th>late_fee</th>\n",
 424 |        "      <th>discount_amount</th>\n",
 425 |        "      <th>clean_up_cost</th>\n",
 426 |        "      <th>judgment_amount</th>\n",
 427 |        "      <th>grafitti_status</th>\n",
 428 |        "    </tr>\n",
 429 |        "  </thead>\n",
 430 |        "  <tbody>\n",
 431 |        "    <tr>\n",
 432 |        "      <th>0</th>\n",
 433 |        "      <td>284932</td>\n",
 434 |        "      <td>Department of Public Works</td>\n",
 435 |        "      <td>Granberry, Aisha B</td>\n",
 436 |        "      <td>FLUELLEN, JOHN A</td>\n",
 437 |        "      <td>10041.0</td>\n",
 438 |        "      <td>ROSEBERRY</td>\n",
 439 |        "      <td>NaN</td>\n",
 440 |        "      <td>141</td>\n",
 441 |        "      <td>ROSEBERRY</td>\n",
 442 |        "      <td>DETROIT</td>\n",
 443 |        "      <td>...</td>\n",
 444 |        "      <td>Failure to secure City or Private solid waste ...</td>\n",
 445 |        "      <td>Responsible by Default</td>\n",
 446 |        "      <td>200.0</td>\n",
 447 |        "      <td>20.0</td>\n",
 448 |        "      <td>10.0</td>\n",
 449 |        "      <td>20.0</td>\n",
 450 |        "      <td>0.0</td>\n",
 451 |        "      <td>0.0</td>\n",
 452 |        "      <td>250.0</td>\n",
 453 |        "      <td>NaN</td>\n",
 454 |        "    </tr>\n",
 455 |        "    <tr>\n",
 456 |        "      <th>1</th>\n",
 457 |        "      <td>285362</td>\n",
 458 |        "      <td>Department of Public Works</td>\n",
 459 |        "      <td>Lusk, Gertrina</td>\n",
 460 |        "      <td>WHIGHAM, THELMA</td>\n",
 461 |        "      <td>18520.0</td>\n",
 462 |        "      <td>EVERGREEN</td>\n",
 463 |        "      <td>NaN</td>\n",
 464 |        "      <td>19136</td>\n",
 465 |        "      <td>GLASTONBURY</td>\n",
 466 |        "      <td>DETROIT</td>\n",
 467 |        "      <td>...</td>\n",
 468 |        "      <td>Allowing bulk solid waste to lie or accumulate...</td>\n",
 469 |        "      <td>Responsible by Default</td>\n",
 470 |        "      <td>1000.0</td>\n",
 471 |        "      <td>20.0</td>\n",
 472 |        "      <td>10.0</td>\n",
 473 |        "      <td>100.0</td>\n",
 474 |        "      <td>0.0</td>\n",
 475 |        "      <td>0.0</td>\n",
 476 |        "      <td>1130.0</td>\n",
 477 |        "      <td>NaN</td>\n",
 478 |        "    </tr>\n",
 479 |        "    <tr>\n",
 480 |        "      <th>2</th>\n",
 481 |        "      <td>285361</td>\n",
 482 |        "      <td>Department of Public Works</td>\n",
 483 |        "      <td>Lusk, Gertrina</td>\n",
 484 |        "      <td>WHIGHAM, THELMA</td>\n",
 485 |        "      <td>18520.0</td>\n",
 486 |        "      <td>EVERGREEN</td>\n",
 487 |        "      <td>NaN</td>\n",
 488 |        "      <td>19136</td>\n",
 489 |        "      <td>GLASTONBURY</td>\n",
 490 |        "      <td>DETROIT</td>\n",
 491 |        "      <td>...</td>\n",
 492 |        "      <td>Improper placement of Courville container betw...</td>\n",
 493 |        "      <td>Responsible by Default</td>\n",
 494 |        "      <td>100.0</td>\n",
 495 |        "      <td>20.0</td>\n",
 496 |        "      <td>10.0</td>\n",
 497 |        "      <td>10.0</td>\n",
 498 |        "      <td>0.0</td>\n",
 499 |        "      <td>0.0</td>\n",
 500 |        "      <td>140.0</td>\n",
 501 |        "      <td>NaN</td>\n",
 502 |        "    </tr>\n",
 503 |        "    <tr>\n",
 504 |        "      <th>3</th>\n",
 505 |        "      <td>285338</td>\n",
 506 |        "      <td>Department of Public Works</td>\n",
 507 |        "      <td>Talbert, Reginald</td>\n",
 508 |        "      <td>HARABEDIEN, POPKIN</td>\n",
 509 |        "      <td>1835.0</td>\n",
 510 |        "      <td>CENTRAL</td>\n",
 511 |        "      <td>NaN</td>\n",
 512 |        "      <td>2246</td>\n",
 513 |        "      <td>NELSON</td>\n",
 514 |        "      <td>WOODHAVEN</td>\n",
 515 |        "      <td>...</td>\n",
 516 |        "      <td>Allowing bulk solid waste to lie or accumulate...</td>\n",
 517 |        "      <td>Responsible by Default</td>\n",
 518 |        "      <td>200.0</td>\n",
 519 |        "      <td>20.0</td>\n",
 520 |        "      <td>10.0</td>\n",
 521 |        "      <td>20.0</td>\n",
 522 |        "      <td>0.0</td>\n",
 523 |        "      <td>0.0</td>\n",
 524 |        "      <td>250.0</td>\n",
 525 |        "      <td>NaN</td>\n",
 526 |        "    </tr>\n",
 527 |        "    <tr>\n",
 528 |        "      <th>4</th>\n",
 529 |        "      <td>285346</td>\n",
 530 |        "      <td>Department of Public Works</td>\n",
 531 |        "      <td>Talbert, Reginald</td>\n",
 532 |        "      <td>CORBELL, STANLEY</td>\n",
 533 |        "      <td>1700.0</td>\n",
 534 |        "      <td>CENTRAL</td>\n",
 535 |        "      <td>NaN</td>\n",
 536 |        "      <td>3435</td>\n",
 537 |        "      <td>MUNGER</td>\n",
 538 |        "      <td>LIVONIA</td>\n",
 539 |        "      <td>...</td>\n",
 540 |        "      <td>Violation of time limit for approved container...</td>\n",
 541 |        "      <td>Responsible by Default</td>\n",
 542 |        "      <td>100.0</td>\n",
 543 |        "      <td>20.0</td>\n",
 544 |        "      <td>10.0</td>\n",
 545 |        "      <td>10.0</td>\n",
 546 |        "      <td>0.0</td>\n",
 547 |        "      <td>0.0</td>\n",
 548 |        "      <td>140.0</td>\n",
 549 |        "      <td>NaN</td>\n",
 550 |        "    </tr>\n",
 551 |        "  </tbody>\n",
 552 |        "</table>\n",
 553 |        "<p>5 rows × 27 columns</p>\n",
 554 |        "</div>"
 555 |       ],
 556 |       "text/plain": [
 557 |        "   ticket_id                 agency_name      inspector_name  \\\n",
 558 |        "0     284932  Department of Public Works  Granberry, Aisha B   \n",
 559 |        "1     285362  Department of Public Works      Lusk, Gertrina   \n",
 560 |        "2     285361  Department of Public Works      Lusk, Gertrina   \n",
 561 |        "3     285338  Department of Public Works   Talbert, Reginald   \n",
 562 |        "4     285346  Department of Public Works   Talbert, Reginald   \n",
 563 |        "\n",
 564 |        "        violator_name  violation_street_number violation_street_name  \\\n",
 565 |        "0    FLUELLEN, JOHN A                  10041.0             ROSEBERRY   \n",
 566 |        "1     WHIGHAM, THELMA                  18520.0             EVERGREEN   \n",
 567 |        "2     WHIGHAM, THELMA                  18520.0             EVERGREEN   \n",
 568 |        "3  HARABEDIEN, POPKIN                   1835.0               CENTRAL   \n",
 569 |        "4    CORBELL, STANLEY                   1700.0               CENTRAL   \n",
 570 |        "\n",
 571 |        "  violation_zip_code mailing_address_str_number mailing_address_str_name  \\\n",
 572 |        "0                NaN                        141                ROSEBERRY   \n",
 573 |        "1                NaN                      19136              GLASTONBURY   \n",
 574 |        "2                NaN                      19136              GLASTONBURY   \n",
 575 |        "3                NaN                       2246                   NELSON   \n",
 576 |        "4                NaN                       3435                   MUNGER   \n",
 577 |        "\n",
 578 |        "        city       ...         \\\n",
 579 |        "0    DETROIT       ...          \n",
 580 |        "1    DETROIT       ...          \n",
 581 |        "2    DETROIT       ...          \n",
 582 |        "3  WOODHAVEN       ...          \n",
 583 |        "4    LIVONIA       ...          \n",
 584 |        "\n",
 585 |        "                               violation_description             disposition  \\\n",
 586 |        "0  Failure to secure City or Private solid waste ...  Responsible by Default   \n",
 587 |        "1  Allowing bulk solid waste to lie or accumulate...  Responsible by Default   \n",
 588 |        "2  Improper placement of Courville container betw...  Responsible by Default   \n",
 589 |        "3  Allowing bulk solid waste to lie or accumulate...  Responsible by Default   \n",
 590 |        "4  Violation of time limit for approved container...  Responsible by Default   \n",
 591 |        "\n",
 592 |        "   fine_amount admin_fee state_fee late_fee discount_amount clean_up_cost  \\\n",
 593 |        "0        200.0      20.0      10.0     20.0             0.0           0.0   \n",
 594 |        "1       1000.0      20.0      10.0    100.0             0.0           0.0   \n",
 595 |        "2        100.0      20.0      10.0     10.0             0.0           0.0   \n",
 596 |        "3        200.0      20.0      10.0     20.0             0.0           0.0   \n",
 597 |        "4        100.0      20.0      10.0     10.0             0.0           0.0   \n",
 598 |        "\n",
 599 |        "  judgment_amount  grafitti_status  \n",
 600 |        "0           250.0              NaN  \n",
 601 |        "1          1130.0              NaN  \n",
 602 |        "2           140.0              NaN  \n",
 603 |        "3           250.0              NaN  \n",
 604 |        "4           140.0              NaN  \n",
 605 |        "\n",
 606 |        "[5 rows x 27 columns]"
 607 |       ]
 608 |      },
 609 |      "execution_count": 26,
 610 |      "metadata": {},
 611 |      "output_type": "execute_result"
 612 |     }
 613 |    ],
 614 |    "source": [
 615 |     "test_data = pd.read_csv(filepath_or_buffer='test.csv')  # load test.csv into a DataFrame\n",
 616 |     "test_data.head(n=5)"
 617 |    ]
 618 |   },
 619 |   {
 620 |    "cell_type": "code",
 621 |    "execution_count": 47,
 622 |    "metadata": {
 623 |     "collapsed": false
 624 |    },
 625 |    "outputs": [
 626 |     {
 627 |      "data": {
 628 |       "text/plain": [
 629 |        "((61001, 28), (26358, 28))"
 630 |       ]
 631 |      },
 632 |      "execution_count": 47,
 633 |      "metadata": {},
 634 |      "output_type": "execute_result"
 635 |     }
 636 |    ],
 637 |    "source": [
 638 |     "test_data.shape, test_data[test_data['city'].str.strip().str.upper() == 'DETROIT'].shape  # case/whitespace-insensitive match so 'Detroit' and 'DETROIT' are both counted"
 639 |    ]
 640 |   },
 641 |   {
 642 |    "cell_type": "code",
 643 |    "execution_count": 14,
 644 |    "metadata": {
 645 |     "collapsed": false
 646 |    },
 647 |    "outputs": [
 648 |     {
 649 |      "data": {
 650 |       "text/html": [
 651 |        "<div>\n",
 652 |        "<table border=\"1\" class=\"dataframe\">\n",
 653 |        "  <thead>\n",
 654 |        "    <tr style=\"text-align: right;\">\n",
 655 |        "      <th></th>\n",
 656 |        "      <th>ticket_id</th>\n",
 657 |        "      <th>address</th>\n",
 658 |        "    </tr>\n",
 659 |        "  </thead>\n",
 660 |        "  <tbody>\n",
 661 |        "    <tr>\n",
 662 |        "      <th>0</th>\n",
 663 |        "      <td>22056</td>\n",
 664 |        "      <td>2900 tyler, Detroit MI</td>\n",
 665 |        "    </tr>\n",
 666 |        "    <tr>\n",
 667 |        "      <th>1</th>\n",
 668 |        "      <td>27586</td>\n",
 669 |        "      <td>4311 central, Detroit MI</td>\n",
 670 |        "    </tr>\n",
 671 |        "    <tr>\n",
 672 |        "      <th>2</th>\n",
 673 |        "      <td>22062</td>\n",
 674 |        "      <td>1449 longfellow, Detroit MI</td>\n",
 675 |        "    </tr>\n",
 676 |        "    <tr>\n",
 677 |        "      <th>3</th>\n",
 678 |        "      <td>22084</td>\n",
 679 |        "      <td>1441 longfellow, Detroit MI</td>\n",
 680 |        "    </tr>\n",
 681 |        "    <tr>\n",
 682 |        "      <th>4</th>\n",
 683 |        "      <td>22093</td>\n",
 684 |        "      <td>2449 churchill, Detroit MI</td>\n",
 685 |        "    </tr>\n",
 686 |        "  </tbody>\n",
 687 |        "</table>\n",
 688 |        "</div>"
 689 |       ],
 690 |       "text/plain": [
 691 |        "   ticket_id                      address\n",
 692 |        "0      22056       2900 tyler, Detroit MI\n",
 693 |        "1      27586     4311 central, Detroit MI\n",
 694 |        "2      22062  1449 longfellow, Detroit MI\n",
 695 |        "3      22084  1441 longfellow, Detroit MI\n",
 696 |        "4      22093   2449 churchill, Detroit MI"
 697 |       ]
 698 |      },
 699 |      "execution_count": 14,
 700 |      "metadata": {},
 701 |      "output_type": "execute_result"
 702 |     }
 703 |    ],
 704 |    "source": [
 705 |     "address = pd.read_csv(filepath_or_buffer='addresses.csv')  # ticket_id / address lookup table\n",
 706 |     "address.head(n=5)"
 707 |    ]
 708 |   },
 709 |   {
 710 |    "cell_type": "code",
 711 |    "execution_count": 15,
 712 |    "metadata": {
 713 |     "collapsed": false
 714 |    },
 715 |    "outputs": [
 716 |     {
 717 |      "data": {
 718 |       "text/html": [
 719 |        "<div>\n",
 720 |        "<table border=\"1\" class=\"dataframe\">\n",
 721 |        "  <thead>\n",
 722 |        "    <tr style=\"text-align: right;\">\n",
 723 |        "      <th></th>\n",
 724 |        "      <th>address</th>\n",
 725 |        "      <th>lat</th>\n",
 726 |        "      <th>lon</th>\n",
 727 |        "    </tr>\n",
 728 |        "  </thead>\n",
 729 |        "  <tbody>\n",
 730 |        "    <tr>\n",
 731 |        "      <th>0</th>\n",
 732 |        "      <td>4300 rosa parks blvd, Detroit MI 48208</td>\n",
 733 |        "      <td>42.346169</td>\n",
 734 |        "      <td>-83.079962</td>\n",
 735 |        "    </tr>\n",
 736 |        "    <tr>\n",
 737 |        "      <th>1</th>\n",
 738 |        "      <td>14512 sussex, Detroit MI</td>\n",
 739 |        "      <td>42.394657</td>\n",
 740 |        "      <td>-83.194265</td>\n",
 741 |        "    </tr>\n",
 742 |        "    <tr>\n",
 743 |        "      <th>2</th>\n",
 744 |        "      <td>3456 garland, Detroit MI</td>\n",
 745 |        "      <td>42.373779</td>\n",
 746 |        "      <td>-82.986228</td>\n",
 747 |        "    </tr>\n",
 748 |        "    <tr>\n",
 749 |        "      <th>3</th>\n",
 750 |        "      <td>5787 wayburn, Detroit MI</td>\n",
 751 |        "      <td>42.403342</td>\n",
 752 |        "      <td>-82.957805</td>\n",
 753 |        "    </tr>\n",
 754 |        "    <tr>\n",
 755 |        "      <th>4</th>\n",
 756 |        "      <td>5766 haverhill, Detroit MI</td>\n",
 757 |        "      <td>42.407255</td>\n",
 758 |        "      <td>-82.946295</td>\n",
 759 |        "    </tr>\n",
 760 |        "  </tbody>\n",
 761 |        "</table>\n",
 762 |        "</div>"
 763 |       ],
 764 |       "text/plain": [
 765 |        "                                  address        lat        lon\n",
 766 |        "0  4300 rosa parks blvd, Detroit MI 48208  42.346169 -83.079962\n",
 767 |        "1                14512 sussex, Detroit MI  42.394657 -83.194265\n",
 768 |        "2                3456 garland, Detroit MI  42.373779 -82.986228\n",
 769 |        "3                5787 wayburn, Detroit MI  42.403342 -82.957805\n",
 770 |        "4              5766 haverhill, Detroit MI  42.407255 -82.946295"
 771 |       ]
 772 |      },
 773 |      "execution_count": 15,
 774 |      "metadata": {},
 775 |      "output_type": "execute_result"
 776 |     }
 777 |    ],
 778 |    "source": [
 779 |     "latlons = pd.read_csv(filepath_or_buffer='latlons.csv')  # address / lat / lon lookup table\n",
 780 |     "latlons.head(n=5)"
 781 |    ]
 782 |   },
 783 |   {
 784 |    "cell_type": "code",
 785 |    "execution_count": 22,
 786 |    "metadata": {
 787 |     "collapsed": false
 788 |    },
 789 |    "outputs": [
 790 |     {
 791 |      "data": {
 792 |       "text/html": [
 793 |        "<div>\n",
 794 |        "<table border=\"1\" class=\"dataframe\">\n",
 795 |        "  <thead>\n",
 796 |        "    <tr style=\"text-align: right;\">\n",
 797 |        "      <th></th>\n",
 798 |        "      <th>ticket_id</th>\n",
 799 |        "      <th>lat</th>\n",
 800 |        "      <th>lon</th>\n",
 801 |        "    </tr>\n",
 802 |        "    <tr>\n",
 803 |        "      <th>address</th>\n",
 804 |        "      <th></th>\n",
 805 |        "      <th></th>\n",
 806 |        "      <th></th>\n",
 807 |        "    </tr>\n",
 808 |        "  </thead>\n",
 809 |        "  <tbody>\n",
 810 |        "    <tr>\n",
 811 |        "      <th>-11064 gratiot, Detroit MI</th>\n",
 812 |        "      <td>328722</td>\n",
 813 |        "      <td>42.406935</td>\n",
 814 |        "      <td>-82.995599</td>\n",
 815 |        "    </tr>\n",
 816 |        "    <tr>\n",
 817 |        "      <th>-11871 wilfred, Detroit MI</th>\n",
 818 |        "      <td>350971</td>\n",
 819 |        "      <td>42.411288</td>\n",
 820 |        "      <td>-82.993674</td>\n",
 821 |        "    </tr>\n",
 822 |        "    <tr>\n",
 823 |        "      <th>-15126 harper, Detroit MI</th>\n",
 824 |        "      <td>344821</td>\n",
 825 |        "      <td>42.406402</td>\n",
 826 |        "      <td>-82.957525</td>\n",
 827 |        "    </tr>\n",
 828 |        "    <tr>\n",
 829 |        "      <th>0 10th st, Detroit MI</th>\n",
 830 |        "      <td>24928</td>\n",
 831 |        "      <td>42.325689</td>\n",
 832 |        "      <td>-83.064330</td>\n",
 833 |        "    </tr>\n",
 834 |        "    <tr>\n",
 835 |        "      <th>0 10th st, Detroit MI</th>\n",
 836 |        "      <td>71887</td>\n",
 837 |        "      <td>42.325689</td>\n",
 838 |        "      <td>-83.064330</td>\n",
 839 |        "    </tr>\n",
 840 |        "  </tbody>\n",
 841 |        "</table>\n",
 842 |        "</div>"
 843 |       ],
 844 |       "text/plain": [
 845 |        "                            ticket_id        lat        lon\n",
 846 |        "address                                                    \n",
 847 |        "-11064 gratiot, Detroit MI     328722  42.406935 -82.995599\n",
 848 |        "-11871 wilfred, Detroit MI     350971  42.411288 -82.993674\n",
 849 |        "-15126 harper, Detroit MI      344821  42.406402 -82.957525\n",
 850 |        "0 10th st, Detroit MI           24928  42.325689 -83.064330\n",
 851 |        "0 10th st, Detroit MI           71887  42.325689 -83.064330"
 852 |       ]
 853 |      },
 854 |      "execution_count": 22,
 855 |      "metadata": {},
 856 |      "output_type": "execute_result"
 857 |     }
 858 |    ],
 859 |    "source": [
 860 |     "address = address.set_index(keys='address').join(other=latlons.set_index(keys='address'), how='left')  # left-join lat/lon onto each ticket row, keyed by the address string\n",
 861 |     "address.head(n=5)"
 862 |    ]
 863 |   },
 864 |   {
 865 |    "cell_type": "code",
 866 |    "execution_count": 23,
 867 |    "metadata": {
 868 |     "collapsed": false
 869 |    },
 870 |    "outputs": [
 871 |     {
 872 |      "data": {
 873 |       "text/html": [
 874 |        "<div>\n",
 875 |        "<table border=\"1\" class=\"dataframe\">\n",
 876 |        "  <thead>\n",
 877 |        "    <tr style=\"text-align: right;\">\n",
 878 |        "      <th></th>\n",
 879 |        "      <th>agency_name</th>\n",
 880 |        "      <th>inspector_name</th>\n",
 881 |        "      <th>violator_name</th>\n",
 882 |        "      <th>violation_street_number</th>\n",
 883 |        "      <th>violation_street_name</th>\n",
 884 |        "      <th>violation_zip_code</th>\n",
 885 |        "      <th>mailing_address_str_number</th>\n",
 886 |        "      <th>mailing_address_str_name</th>\n",
 887 |        "      <th>city</th>\n",
 888 |        "      <th>state</th>\n",
 889 |        "      <th>...</th>\n",
 890 |        "      <th>payment_amount</th>\n",
 891 |        "      <th>balance_due</th>\n",
 892 |        "      <th>payment_date</th>\n",
 893 |        "      <th>payment_status</th>\n",
 894 |        "      <th>collection_status</th>\n",
 895 |        "      <th>grafitti_status</th>\n",
 896 |        "      <th>compliance_detail</th>\n",
 897 |        "      <th>compliance</th>\n",
 898 |        "      <th>lat</th>\n",
 899 |        "      <th>lon</th>\n",
 900 |        "    </tr>\n",
 901 |        "    <tr>\n",
 902 |        "      <th>ticket_id</th>\n",
 903 |        "      <th></th>\n",
 904 |        "      <th></th>\n",
 905 |        "      <th></th>\n",
 906 |        "      <th></th>\n",
 907 |        "      <th></th>\n",
 908 |        "      <th></th>\n",
 909 |        "      <th></th>\n",
 910 |        "      <th></th>\n",
 911 |        "      <th></th>\n",
 912 |        "      <th></th>\n",
 913 |        "      <th></th>\n",
 914 |        "      <th></th>\n",
 915 |        "      <th></th>\n",
 916 |        "      <th></th>\n",
 917 |        "      <th></th>\n",
 918 |        "      <th></th>\n",
 919 |        "      <th></th>\n",
 920 |        "      <th></th>\n",
 921 |        "      <th></th>\n",
 922 |        "      <th></th>\n",
 923 |        "      <th></th>\n",
 924 |        "    </tr>\n",
 925 |        "  </thead>\n",
 926 |        "  <tbody>\n",
 927 |        "    <tr>\n",
 928 |        "      <th>22056</th>\n",
 929 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 930 |        "      <td>Sims, Martinzie</td>\n",
 931 |        "      <td>INVESTMENT INC., MIDWEST MORTGAGE</td>\n",
 932 |        "      <td>2900.0</td>\n",
 933 |        "      <td>TYLER</td>\n",
 934 |        "      <td>NaN</td>\n",
 935 |        "      <td>3.0</td>\n",
 936 |        "      <td>S. WICKER</td>\n",
 937 |        "      <td>CHICAGO</td>\n",
 938 |        "      <td>IL</td>\n",
 939 |        "      <td>...</td>\n",
 940 |        "      <td>0.0</td>\n",
 941 |        "      <td>305.0</td>\n",
 942 |        "      <td>NaN</td>\n",
 943 |        "      <td>NO PAYMENT APPLIED</td>\n",
 944 |        "      <td>NaN</td>\n",
 945 |        "      <td>NaN</td>\n",
 946 |        "      <td>non-compliant by no payment</td>\n",
 947 |        "      <td>0.0</td>\n",
 948 |        "      <td>42.390729</td>\n",
 949 |        "      <td>-83.124268</td>\n",
 950 |        "    </tr>\n",
 951 |        "    <tr>\n",
 952 |        "      <th>27586</th>\n",
 953 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 954 |        "      <td>Williams, Darrin</td>\n",
 955 |        "      <td>Michigan, Covenant House</td>\n",
 956 |        "      <td>4311.0</td>\n",
 957 |        "      <td>CENTRAL</td>\n",
 958 |        "      <td>NaN</td>\n",
 959 |        "      <td>2959.0</td>\n",
 960 |        "      <td>Martin Luther King</td>\n",
 961 |        "      <td>Detroit</td>\n",
 962 |        "      <td>MI</td>\n",
 963 |        "      <td>...</td>\n",
 964 |        "      <td>780.0</td>\n",
 965 |        "      <td>75.0</td>\n",
 966 |        "      <td>2005-06-02 00:00:00</td>\n",
 967 |        "      <td>PAID IN FULL</td>\n",
 968 |        "      <td>NaN</td>\n",
 969 |        "      <td>NaN</td>\n",
 970 |        "      <td>compliant by late payment within 1 month</td>\n",
 971 |        "      <td>1.0</td>\n",
 972 |        "      <td>42.326937</td>\n",
 973 |        "      <td>-83.135118</td>\n",
 974 |        "    </tr>\n",
 975 |        "    <tr>\n",
 976 |        "      <th>22062</th>\n",
 977 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
 978 |        "      <td>Sims, Martinzie</td>\n",
 979 |        "      <td>SANDERS, DERRON</td>\n",
 980 |        "      <td>1449.0</td>\n",
 981 |        "      <td>LONGFELLOW</td>\n",
 982 |        "      <td>NaN</td>\n",
 983 |        "      <td>23658.0</td>\n",
 984 |        "      <td>P.O. BOX</td>\n",
 985 |        "      <td>DETROIT</td>\n",
 986 |        "      <td>MI</td>\n",
 987 |        "      <td>...</td>\n",
 988 |        "      <td>0.0</td>\n",
 989 |        "      <td>0.0</td>\n",
 990 |        "      <td>NaN</td>\n",
 991 |        "      <td>NO PAYMENT APPLIED</td>\n",
 992 |        "      <td>NaN</td>\n",
 993 |        "      <td>NaN</td>\n",
 994 |        "      <td>not responsible by disposition</td>\n",
 995 |        "      <td>NaN</td>\n",
 996 |        "      <td>42.380516</td>\n",
 997 |        "      <td>-83.096069</td>\n",
 998 |        "    </tr>\n",
 999 |        "    <tr>\n",
1000 |        "      <th>22084</th>\n",
1001 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
1002 |        "      <td>Sims, Martinzie</td>\n",
1003 |        "      <td>MOROSI, MIKE</td>\n",
1004 |        "      <td>1441.0</td>\n",
1005 |        "      <td>LONGFELLOW</td>\n",
1006 |        "      <td>NaN</td>\n",
1007 |        "      <td>5.0</td>\n",
1008 |        "      <td>ST. CLAIR</td>\n",
1009 |        "      <td>DETROIT</td>\n",
1010 |        "      <td>MI</td>\n",
1011 |        "      <td>...</td>\n",
1012 |        "      <td>0.0</td>\n",
1013 |        "      <td>0.0</td>\n",
1014 |        "      <td>NaN</td>\n",
1015 |        "      <td>NO PAYMENT APPLIED</td>\n",
1016 |        "      <td>NaN</td>\n",
1017 |        "      <td>NaN</td>\n",
1018 |        "      <td>not responsible by disposition</td>\n",
1019 |        "      <td>NaN</td>\n",
1020 |        "      <td>42.380570</td>\n",
1021 |        "      <td>-83.095919</td>\n",
1022 |        "    </tr>\n",
1023 |        "    <tr>\n",
1024 |        "      <th>22093</th>\n",
1025 |        "      <td>Buildings, Safety Engineering &amp; Env Department</td>\n",
1026 |        "      <td>Sims, Martinzie</td>\n",
1027 |        "      <td>NATHANIEL, NEAL</td>\n",
1028 |        "      <td>2449.0</td>\n",
1029 |        "      <td>CHURCHILL</td>\n",
1030 |        "      <td>NaN</td>\n",
1031 |        "      <td>7449.0</td>\n",
1032 |        "      <td>CHURCHILL</td>\n",
1033 |        "      <td>DETROIT</td>\n",
1034 |        "      <td>MI</td>\n",
1035 |        "      <td>...</td>\n",
1036 |        "      <td>0.0</td>\n",
1037 |        "      <td>0.0</td>\n",
1038 |        "      <td>NaN</td>\n",
1039 |        "      <td>NO PAYMENT APPLIED</td>\n",
1040 |        "      <td>NaN</td>\n",
1041 |        "      <td>NaN</td>\n",
1042 |        "      <td>not responsible by disposition</td>\n",
1043 |        "      <td>NaN</td>\n",
1044 |        "      <td>42.145257</td>\n",
1045 |        "      <td>-83.208233</td>\n",
1046 |        "    </tr>\n",
1047 |        "  </tbody>\n",
1048 |        "</table>\n",
1049 |        "<p>5 rows × 35 columns</p>\n",
1050 |        "</div>"
1051 |       ],
1052 |       "text/plain": [
1053 |        "                                              agency_name    inspector_name  \\\n",
1054 |        "ticket_id                                                                     \n",
1055 |        "22056      Buildings, Safety Engineering & Env Department   Sims, Martinzie   \n",
1056 |        "27586      Buildings, Safety Engineering & Env Department  Williams, Darrin   \n",
1057 |        "22062      Buildings, Safety Engineering & Env Department   Sims, Martinzie   \n",
1058 |        "22084      Buildings, Safety Engineering & Env Department   Sims, Martinzie   \n",
1059 |        "22093      Buildings, Safety Engineering & Env Department   Sims, Martinzie   \n",
1060 |        "\n",
1061 |        "                               violator_name  violation_street_number  \\\n",
1062 |        "ticket_id                                                               \n",
1063 |        "22056      INVESTMENT INC., MIDWEST MORTGAGE                   2900.0   \n",
1064 |        "27586               Michigan, Covenant House                   4311.0   \n",
1065 |        "22062                        SANDERS, DERRON                   1449.0   \n",
1066 |        "22084                           MOROSI, MIKE                   1441.0   \n",
1067 |        "22093                        NATHANIEL, NEAL                   2449.0   \n",
1068 |        "\n",
1069 |        "          violation_street_name  violation_zip_code  \\\n",
1070 |        "ticket_id                                             \n",
1071 |        "22056                     TYLER                 NaN   \n",
1072 |        "27586                   CENTRAL                 NaN   \n",
1073 |        "22062                LONGFELLOW                 NaN   \n",
1074 |        "22084                LONGFELLOW                 NaN   \n",
1075 |        "22093                 CHURCHILL                 NaN   \n",
1076 |        "\n",
1077 |        "           mailing_address_str_number mailing_address_str_name     city state  \\\n",
1078 |        "ticket_id                                                                       \n",
1079 |        "22056                             3.0                S. WICKER  CHICAGO    IL   \n",
1080 |        "27586                          2959.0       Martin Luther King  Detroit    MI   \n",
1081 |        "22062                         23658.0                 P.O. BOX  DETROIT    MI   \n",
1082 |        "22084                             5.0                ST. CLAIR  DETROIT    MI   \n",
1083 |        "22093                          7449.0                CHURCHILL  DETROIT    MI   \n",
1084 |        "\n",
1085 |        "             ...     payment_amount balance_due         payment_date  \\\n",
1086 |        "ticket_id    ...                                                       \n",
1087 |        "22056        ...                0.0       305.0                  NaN   \n",
1088 |        "27586        ...              780.0        75.0  2005-06-02 00:00:00   \n",
1089 |        "22062        ...                0.0         0.0                  NaN   \n",
1090 |        "22084        ...                0.0         0.0                  NaN   \n",
1091 |        "22093        ...                0.0         0.0                  NaN   \n",
1092 |        "\n",
1093 |        "               payment_status collection_status grafitti_status  \\\n",
1094 |        "ticket_id                                                         \n",
1095 |        "22056      NO PAYMENT APPLIED               NaN             NaN   \n",
1096 |        "27586            PAID IN FULL               NaN             NaN   \n",
1097 |        "22062      NO PAYMENT APPLIED               NaN             NaN   \n",
1098 |        "22084      NO PAYMENT APPLIED               NaN             NaN   \n",
1099 |        "22093      NO PAYMENT APPLIED               NaN             NaN   \n",
1100 |        "\n",
1101 |        "                                  compliance_detail compliance        lat  \\\n",
1102 |        "ticket_id                                                                   \n",
1103 |        "22056                   non-compliant by no payment        0.0  42.390729   \n",
1104 |        "27586      compliant by late payment within 1 month        1.0  42.326937   \n",
1105 |        "22062                not responsible by disposition        NaN  42.380516   \n",
1106 |        "22084                not responsible by disposition        NaN  42.380570   \n",
1107 |        "22093                not responsible by disposition        NaN  42.145257   \n",
1108 |        "\n",
1109 |        "                 lon  \n",
1110 |        "ticket_id             \n",
1111 |        "22056     -83.124268  \n",
1112 |        "27586     -83.135118  \n",
1113 |        "22062     -83.096069  \n",
1114 |        "22084     -83.095919  \n",
1115 |        "22093     -83.208233  \n",
1116 |        "\n",
1117 |        "[5 rows x 35 columns]"
1118 |       ]
1119 |      },
1120 |      "execution_count": 23,
1121 |      "metadata": {},
1122 |      "output_type": "execute_result"
1123 |     }
1124 |    ],
1125 |    "source": [
1126 |     "train_data = train_data.set_index('ticket_id').join(address.set_index('ticket_id'))\n",
1127 |     "train_data.head()"
1128 |    ]
1129 |   },
1130 |   {
1131 |    "cell_type": "code",
1132 |    "execution_count": 27,
1133 |    "metadata": {
1134 |     "collapsed": false
1135 |    },
1136 |    "outputs": [
1137 |     {
1138 |      "data": {
1139 |       "text/html": [
1140 |        "<div>\n",
1141 |        "<table border=\"1\" class=\"dataframe\">\n",
1142 |        "  <thead>\n",
1143 |        "    <tr style=\"text-align: right;\">\n",
1144 |        "      <th></th>\n",
1145 |        "      <th>agency_name</th>\n",
1146 |        "      <th>inspector_name</th>\n",
1147 |        "      <th>violator_name</th>\n",
1148 |        "      <th>violation_street_number</th>\n",
1149 |        "      <th>violation_street_name</th>\n",
1150 |        "      <th>violation_zip_code</th>\n",
1151 |        "      <th>mailing_address_str_number</th>\n",
1152 |        "      <th>mailing_address_str_name</th>\n",
1153 |        "      <th>city</th>\n",
1154 |        "      <th>state</th>\n",
1155 |        "      <th>...</th>\n",
1156 |        "      <th>fine_amount</th>\n",
1157 |        "      <th>admin_fee</th>\n",
1158 |        "      <th>state_fee</th>\n",
1159 |        "      <th>late_fee</th>\n",
1160 |        "      <th>discount_amount</th>\n",
1161 |        "      <th>clean_up_cost</th>\n",
1162 |        "      <th>judgment_amount</th>\n",
1163 |        "      <th>grafitti_status</th>\n",
1164 |        "      <th>lat</th>\n",
1165 |        "      <th>lon</th>\n",
1166 |        "    </tr>\n",
1167 |        "    <tr>\n",
1168 |        "      <th>ticket_id</th>\n",
1169 |        "      <th></th>\n",
1170 |        "      <th></th>\n",
1171 |        "      <th></th>\n",
1172 |        "      <th></th>\n",
1173 |        "      <th></th>\n",
1174 |        "      <th></th>\n",
1175 |        "      <th></th>\n",
1176 |        "      <th></th>\n",
1177 |        "      <th></th>\n",
1178 |        "      <th></th>\n",
1179 |        "      <th></th>\n",
1180 |        "      <th></th>\n",
1181 |        "      <th></th>\n",
1182 |        "      <th></th>\n",
1183 |        "      <th></th>\n",
1184 |        "      <th></th>\n",
1185 |        "      <th></th>\n",
1186 |        "      <th></th>\n",
1187 |        "      <th></th>\n",
1188 |        "      <th></th>\n",
1189 |        "      <th></th>\n",
1190 |        "    </tr>\n",
1191 |        "  </thead>\n",
1192 |        "  <tbody>\n",
1193 |        "    <tr>\n",
1194 |        "      <th>284932</th>\n",
1195 |        "      <td>Department of Public Works</td>\n",
1196 |        "      <td>Granberry, Aisha B</td>\n",
1197 |        "      <td>FLUELLEN, JOHN A</td>\n",
1198 |        "      <td>10041.0</td>\n",
1199 |        "      <td>ROSEBERRY</td>\n",
1200 |        "      <td>NaN</td>\n",
1201 |        "      <td>141</td>\n",
1202 |        "      <td>ROSEBERRY</td>\n",
1203 |        "      <td>DETROIT</td>\n",
1204 |        "      <td>MI</td>\n",
1205 |        "      <td>...</td>\n",
1206 |        "      <td>200.0</td>\n",
1207 |        "      <td>20.0</td>\n",
1208 |        "      <td>10.0</td>\n",
1209 |        "      <td>20.0</td>\n",
1210 |        "      <td>0.0</td>\n",
1211 |        "      <td>0.0</td>\n",
1212 |        "      <td>250.0</td>\n",
1213 |        "      <td>NaN</td>\n",
1214 |        "      <td>42.407581</td>\n",
1215 |        "      <td>-82.986642</td>\n",
1216 |        "    </tr>\n",
1217 |        "    <tr>\n",
1218 |        "      <th>285362</th>\n",
1219 |        "      <td>Department of Public Works</td>\n",
1220 |        "      <td>Lusk, Gertrina</td>\n",
1221 |        "      <td>WHIGHAM, THELMA</td>\n",
1222 |        "      <td>18520.0</td>\n",
1223 |        "      <td>EVERGREEN</td>\n",
1224 |        "      <td>NaN</td>\n",
1225 |        "      <td>19136</td>\n",
1226 |        "      <td>GLASTONBURY</td>\n",
1227 |        "      <td>DETROIT</td>\n",
1228 |        "      <td>MI</td>\n",
1229 |        "      <td>...</td>\n",
1230 |        "      <td>1000.0</td>\n",
1231 |        "      <td>20.0</td>\n",
1232 |        "      <td>10.0</td>\n",
1233 |        "      <td>100.0</td>\n",
1234 |        "      <td>0.0</td>\n",
1235 |        "      <td>0.0</td>\n",
1236 |        "      <td>1130.0</td>\n",
1237 |        "      <td>NaN</td>\n",
1238 |        "      <td>42.426239</td>\n",
1239 |        "      <td>-83.238259</td>\n",
1240 |        "    </tr>\n",
1241 |        "    <tr>\n",
1242 |        "      <th>285361</th>\n",
1243 |        "      <td>Department of Public Works</td>\n",
1244 |        "      <td>Lusk, Gertrina</td>\n",
1245 |        "      <td>WHIGHAM, THELMA</td>\n",
1246 |        "      <td>18520.0</td>\n",
1247 |        "      <td>EVERGREEN</td>\n",
1248 |        "      <td>NaN</td>\n",
1249 |        "      <td>19136</td>\n",
1250 |        "      <td>GLASTONBURY</td>\n",
1251 |        "      <td>DETROIT</td>\n",
1252 |        "      <td>MI</td>\n",
1253 |        "      <td>...</td>\n",
1254 |        "      <td>100.0</td>\n",
1255 |        "      <td>20.0</td>\n",
1256 |        "      <td>10.0</td>\n",
1257 |        "      <td>10.0</td>\n",
1258 |        "      <td>0.0</td>\n",
1259 |        "      <td>0.0</td>\n",
1260 |        "      <td>140.0</td>\n",
1261 |        "      <td>NaN</td>\n",
1262 |        "      <td>42.426239</td>\n",
1263 |        "      <td>-83.238259</td>\n",
1264 |        "    </tr>\n",
1265 |        "    <tr>\n",
1266 |        "      <th>285338</th>\n",
1267 |        "      <td>Department of Public Works</td>\n",
1268 |        "      <td>Talbert, Reginald</td>\n",
1269 |        "      <td>HARABEDIEN, POPKIN</td>\n",
1270 |        "      <td>1835.0</td>\n",
1271 |        "      <td>CENTRAL</td>\n",
1272 |        "      <td>NaN</td>\n",
1273 |        "      <td>2246</td>\n",
1274 |        "      <td>NELSON</td>\n",
1275 |        "      <td>WOODHAVEN</td>\n",
1276 |        "      <td>MI</td>\n",
1277 |        "      <td>...</td>\n",
1278 |        "      <td>200.0</td>\n",
1279 |        "      <td>20.0</td>\n",
1280 |        "      <td>10.0</td>\n",
1281 |        "      <td>20.0</td>\n",
1282 |        "      <td>0.0</td>\n",
1283 |        "      <td>0.0</td>\n",
1284 |        "      <td>250.0</td>\n",
1285 |        "      <td>NaN</td>\n",
1286 |        "      <td>42.309661</td>\n",
1287 |        "      <td>-83.122426</td>\n",
1288 |        "    </tr>\n",
1289 |        "    <tr>\n",
1290 |        "      <th>285346</th>\n",
1291 |        "      <td>Department of Public Works</td>\n",
1292 |        "      <td>Talbert, Reginald</td>\n",
1293 |        "      <td>CORBELL, STANLEY</td>\n",
1294 |        "      <td>1700.0</td>\n",
1295 |        "      <td>CENTRAL</td>\n",
1296 |        "      <td>NaN</td>\n",
1297 |        "      <td>3435</td>\n",
1298 |        "      <td>MUNGER</td>\n",
1299 |        "      <td>LIVONIA</td>\n",
1300 |        "      <td>MI</td>\n",
1301 |        "      <td>...</td>\n",
1302 |        "      <td>100.0</td>\n",
1303 |        "      <td>20.0</td>\n",
1304 |        "      <td>10.0</td>\n",
1305 |        "      <td>10.0</td>\n",
1306 |        "      <td>0.0</td>\n",
1307 |        "      <td>0.0</td>\n",
1308 |        "      <td>140.0</td>\n",
1309 |        "      <td>NaN</td>\n",
1310 |        "      <td>42.308830</td>\n",
1311 |        "      <td>-83.121116</td>\n",
1312 |        "    </tr>\n",
1313 |        "  </tbody>\n",
1314 |        "</table>\n",
1315 |        "<p>5 rows × 28 columns</p>\n",
1316 |        "</div>"
1317 |       ],
1318 |       "text/plain": [
1319 |        "                          agency_name      inspector_name       violator_name  \\\n",
1320 |        "ticket_id                                                                       \n",
1321 |        "284932     Department of Public Works  Granberry, Aisha B    FLUELLEN, JOHN A   \n",
1322 |        "285362     Department of Public Works      Lusk, Gertrina     WHIGHAM, THELMA   \n",
1323 |        "285361     Department of Public Works      Lusk, Gertrina     WHIGHAM, THELMA   \n",
1324 |        "285338     Department of Public Works   Talbert, Reginald  HARABEDIEN, POPKIN   \n",
1325 |        "285346     Department of Public Works   Talbert, Reginald    CORBELL, STANLEY   \n",
1326 |        "\n",
1327 |        "           violation_street_number violation_street_name violation_zip_code  \\\n",
1328 |        "ticket_id                                                                     \n",
1329 |        "284932                     10041.0             ROSEBERRY                NaN   \n",
1330 |        "285362                     18520.0             EVERGREEN                NaN   \n",
1331 |        "285361                     18520.0             EVERGREEN                NaN   \n",
1332 |        "285338                      1835.0               CENTRAL                NaN   \n",
1333 |        "285346                      1700.0               CENTRAL                NaN   \n",
1334 |        "\n",
1335 |        "          mailing_address_str_number mailing_address_str_name       city  \\\n",
1336 |        "ticket_id                                                                  \n",
1337 |        "284932                           141                ROSEBERRY    DETROIT   \n",
1338 |        "285362                         19136              GLASTONBURY    DETROIT   \n",
1339 |        "285361                         19136              GLASTONBURY    DETROIT   \n",
1340 |        "285338                          2246                   NELSON  WOODHAVEN   \n",
1341 |        "285346                          3435                   MUNGER    LIVONIA   \n",
1342 |        "\n",
1343 |        "          state    ...     fine_amount  admin_fee state_fee late_fee  \\\n",
1344 |        "ticket_id          ...                                                 \n",
1345 |        "284932       MI    ...           200.0       20.0      10.0     20.0   \n",
1346 |        "285362       MI    ...          1000.0       20.0      10.0    100.0   \n",
1347 |        "285361       MI    ...           100.0       20.0      10.0     10.0   \n",
1348 |        "285338       MI    ...           200.0       20.0      10.0     20.0   \n",
1349 |        "285346       MI    ...           100.0       20.0      10.0     10.0   \n",
1350 |        "\n",
1351 |        "          discount_amount clean_up_cost judgment_amount grafitti_status  \\\n",
1352 |        "ticket_id                                                                 \n",
1353 |        "284932                0.0           0.0           250.0             NaN   \n",
1354 |        "285362                0.0           0.0          1130.0             NaN   \n",
1355 |        "285361                0.0           0.0           140.0             NaN   \n",
1356 |        "285338                0.0           0.0           250.0             NaN   \n",
1357 |        "285346                0.0           0.0           140.0             NaN   \n",
1358 |        "\n",
1359 |        "                 lat        lon  \n",
1360 |        "ticket_id                        \n",
1361 |        "284932     42.407581 -82.986642  \n",
1362 |        "285362     42.426239 -83.238259  \n",
1363 |        "285361     42.426239 -83.238259  \n",
1364 |        "285338     42.309661 -83.122426  \n",
1365 |        "285346     42.308830 -83.121116  \n",
1366 |        "\n",
1367 |        "[5 rows x 28 columns]"
1368 |       ]
1369 |      },
1370 |      "execution_count": 27,
1371 |      "metadata": {},
1372 |      "output_type": "execute_result"
1373 |     }
1374 |    ],
1375 |    "source": [
1376 |     "test_data = test_data.set_index('ticket_id').join(address.set_index('ticket_id'))\n",
1377 |     "test_data.head()"
1378 |    ]
1379 |   },
1380 |   {
1381 |    "cell_type": "code",
1382 |    "execution_count": 44,
1383 |    "metadata": {
1384 |     "collapsed": false
1385 |    },
1386 |    "outputs": [
1387 |     {
1388 |      "data": {
1389 |       "text/plain": [
1390 |        "(237790, 35)"
1391 |       ]
1392 |      },
1393 |      "execution_count": 44,
1394 |      "metadata": {},
1395 |      "output_type": "execute_result"
1396 |     }
1397 |    ],
1398 |    "source": [
1399 |     "train_data[train_data['late_fee']!=10].shape"
1400 |    ]
1401 |   },
1402 |   {
1403 |    "cell_type": "code",
1404 |    "execution_count": 45,
1405 |    "metadata": {
1406 |     "collapsed": true
1407 |    },
1408 |    "outputs": [],
1409 |    "source": [
1410 |     "train_data = train_data[(train_data['compliance'] == 0) | (train_data['compliance'] == 1)]"
1411 |    ]
1412 |   },
1413 |   {
1414 |    "cell_type": "code",
1415 |    "execution_count": 46,
1416 |    "metadata": {
1417 |     "collapsed": false
1418 |    },
1419 |    "outputs": [
1420 |     {
1421 |      "data": {
1422 |       "text/plain": [
1423 |        "(159880, 35)"
1424 |       ]
1425 |      },
1426 |      "execution_count": 46,
1427 |      "metadata": {},
1428 |      "output_type": "execute_result"
1429 |     }
1430 |    ],
1431 |    "source": [
1432 |     "train_data.shape"
1433 |    ]
1434 |   },
1435 |   {
1436 |    "cell_type": "code",
1437 |    "execution_count": 51,
1438 |    "metadata": {
1439 |     "collapsed": false
1440 |    },
1441 |    "outputs": [
1442 |     {
1443 |      "data": {
1444 |       "text/plain": [
1445 |        "189"
1446 |       ]
1447 |      },
1448 |      "execution_count": 51,
1449 |      "metadata": {},
1450 |      "output_type": "execute_result"
1451 |     }
1452 |    ],
1453 |    "source": [
1454 |     "len(train_data['violation_code'].unique())"
1455 |    ]
1456 |   },
1457 |   {
1458 |    "cell_type": "code",
1459 |    "execution_count": 52,
1460 |    "metadata": {
1461 |     "collapsed": false
1462 |    },
1463 |    "outputs": [
1464 |     {
1465 |      "data": {
1466 |       "text/plain": [
1467 |        "4093"
1468 |       ]
1469 |      },
1470 |      "execution_count": 52,
1471 |      "metadata": {},
1472 |      "output_type": "execute_result"
1473 |     }
1474 |    ],
1475 |    "source": [
1476 |     "len(train_data['city'].unique())"
1477 |    ]
1478 |   },
1479 |   {
1480 |    "cell_type": "code",
1481 |    "execution_count": 53,
1482 |    "metadata": {
1483 |     "collapsed": false
1484 |    },
1485 |    "outputs": [
1486 |     {
1487 |      "data": {
1488 |       "text/plain": [
1489 |        "60"
1490 |       ]
1491 |      },
1492 |      "execution_count": 53,
1493 |      "metadata": {},
1494 |      "output_type": "execute_result"
1495 |     }
1496 |    ],
1497 |    "source": [
1498 |     "len(train_data['state'].unique())"
1499 |    ]
1500 |   },
1501 |   {
1502 |    "cell_type": "code",
1503 |    "execution_count": 54,
1504 |    "metadata": {
1505 |     "collapsed": false
1506 |    },
1507 |    "outputs": [
1508 |     {
1509 |      "data": {
1510 |       "text/plain": [
1511 |        "5"
1512 |       ]
1513 |      },
1514 |      "execution_count": 54,
1515 |      "metadata": {},
1516 |      "output_type": "execute_result"
1517 |     }
1518 |    ],
1519 |    "source": [
1520 |     "len(train_data['agency_name'].unique())"
1521 |    ]
1522 |   },
1523 |   {
1524 |    "cell_type": "code",
1525 |    "execution_count": 95,
1526 |    "metadata": {
1527 |     "collapsed": false
1528 |    },
1529 |    "outputs": [
1530 |     {
1531 |      "data": {
1532 |       "text/plain": [
1533 |        "3"
1534 |       ]
1535 |      },
1536 |      "execution_count": 95,
1537 |      "metadata": {},
1538 |      "output_type": "execute_result"
1539 |     }
1540 |    ],
1541 |    "source": [
1542 |     "len(test_data['agency_name'].unique())"
1543 |    ]
1544 |   },
1545 |   {
1546 |    "cell_type": "code",
1547 |    "execution_count": 55,
1548 |    "metadata": {
1549 |     "collapsed": false
1550 |    },
1551 |    "outputs": [
1552 |     {
1553 |      "data": {
1554 |       "text/plain": [
1555 |        "4"
1556 |       ]
1557 |      },
1558 |      "execution_count": 55,
1559 |      "metadata": {},
1560 |      "output_type": "execute_result"
1561 |     }
1562 |    ],
1563 |    "source": [
1564 |     "len(train_data['disposition'].unique())"
1565 |    ]
1566 |   },
1567 |   {
1568 |    "cell_type": "code",
1569 |    "execution_count": 56,
1570 |    "metadata": {
1571 |     "collapsed": false
1572 |    },
1573 |    "outputs": [
1574 |     {
1575 |      "data": {
1576 |       "text/plain": [
1577 |        "ticket_id\n",
1578 |        "22056    2004-03-16 11:40:00\n",
1579 |        "27586    2004-04-23 12:30:00\n",
1580 |        "22046    2004-05-01 11:50:00\n",
1581 |        "18738    2004-06-14 14:15:00\n",
1582 |        "18735    2004-06-16 12:30:00\n",
1583 |        "Name: ticket_issued_date, dtype: object"
1584 |       ]
1585 |      },
1586 |      "execution_count": 56,
1587 |      "metadata": {},
1588 |      "output_type": "execute_result"
1589 |     }
1590 |    ],
1591 |    "source": [
1592 |     "train_data['ticket_issued_date'].head()"
1593 |    ]
1594 |   },
1595 |   {
1596 |    "cell_type": "code",
1597 |    "execution_count": 62,
1598 |    "metadata": {
1599 |     "collapsed": false
1600 |    },
1601 |    "outputs": [
1602 |     {
1603 |      "data": {
1604 |       "text/html": [
1605 |        "<div>\n",
1606 |        "<table border=\"1\" class=\"dataframe\">\n",
1607 |        "  <thead>\n",
1608 |        "    <tr style=\"text-align: right;\">\n",
1609 |        "      <th></th>\n",
1610 |        "      <th>agency_name</th>\n",
1611 |        "      <th>inspector_name</th>\n",
1612 |        "      <th>violator_name</th>\n",
1613 |        "      <th>violation_street_number</th>\n",
1614 |        "      <th>violation_street_name</th>\n",
1615 |        "      <th>violation_zip_code</th>\n",
1616 |        "      <th>mailing_address_str_number</th>\n",
1617 |        "      <th>mailing_address_str_name</th>\n",
1618 |        "      <th>city</th>\n",
1619 |        "      <th>state</th>\n",
1620 |        "      <th>...</th>\n",
1621 |        "      <th>payment_amount</th>\n",
1622 |        "      <th>balance_due</th>\n",
1623 |        "      <th>payment_date</th>\n",
1624 |        "      <th>payment_status</th>\n",
1625 |        "      <th>collection_status</th>\n",
1626 |        "      <th>grafitti_status</th>\n",
1627 |        "      <th>compliance_detail</th>\n",
1628 |        "      <th>compliance</th>\n",
1629 |        "      <th>lat</th>\n",
1630 |        "      <th>lon</th>\n",
1631 |        "    </tr>\n",
1632 |        "    <tr>\n",
1633 |        "      <th>ticket_id</th>\n",
1634 |        "      <th></th>\n",
1635 |        "      <th></th>\n",
1636 |        "      <th></th>\n",
1637 |        "      <th></th>\n",
1638 |        "      <th></th>\n",
1639 |        "      <th></th>\n",
1640 |        "      <th></th>\n",
1641 |        "      <th></th>\n",
1642 |        "      <th></th>\n",
1643 |        "      <th></th>\n",
1644 |        "      <th></th>\n",
1645 |        "      <th></th>\n",
1646 |        "      <th></th>\n",
1647 |        "      <th></th>\n",
1648 |        "      <th></th>\n",
1649 |        "      <th></th>\n",
1650 |        "      <th></th>\n",
1651 |        "      <th></th>\n",
1652 |        "      <th></th>\n",
1653 |        "      <th></th>\n",
1654 |        "      <th></th>\n",
1655 |        "    </tr>\n",
1656 |        "  </thead>\n",
1657 |        "  <tbody>\n",
1658 |        "  </tbody>\n",
1659 |        "</table>\n",
1660 |        "<p>0 rows × 35 columns</p>\n",
1661 |        "</div>"
1662 |       ],
1663 |       "text/plain": [
1664 |        "Empty DataFrame\n",
1665 |        "Columns: [agency_name, inspector_name, violator_name, violation_street_number, violation_street_name, violation_zip_code, mailing_address_str_number, mailing_address_str_name, city, state, zip_code, non_us_str_code, country, ticket_issued_date, hearing_date, violation_code, violation_description, disposition, fine_amount, admin_fee, state_fee, late_fee, discount_amount, clean_up_cost, judgment_amount, payment_amount, balance_due, payment_date, payment_status, collection_status, grafitti_status, compliance_detail, compliance, lat, lon]\n",
1666 |        "Index: []\n",
1667 |        "\n",
1668 |        "[0 rows x 35 columns]"
1669 |       ]
1670 |      },
1671 |      "execution_count": 62,
1672 |      "metadata": {},
1673 |      "output_type": "execute_result"
1674 |     }
1675 |    ],
1676 |    "source": [
1677 |     "train_data[train_data['ticket_issued_date'].isnull()]"
1678 |    ]
1679 |   },
1680 |   {
1681 |    "cell_type": "code",
1682 |    "execution_count": 67,
1683 |    "metadata": {
1684 |     "collapsed": false
1685 |    },
1686 |    "outputs": [],
1687 |    "source": [
1688 |     "train_data = train_data[~train_data['hearing_date'].isnull()]"
1689 |    ]
1690 |   },
1691 |   {
1692 |    "cell_type": "code",
1693 |    "execution_count": 78,
1694 |    "metadata": {
1695 |     "collapsed": false
1696 |    },
1697 |    "outputs": [
1698 |     {
1699 |      "data": {
1700 |       "text/plain": [
1701 |        "2197"
1702 |       ]
1703 |      },
1704 |      "execution_count": 78,
1705 |      "metadata": {},
1706 |      "output_type": "execute_result"
1707 |     }
1708 |    ],
1709 |    "source": [
1710 |     "len(test_data[test_data['hearing_date'].isnull()])"
1711 |    ]
1712 |   },
1713 |   {
1714 |    "cell_type": "code",
1715 |    "execution_count": 79,
1716 |    "metadata": {
1717 |     "collapsed": false
1718 |    },
1719 |    "outputs": [
1720 |     {
1721 |      "data": {
1722 |       "text/plain": [
1723 |        "0"
1724 |       ]
1725 |      },
1726 |      "execution_count": 79,
1727 |      "metadata": {},
1728 |      "output_type": "execute_result"
1729 |     }
1730 |    ],
1731 |    "source": [
1732 |     "len(test_data[test_data['ticket_issued_date'].isnull()])"
1733 |    ]
1734 |   },
1735 |   {
1736 |    "cell_type": "code",
1737 |    "execution_count": 68,
1738 |    "metadata": {
1739 |     "collapsed": false
1740 |    },
1741 |    "outputs": [
1742 |     {
1743 |      "data": {
1744 |       "text/plain": [
1745 |        "ticket_id\n",
1746 |        "22056    2005-03-21 10:30:00\n",
1747 |        "27586    2005-05-06 13:30:00\n",
1748 |        "22046    2005-03-21 10:30:00\n",
1749 |        "18738    2005-02-22 15:00:00\n",
1750 |        "18735    2005-02-22 15:00:00\n",
1751 |        "Name: hearing_date, dtype: object"
1752 |       ]
1753 |      },
1754 |      "execution_count": 68,
1755 |      "metadata": {},
1756 |      "output_type": "execute_result"
1757 |     }
1758 |    ],
1759 |    "source": [
1760 |     "train_data['hearing_date'].head()"
1761 |    ]
1762 |   },
1763 |   {
1764 |    "cell_type": "code",
1765 |    "execution_count": 86,
1766 |    "metadata": {
1767 |     "collapsed": true
1768 |    },
1769 |    "outputs": [],
1770 |    "source": [
1771 |     "from datetime import datetime\n",
1772 |     "def time_gap(hearing_date_str, ticket_issued_date_str):\n",
1773 |     "    # Days from ticket issue to hearing; 73 stands in when the hearing date is empty or NaN (not a str).\n",
1774 |     "    if not hearing_date_str or not isinstance(hearing_date_str, str): return 73\n",
1775 |     "    date_format = \"%Y-%m-%d %H:%M:%S\"\n",
1776 |     "    gap = datetime.strptime(hearing_date_str, date_format) - datetime.strptime(ticket_issued_date_str, date_format)\n",
1777 |     "    return gap.days"
1778 |    ]
1779 |   },
1780 |   {
1781 |    "cell_type": "code",
1782 |    "execution_count": 76,
1783 |    "metadata": {
1784 |     "collapsed": false
1785 |    },
1786 |    "outputs": [
1787 |     {
1788 |      "data": {
1789 |       "text/plain": [
1790 |        "251"
1791 |       ]
1792 |      },
1793 |      "execution_count": 76,
1794 |      "metadata": {},
1795 |      "output_type": "execute_result"
1796 |     }
1797 |    ],
1798 |    "source": [
1799 |     "gap = datetime.strptime(\"2005-02-22 15:00:00\", \"%Y-%m-%d %H:%M:%S\") - datetime.strptime(\"2004-06-16 12:30:00\", \"%Y-%m-%d %H:%M:%S\")\n",
1800 |     "gap.days"
1801 |    ]
1802 |   },
1803 |   {
1804 |    "cell_type": "code",
1805 |    "execution_count": 83,
1806 |    "metadata": {
1807 |     "collapsed": false
1808 |    },
1809 |    "outputs": [],
1810 |    "source": [
1811 |     "train_data['time_gap'] = train_data.apply(lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)"
1812 |    ]
1813 |   },
1814 |   {
1815 |    "cell_type": "code",
1816 |    "execution_count": 85,
1817 |    "metadata": {
1818 |     "collapsed": false
1819 |    },
1820 |    "outputs": [
1821 |     {
1822 |      "data": {
1823 |       "text/plain": [
1824 |        "72.647410321133961"
1825 |       ]
1826 |      },
1827 |      "execution_count": 85,
1828 |      "metadata": {},
1829 |      "output_type": "execute_result"
1830 |     }
1831 |    ],
1832 |    "source": [
1833 |     "train_data['time_gap'].mean()"
1834 |    ]
1835 |   },
1836 |   {
1837 |    "cell_type": "code",
1838 |    "execution_count": null,
1839 |    "metadata": {
1840 |     "collapsed": true
1841 |    },
1842 |    "outputs": [],
1843 |    "source": []
1844 |   },
1845 |   {
1846 |    "cell_type": "code",
1847 |    "execution_count": 87,
1848 |    "metadata": {
1849 |     "collapsed": true
1850 |    },
1851 |    "outputs": [],
1852 |    "source": [
1853 |     "feature_to_be_splitted = ['agency_name', 'state', 'disposition']"
1854 |    ]
1855 |   },
1856 |   {
1857 |    "cell_type": "code",
1858 |    "execution_count": 98,
1859 |    "metadata": {
1860 |     "collapsed": false
1861 |    },
1862 |    "outputs": [
1863 |     {
1864 |      "data": {
1865 |       "text/plain": [
1866 |        "True"
1867 |       ]
1868 |      },
1869 |      "execution_count": 98,
1870 |      "metadata": {},
1871 |      "output_type": "execute_result"
1872 |     }
1873 |    ],
1874 |    "source": [
1875 |     "'balance_due' in train_data"
1876 |    ]
1877 |   },
1878 |   {
1879 |    "cell_type": "code",
1880 |    "execution_count": 99,
1881 |    "metadata": {
1882 |     "collapsed": false
1883 |    },
1884 |    "outputs": [],
1885 |    "source": [
1886 |     "import pandas as pd\n",
1887 |     "import numpy as np\n",
1888 |     "\n",
1889 |     "def blight_model():\n",
1890 |     "    \"\"\"Train an MLP on the blight tickets and score every ticket in test.csv.\n",
1891 |     "\n",
1892 |     "    Reads train.csv, test.csv, addresses.csv and latlons.csv from the working directory.\n",
1893 |     "    Returns a float Series named 'compliance', indexed by ticket_id, holding the\n",
1894 |     "    predicted probability of class 1 for each test ticket.\n",
1895 |     "    \"\"\"\n",
1896 |     "    from sklearn.neural_network import MLPClassifier\n",
1897 |     "    from sklearn.preprocessing import MinMaxScaler\n",
1898 |     "    from datetime import datetime\n",
1899 |     "\n",
1900 |     "    date_format = \"%Y-%m-%d %H:%M:%S\"\n",
1901 |     "    default_gap_days = 73  # used when no hearing date is recorded\n",
1902 |     "\n",
1903 |     "    def time_gap(hearing_date_str, ticket_issued_date_str):\n",
1904 |     "        # Missing dates are read from the CSV as NaN (a float), hence the type check.\n",
1905 |     "        if not isinstance(hearing_date_str, str) or not hearing_date_str:\n",
1906 |     "            return default_gap_days\n",
1907 |     "        hearing_date = datetime.strptime(hearing_date_str, date_format)\n",
1908 |     "        ticket_issued_date = datetime.strptime(ticket_issued_date_str, date_format)\n",
1909 |     "        return (hearing_date - ticket_issued_date).days\n",
1910 |     "\n",
1911 |     "    train_data = pd.read_csv('train.csv', encoding = 'ISO-8859-1')\n",
1912 |     "    # Read once, with the same encoding as train.csv; this frame also supplies the output index.\n",
1913 |     "    test_data = pd.read_csv('test.csv', encoding = 'ISO-8859-1')\n",
1914 |     "    # Keep only rows whose label is 0 or 1.\n",
1915 |     "    train_data = train_data[(train_data['compliance'] == 0) | (train_data['compliance'] == 1)]\n",
1916 |     "    address = pd.read_csv('addresses.csv')\n",
1917 |     "    latlons = pd.read_csv('latlons.csv')\n",
1918 |     "    address = address.set_index('address').join(latlons.set_index('address'), how='left')\n",
1919 |     "    address = address.set_index('ticket_id')\n",
1920 |     "    train_data = train_data.set_index('ticket_id').join(address)\n",
1921 |     "    test_data = test_data.set_index('ticket_id').join(address)\n",
1922 |     "    # .copy() so the assignments below write to an independent frame, not a slice.\n",
1923 |     "    train_data = train_data[~train_data['hearing_date'].isnull()].copy()\n",
1924 |     "\n",
1925 |     "    for frame in (train_data, test_data):\n",
1926 |     "        frame['time_gap'] = frame.apply(\n",
1927 |     "            lambda row: time_gap(row['hearing_date'], row['ticket_issued_date']), axis=1)\n",
1928 |     "        # Forward-fill by assignment: fillna(inplace=True) on frame.col may only change a copy.\n",
1929 |     "        for column in ('lat', 'lon', 'state'):\n",
1930 |     "            frame[column] = frame[column].ffill()\n",
1931 |     "\n",
1932 |     "    feature_to_be_splitted = ['agency_name', 'state', 'disposition']\n",
1933 |     "    train_data = pd.get_dummies(train_data, columns=feature_to_be_splitted)\n",
1934 |     "    test_data = pd.get_dummies(test_data, columns=feature_to_be_splitted)\n",
1935 |     "    # Columns dropped from the training frame only.\n",
1936 |     "    list_to_remove_train = ['balance_due', 'collection_status', 'compliance_detail',\n",
1937 |     "                            'payment_amount', 'payment_date', 'payment_status']\n",
1938 |     "    # Columns dropped from both frames.\n",
1939 |     "    list_to_remove_all = ['fine_amount', 'violator_name', 'zip_code', 'country', 'city',\n",
1940 |     "                          'inspector_name', 'violation_street_number', 'violation_street_name',\n",
1941 |     "                          'violation_zip_code', 'violation_description',\n",
1942 |     "                          'mailing_address_str_number', 'mailing_address_str_name',\n",
1943 |     "                          'non_us_str_code',\n",
1944 |     "                          'ticket_issued_date', 'hearing_date', 'grafitti_status', 'violation_code']\n",
1945 |     "    train_data = train_data.drop(list_to_remove_train + list_to_remove_all, axis=1)\n",
1946 |     "    test_data = test_data.drop(list_to_remove_all, axis=1)\n",
1947 |     "\n",
1948 |     "    # Features present in both frames, kept in training-column order so the input\n",
1949 |     "    # layout does not vary between runs (a set's iteration order is not fixed).\n",
1950 |     "    train_features = [name for name in train_data.columns.drop('compliance') if name in test_data]\n",
1951 |     "\n",
1952 |     "    X_train = train_data[train_features]\n",
1953 |     "    y_train = train_data.compliance\n",
1954 |     "    X_test = test_data[train_features]\n",
1955 |     "\n",
1956 |     "    scaler = MinMaxScaler()\n",
1957 |     "    X_train_scaled = scaler.fit_transform(X_train)\n",
1958 |     "    X_test_scaled = scaler.transform(X_test)\n",
1959 |     "\n",
1960 |     "    clf = MLPClassifier(hidden_layer_sizes = [100, 10], alpha = 5,\n",
1961 |     "                       random_state = 0, solver='lbfgs', verbose=0)\n",
1962 |     "    clf.fit(X_train_scaled, y_train)\n",
1963 |     "\n",
1964 |     "    test_proba = clf.predict_proba(X_test_scaled)[:,1]\n",
1965 |     "    # test_data is indexed by ticket_id, so it labels the probabilities directly.\n",
1966 |     "    return pd.Series(test_proba, index=test_data.index, name='compliance')"
1967 |    ]
1968 |   },
1969 |   {
1970 |    "cell_type": "code",
1971 |    "execution_count": null,
1972 |    "metadata": {
1973 |     "collapsed": false
1974 |    },
1975 |    "outputs": [
1976 |     {
1977 |      "name": "stderr",
1978 |      "output_type": "stream",
1979 |      "text": [
1980 |       "/opt/conda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning: Columns (11,12,31) have mixed types. Specify dtype option on import or set low_memory=False.\n",
1981 |       "  if self.run_code(code, result):\n"
1982 |      ]
1983 |     }
1984 |    ],
1985 |    "source": [
1986 |     "# predictions = blight_model()"
1987 |    ]
1988 |   }
1989 |  ],
1990 |  "metadata": {
1991 |   "coursera": {
1992 |    "course_slug": "python-machine-learning",
1993 |    "graded_item_id": "nNS8l",
1994 |    "launcher_item_id": "yWWk7",
1995 |    "part_id": "w8BSS"
1996 |   },
1997 |   "kernelspec": {
1998 |    "display_name": "Python 3",
1999 |    "language": "python",
2000 |    "name": "python3"
2001 |   },
2002 |   "language_info": {
2003 |    "codemirror_mode": {
2004 |     "name": "ipython",
2005 |     "version": 3
2006 |    },
2007 |    "file_extension": ".py",
2008 |    "mimetype": "text/x-python",
2009 |    "name": "python",
2010 |    "nbconvert_exporter": "python",
2011 |    "pygments_lexer": "ipython3",
2012 |    "version": "3.5.2"
2013 |   }
2014 |  },
2015 |  "nbformat": 4,
2016 |  "nbformat_minor": 2
2017 | }
2018 | 


--------------------------------------------------------------------------------