├── .gitignore ├── 01 - Jupyter Notebooks, Prerequisites.ipynb ├── 02 - Matplotlib Refresher.ipynb ├── 03 - Exploration with Pandas, Matplotlib and Seaborn.ipynb ├── 04 - Machine Learning with Scikit-learn.ipynb ├── 05 - More data - the adult dataset.ipynb ├── 06 - Closing comments.ipynb ├── LICENSE ├── README.md ├── bonus - Cross-validation and Grid Search.ipynb ├── bonus - Trees.ipynb ├── check_env.ipynb ├── data ├── adult.csv └── housing.csv ├── images ├── check_env-1.png ├── check_env-2.png ├── cross_validation.svg ├── data_representation.svg ├── download-repo.png ├── grid_search_cross_validation.svg ├── supervised_workflow.svg ├── tab-help.png └── train_test_split_matrix.svg ├── solutions └── solutions.py └── tree_plotting.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /01 - Jupyter Notebooks, Prerequisites.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Jupyter notebooks\n", 8 | "Important shortcuts:\n", 9 | "\n", 10 | "- run and move to next cell: shift + return\n", 11 | "- run and stay on same cell: ctrl + return\n", 12 | "- insert cell below: ctrl + m, then b (or a for \"above\")\n", 13 | "\n", 14 | "Two modes:\n", 15 | "- command mode\n", 16 | "- edit mode" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Getting Help\n", 24 | "- Shortcuts: press h in command mode\n", 25 | "- press tab inside method calls (press tab again to see more):\n", 26 | "![tab help](images/tab-help.png)\n", 27 | "- use \"?\" and run cell" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37
| "source": [ 38 | "import numpy as np" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "np.bincount()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "np.bincount?" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Exercise:\n", 68 | "Use the help to find out what the options to ``np.unique`` are.\n", 69 | "Use np.unique to convert the array ``['one', 'two', 'three', 'one', 'two', 'three']`` into the array ``[0, 2, 1, 0, 2, 1]``." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "ar = ['one', 'two', 'three', 'one', 'two', 'three']\n", 81 | "# your solution here" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Plotting with matplotlib\n", 89 | "Need to use either \n", 90 | "```\n", 91 | "% matplotlib inline\n", 92 | "```\n", 93 | "or\n", 94 | "```\n", 95 | "% matplotlib notebook\n", 96 | "```\n", 97 | "Only one in each notebook!\n", 98 | "using ``inline`` will just sent ``png`` images to browser, using ``notebook`` will provide\n", 99 | "interactivity and allow updating old figures.\n", 100 | "With ``notebook`` you need to make sure to create a new figure before plotting, otherwise the last one will be updated!" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "%matplotlib notebook\n", 112 | "import matplotlib.pyplot as plt\n", 113 | "\n", 114 | "X = np.random.normal(size=(12, 2))\n", 115 | "plt.scatter(X[:, 0], X[:, 1])" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "plt.plot(X[:, 0])" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "# create a new figure\n", 138 | "plt.figure()\n", 139 | "plt.plot(X[:, 0])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Exercise\n", 147 | "Create a new figure and plot a sin wave. You can use ``np.linspace`` to create equally spaced numbers in a given range." 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Best practices for data analysis in Jupyter\n", 155 | "- use standard imports\n", 156 | "- don't ``import *``\n", 157 | "- be mindful of the state in the notebook!" 
158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "x = 1" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "x = x + 1" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "print(x)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "Avoid cells you can't run again:" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "data = {'a': [1, 2, 3], 'b': [999, 1, 2]}" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "column_a = data.pop('a')" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "print(column_a)\n", 231 | "print(data)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## Not mutating variables helps" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "x = 1" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "x2 = x + 1" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "print(x2)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "## Exercise\n", 279 | "Rewrite the code for the ``data`` dict above so that you don't mutate ``data``, while keeping the ``print`` output the same."
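One way to solve the exercise above without mutating ``data`` — a sketch; the point is that every cell stays safe to re-run:

```python
data = {'a': [1, 2, 3], 'b': [999, 1, 2]}

# read instead of pop: 'data' is left untouched, so this cell
# can be executed any number of times with the same result
column_a = data['a']
rest = {key: value for key, value in data.items() if key != 'a'}

print(column_a)
print(rest)
```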
280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "# solution here" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python [default]", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.6.1" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 2 315 | } 316 | -------------------------------------------------------------------------------- /02 - Matplotlib Refresher.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Matplotlib API refresher" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "% matplotlib notebook\n", 19 | "import matplotlib.pyplot as plt" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Matplotlib \"stateful\" API\n", 27 | "Modifies the \"current figure\"" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "plt.plot(range(10))\n", 39 | "plt.plot(range(10, 0, -1))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import numpy as np\n", 51 | "plt.plot(np.sin(np.linspace(-3, 3, 20)))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Works also with subplot" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "plt.figure()\n", 70 | "# create a subplot by specifying grid height, grid width and index:\n", 71 | "# 2x2 grid, first plot (one-indexed)\n", 72 | "plt.subplot(2, 2, 1)\n", 73 | "# plt.title changes \"current axes\"\n", 74 | "plt.title(\"first plot\")\n", 75 | "plt.plot(np.random.uniform(size=10))\n", 76 | "\n", 77 | "plt.subplot(2, 2, 2)\n", 78 | "# now subplot 2 is current\n", 79 | "plt.title(\"second plot\")\n", 80 | "plt.plot(np.random.uniform(size=10), 'o')\n", 81 | "\n", 82 | "plt.subplot(2, 2, 3)\n", 83 | "plt.title(\"third plot\")\n", 84 | "plt.barh(range(10), np.random.uniform(size=10))\n", 85 | "\n", 86 | "plt.subplot(2, 2, 4)\n", 87 | "plt.title(\"fourth plot\")\n", 88 | "plt.imshow(np.random.uniform(size=(10, 10)))\n", 89 | "plt.tight_layout()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## Object oriented / Axis oriented API is more powerful\n", 97 | "Have an object per axes, plot directly to axes.\n", 98 | "\n", 99 | "Methods that modify the axes are prefixed with ``set_``!"
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "plt.figure()\n", 111 | "ax11 = plt.subplot(2, 2, 1)\n", 112 | "ax21 = plt.subplot(2, 2, 2)\n", 113 | "ax12 = plt.subplot(2, 2, 3)\n", 114 | "ax22 = plt.subplot(2, 2, 4)\n", 115 | "\n", 116 | "ax11.set_title(\"ax11\")\n", 117 | "ax21.set_title(\"ax21\")\n", 118 | "ax12.set_title(\"ax12\")\n", 119 | "ax22.set_title(\"ax22\")\n", 120 | "\n", 121 | "ax21.plot(np.random.randn(10))\n", 122 | "plt.tight_layout()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "## My favorite interface: plt.subplots!" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "fig, axes = plt.subplots(2, 2)\n", 145 | "ax11, ax21, ax12, ax22 = axes.ravel()\n", 146 | "ax11.set_title(\"ax11\")\n", 147 | "ax21.set_title(\"ax21\")\n", 148 | "ax12.set_title(\"ax12\")\n", 149 | "ax22.set_title(\"ax22\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Exercise\n", 157 | "Create a grid plot with one row and four columns where the first entry plots the function ``f(x) = x``, the second ``f(x)=x ** 2``, the third ``f(x)=x ** 3`` and the fourth ``f(x)=x**4``." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "# Your solution\n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## More fun with subplots!" 
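A sketch of the grid exercise above (``f(x) = x`` through ``f(x) = x ** 4``), using the ``plt.subplots`` interface just introduced:

```python
import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(-2, 2, 100)
fig, axes = plt.subplots(1, 4, figsize=(10, 3))
for power, ax in zip([1, 2, 3, 4], axes):
    ax.plot(x, x ** power)
    ax.set_title("f(x) = x ** {}".format(power))
plt.tight_layout()
```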
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "import numpy as np\n", 187 | "sin = np.sin(np.linspace(-4, 4, 100))\n", 188 | "fig, axes = plt.subplots(2, 2)\n", 189 | "plt.plot(sin)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "fig, axes = plt.subplots(2, 2)\n", 201 | "axes[0, 0].plot(sin)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "asdf = plt.gca()\n", 213 | "asdf.plot(sin, c='k')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## More on plotting commands and styling" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "fig, ax = plt.subplots(2, 4, figsize=(10, 5))\n", 232 | "ax[0, 0].plot(sin)\n", 233 | "ax[0, 1].plot(range(100), sin) # same as above\n", 234 | "ax[0, 2].plot(np.linspace(-4, 4, 100), sin)\n", 235 | "ax[0, 3].plot(sin[::10], 'o')\n", 236 | "ax[1, 0].plot(sin, c='r')\n", 237 | "ax[1, 1].plot(sin, '--')\n", 238 | "ax[1, 2].plot(sin, lw=3)\n", 239 | "ax[1, 3].plot(sin[::10], '--o')\n", 240 | "plt.tight_layout() # makes stuff fit - usually works" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "# Exercise\n", 248 | "See how many lines you can put in a plot and still distinguish them (using the styles described above).\n", 249 | "How many can you distinguish if you don't use color?\n", 250 | "See the [lines bars and markers](https://matplotlib.org/gallery.html#lines_bars_and_markers) section of the matplotlib examples for more styles" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "collapsed": true 258 | }, 259 | "outputs": [], 260 | "source": [ 261 | "# solution" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "### Scatter vs plot\n", 269 | "``scatter`` allows modifying individual points; ``plot`` only allows modifying all of them the same way:" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "x = np.random.uniform(size=50)\n", 281 | "y = x + np.random.normal(0, .1, size=50)\n", 282 | "\n", 283 | "fig, ax = plt.subplots(2, 2, figsize=(5, 5),\n", 284 | " subplot_kw={'xticks': (), 'yticks': ()})\n", 285 | "ax[0, 0].scatter(x, y)\n", 286 | "ax[0, 0].set_title(\"scatter\")\n", 287 | "ax[0, 1].plot(x, y, 'o')\n", 288 | "ax[0, 1].set_title(\"plot\")\n", 289 | "ax[1, 0].scatter(x, y, c=x-y, cmap='bwr', edgecolor='k')\n", 290 | "ax[1, 1].scatter(x, y, c=x-y, s=np.abs(np.random.normal(scale=20, size=50)), cmap='bwr', edgecolor='k')\n", 291 | "plt.tight_layout()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "# Imshow, interpolation, colormaps\n", 299 | "- three important kinds of color maps: sequential, diverging, qualitative\n", 300 | "- default colormap: viridis\n", 301 | "- default qualitative colormap: tab10" 302 | ] 303 | }, 304 | { 305 |
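For the line-styles exercise above, a minimal sketch that cycles through line styles and markers; how many series remain readable — with and without the default color cycle — is for you to judge:

```python
import numpy as np
import matplotlib.pyplot as plt

linestyles = ['-', '--', ':', '-.']
markers = ['o', 's', '^', 'v', 'D', '']

plt.figure()
x = np.linspace(0, 4, 30)
# 12 offset sine curves, each with its own style/marker combination
for i, (ls, marker) in enumerate(zip(linestyles * 3, markers * 2)):
    plt.plot(x, np.sin(x) + i * .5, linestyle=ls, marker=marker, label=str(i))
plt.legend(ncol=2, fontsize='small')
```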
"cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "from matplotlib.cbook import get_sample_data\n", 313 | "f = get_sample_data(\"axes_grid/bivariate_normal.npy\", asfileobj=False)\n", 314 | "arr = np.load(f)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "fig, ax = plt.subplots(2, 2)\n", 326 | "im1 = ax[0, 0].imshow(arr)\n", 327 | "ax[0, 1].imshow(arr, interpolation='bilinear')\n", 328 | "im3 = ax[1, 0].imshow(arr, cmap='gray')\n", 329 | "im4 = ax[1, 1].imshow(arr, cmap='bwr', vmin=-1.5, vmax=1.5)\n", 330 | "plt.colorbar(im1, ax=ax[0, 0])\n", 331 | "plt.colorbar(im3, ax=ax[1, 0])\n", 332 | "plt.colorbar(im4, ax=ax[1, 1])" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## The problem of overplotting" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "x1, y1 = 1 / np.random.uniform(-1000, 100, size=(2, 10000))\n", 351 | "x2, y2 = np.dot(np.random.uniform(size=(2, 2)), np.random.normal(size=(2, 1000)))\n", 352 | "x = np.hstack([x1, x2])\n", 353 | "y = np.hstack([y1, y2])" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "plt.figure()\n", 365 | "plt.xlim(-1, 1)\n", 366 | "plt.ylim(-1, 1)\n", 367 | "plt.scatter(x, y)" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "collapsed": true 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "fig, ax = plt.subplots(1, 3, figsize=(10, 4),\n", 379 | " subplot_kw={'xlim': (-1, 1),\n", 380 | " 'ylim': (-1, 1)})\n", 381 | "ax[0].scatter(x, y)\n", 382 | "ax[1].scatter(x, y, alpha=.1)\n", 383 | "ax[2].scatter(x, y, alpha=.01)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": true 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "plt.figure()\n", 395 | "plt.hexbin(x, y, bins='log', extent=(-1, 1, -1, 1), gridsize=50, linewidths=0)\n", 396 | "plt.colorbar()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "# Twinx" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "collapsed": true 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "df = pd.DataFrame({'Math PhDs awareded (US)': {'2000': 1050,\n", 415 | " '2001': 1010,\n", 416 | " '2002': 919,\n", 417 | " '2003': 993,\n", 418 | " '2004': 1076,\n", 419 | " '2005': 1205,\n", 420 | " '2006': 1325,\n", 421 | " '2007': 1393,\n", 422 | " '2008': 1399,\n", 423 | " '2009': 1554},\n", 424 | " 'Total revenue by arcades (US)': {'2000': 1196000000,\n", 425 | " '2001': 1176000000,\n", 426 | " '2002': 1269000000,\n", 427 | " '2003': 1240000000,\n", 428 | " '2004': 1307000000,\n", 429 | " '2005': 1435000000,\n", 430 | " '2006': 1601000000,\n", 431 | " '2007': 1654000000,\n", 432 | " '2008': 1803000000,\n", 433 | " '2009': 1734000000}})" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "# could also do 
df.plot()\n", 445 | "phds = df['Math PhDs awareded (US)']\n", 446 | "revenue = df['Total revenue by arcades (US)']\n", 447 | "years = df.index\n", 448 | "\n", 449 | "plt.figure()\n", 450 | "ax1 = plt.gca()\n", 451 | "line1, = ax1.plot(years, phds)\n", 452 | "line2, = ax1.plot(years, revenue, c='r')\n", 453 | "plt.legend((line1, line2), (\"math PhDs awarded\", \"revenue by arcades\"))" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "plt.figure()\n", 465 | "ax1 = plt.gca()\n", 466 | "line1, = ax1.plot(years, phds)\n", 467 | "ax2 = ax1.twinx()\n", 468 | "line2, = ax2.plot(years, revenue, c='r')\n", 469 | "plt.legend((line1, line2), (\"math PhDs awarded\", \"revenue by arcades\"))\n", 470 | "ax1.set_ylabel(\"Math PhDs awarded\")\n", 471 | "ax2.set_ylabel(\"revenue by arcades\")" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "# Exercise\n", 479 | "Get another dataset from http://www.tylervigen.com/spurious-correlations and plot it using ``twinx``." 480 | ] 481 | } 482 | ], 483 | "metadata": { 484 | "anaconda-cloud": {}, 485 | "kernelspec": { 486 | "display_name": "Python [conda root]", 487 | "language": "python", 488 | "name": "conda-root-py" 489 | }, 490 | "language_info": { 491 | "codemirror_mode": { 492 | "name": "ipython", 493 | "version": 3 494 | }, 495 | "file_extension": ".py", 496 | "mimetype": "text/x-python", 497 | "name": "python", 498 | "nbconvert_exporter": "python", 499 | "pygments_lexer": "ipython3", 500 | "version": "3.6.1" 501 | } 502 | }, 503 | "nbformat": 4, 504 | "nbformat_minor": 2 505 | } 506 | -------------------------------------------------------------------------------- /03 - Exploration with Pandas, Matplotlib and Seaborn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib notebook\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "#import seaborn as sns\n", 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "plt.rcParams['figure.dpi'] = 100" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Data Loading" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Loading the california housing dataset CSV\n", 31 | "\n", 32 | "```\n", 33 | "We collected information on the variables using all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically co mpact area. Naturally, the geographical area included varies inversely with the population density. W e computed distances among the centroids of each block group as measured in latitude and longitude. W e excluded all the block groups reporting zero entries for the independent and dependent variables. T he final data contained 20,640 observations on 9 variables. 
The dependent variable is ln(median house value).\n", 34 | "```" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "housing = pd.read_csv(\"data/housing.csv\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "housing.head()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "housing.shape" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "housing.ocean_proximity.value_counts()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "continuous_features = housing.columns[:-2]" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "housing[continuous_features].hist();" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "housing[continuous_features].hist(bins=\"auto\")\n", 102 | "plt.tight_layout()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "housing.population.sort_values().tail()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "# Exercise\n", 119 | "Come up with a way to visualize the population data that can help us inspect the outliers." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# your solution here" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# my solution\n", 142 | "# . . . " 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "# Exercise\n", 150 | "We can do better!" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# your solution here" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "# my solution\n", 173 | "# . . . " 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "# Exercise\n", 181 | "Create a 2 x 3 subplot grid showing housing_median_age, total_rooms, total_bedrooms, population, households and median_income\n", 182 | "each as a hexbin."
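A sketch for the hexbin exercise above, under one reading of it: each variable aggregated over longitude/latitude, with the variable's mean as the hexagon color (``C``). Dropping rows with missing values first is an assumption to keep the ``total_bedrooms`` panel clean:

```python
import matplotlib.pyplot as plt

h = housing.dropna()  # total_bedrooms has missing values

columns = ['housing_median_age', 'total_rooms', 'total_bedrooms',
           'population', 'households', 'median_income']

fig, axes = plt.subplots(2, 3, figsize=(10, 6),
                         subplot_kw={'xticks': (), 'yticks': ()})
for col, ax in zip(columns, axes.ravel()):
    # one hexbin per variable, laid out geographically
    ax.hexbin(h.longitude, h.latitude, C=h[col], gridsize=30, linewidths=0)
    ax.set_title(col)
plt.tight_layout()
```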
183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "# Dealing with missing values a little bit" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "housing.isnull().sum()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "housing_nonull = housing.dropna().copy()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "# scatter matrix / pair plot" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "scrolled": false 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2]);" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "scrolled": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2], c=housing.ocean_proximity);\n", 239 | "# error" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "unique_proximity, ocean_proximity_int = np.unique(housing_nonull.ocean_proximity, return_inverse=True)\n", 249 | "ocean_proximity_int" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "unique_proximity" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "scrolled": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2], c=ocean_proximity_int);" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "plt.matshow(np.arange(4).reshape(1, 4))\n", 279 | "plt.xticks(range(4), unique_proximity)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "scrolled": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "import seaborn.apionly as sns\n", 291 | "sns.pairplot(housing_nonull.iloc[:, 2:], hue='ocean_proximity')" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "# Exercise\n", 299 | "- Confirm the coloring of the scatter matrix makes sense by plotting latitude vs longitude colored by ocean distance.\n", 300 | "- Are the two outliers the same for all the plots?" 
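A sketch for the exercise above, reusing the integer encoding ``ocean_proximity_int`` computed with ``np.unique``:

```python
import matplotlib.pyplot as plt

plt.figure()
# geographic scatter, colored by the encoded ocean_proximity category
plt.scatter(housing_nonull.longitude, housing_nonull.latitude,
            c=ocean_proximity_int, s=3)
plt.xlabel("longitude")
plt.ylabel("latitude")
```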
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# solution here" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "## Exploring the target (dependent variable)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "housing_nonull.plot('median_income', 'median_house_value', kind='scatter', alpha=.1)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "## Exercise\n", 335 | "Do a scatter plot of all the continuous features (independent variables) against the median house value." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "# solution" 347 | ] 348 | } 349 | ], 350 | "metadata": { 351 | "kernelspec": { 352 | "display_name": "Python [conda root]", 353 | "language": "python", 354 | "name": "conda-root-py" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.6.1" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 2 371 | } 372 | -------------------------------------------------------------------------------- /04 - Machine Learning with Scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Basics of Supervised Learning\n", 8 | "- Learn from (input, output) pairs\n", 9 | "- Generalize to new input, predict unknown output\n", 10 | "\n", 11 | "![supervised_workflow](images/supervised_workflow.svg)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import matplotlib.pyplot as plt\n", 23 | "import numpy as np\n", 24 | "%matplotlib notebook" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "from sklearn.datasets import load_digits\n", 36 | "import numpy as np\n", 37 | "digits = load_digits()\n", 38 | "digits.keys()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "digits.data.shape" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "digits.target.shape" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "digits.target" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "np.bincount(digits.target)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 |
"source": [ 93 | "plt.matshow(digits.data[0].reshape(8, 8), cmap=plt.cm.Greys)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "digits.target[0]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "fig, axes = plt.subplots(4, 4)\n", 116 | "for x, y, ax in zip(digits.data, digits.target, axes.ravel()):\n", 117 | " ax.set_title(y)\n", 118 | " ax.imshow(x.reshape(8, 8), cmap=\"gray_r\")\n", 119 | " ax.set_xticks(())\n", 120 | " ax.set_yticks(())\n", 121 | "plt.tight_layout()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "![train_test_split](images/train_test_split_matrix.svg)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "from sklearn.model_selection import train_test_split\n", 140 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 141 | " digits.target)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Really Simple API\n", 149 | "-------------------\n", 150 | "0) Import your model class" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "from sklearn.svm import LinearSVC" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "1) Instantiate an object and set the parameters" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "svm = LinearSVC()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "2) Fit the model" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "svm.fit(X_train, y_train)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "3) Apply / evaluate" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "print(svm.predict(X_train))\n", 216 | "print(y_train)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": true 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "svm.score(X_train, y_train)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": true 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "svm.score(X_test, y_test)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "And again\n", 246 | "---------" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": true 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "from sklearn.ensemble import RandomForestClassifier" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 
null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "rf = RandomForestClassifier(n_estimators=50)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "rf.fit(X_train, y_train)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "rf.score(X_test, y_test)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Exercise\n", 298 | "Load the iris dataset from seaborn using\n", 299 | "\n", 300 | "```python\n", 301 | "iris = sns.load_dataset(\"iris\")\n", 302 | "```\n", 303 | "Visualize the dataset. Extract the features (independent variables) and the target (dependent variable).\n", 304 | "Split it into training and test set using ``train_test_split``.\n", 305 | "\n", 306 | "\n", 307 | "Then train and evaluate a classifier of your choice. Try ``sklearn.neighbors.KNeighborsClassifier`` or ``sklearn.ensemble.RandomForestClassifier`` for example.\n" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "# your solution" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "# Dummy encoding of categorical variables" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "import pandas as pd\n", 337 | "df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],\n", 338 | " 'boro': ['Manhattan', 'Queens', 'Manhattan', 'Brooklyn', 'Brooklyn', 'Bronx']})\n", 339 | "df" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "pd.get_dummies(df)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],\n", 362 | " 'boro': [0, 1, 0, 2, 2, 3]})\n", 363 | "df" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "pd.get_dummies(df)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Exercise\n", 382 | "Load the California housing data from data/housing.csv and apply dummy encoding."
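A sketch for the dummy-encoding exercise above; ``pd.get_dummies`` expands the one categorical column, ``ocean_proximity``, into indicator columns and passes the numeric columns through unchanged:

```python
import pandas as pd

housing = pd.read_csv("data/housing.csv")
housing_dummies = pd.get_dummies(housing)
print(housing_dummies.columns)  # ocean_proximity becomes one column per category
```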
383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "# solution" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "# Scaling data" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "import seaborn.apionly as sns\n", 412 | "iris = sns.load_dataset(\"iris\")\n", 413 | "iris.head()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "X = iris.iloc[:, :-1] # could do iris.pop(\"species\") but that is changing \"iris\"\n", 425 | "y = iris.species\n", 426 | "X.shape" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "collapsed": true 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "from sklearn.preprocessing import StandardScaler\n", 449 | "scaler = StandardScaler()\n", 450 | "scaler.fit(X_train)\n", 451 | "X_train_scaled = scaler.transform(X_train)" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "X_train_scaled[:10]" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "X_test_scaled = scaler.transform(X_test)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "collapsed": true 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "svm = LinearSVC()\n", 485 | "svm.fit(X_train_scaled, y_train)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": { 492 | "collapsed": true 493 | }, 494 | "outputs": [], 495 | "source": [ 496 | "svm.predict(X_test_scaled)" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": { 503 | "collapsed": true 504 | }, 505 | "outputs": [], 506 | "source": [ 507 | "svm.score(X_test_scaled, y_test)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "# Exercise\n", 515 | "- load the California housing data and drop columns with missing values\n", 516 | "- separate features and target in the California housing dataset (with dummy encoding)\n", 517 | "- use train_test_split to split it into training and test data\n", 518 | "- use the StandardScaler to scale training and test data\n", 519 | "- fit the sklearn.linear_model.Ridge model (ridge regression, a linear regression model) and evaluate it on the test data.\n", 520 | "\n", 521 | "Note: the score method computes the R^2 for regression problems" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": { 528 | "collapsed": true 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "# solution here" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count":
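One possible solution for the Ridge exercise above — a sketch with default ``Ridge`` parameters and ``random_state=0`` as assumptions; it also defines the ``ridge`` object that the "Inspecting the ridge model" cells below rely on. Dropping ``total_bedrooms`` assumes it is the only column with missing values:

```python
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = pd.read_csv("data/housing.csv")
housing = housing.dropna(axis=1)  # drops the column(s) with missing values

X = pd.get_dummies(housing.drop("median_house_value", axis=1))
y = housing.median_house_value

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
print(ridge.score(X_test_scaled, y_test))  # R^2 on the test set
```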
null, 538 | "metadata": { 539 | "collapsed": true 540 | }, 541 | "outputs": [], 542 | "source": [ 543 | "# Inspecting the ridge model" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "collapsed": true 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "X_train.columns" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": { 561 | "collapsed": true 562 | }, 563 | "outputs": [], 564 | "source": [ 565 | "ridge.coef_" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": { 572 | "collapsed": true 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "plt.figure()\n", 577 | "plt.barh(range(len(ridge.coef_)), ridge.coef_)\n", 578 | "plt.yticks(range(len(ridge.coef_)), X_train.columns);" 579 | ] 580 | } 581 | ], 582 | "metadata": { 583 | "anaconda-cloud": {}, 584 | "kernelspec": { 585 | "display_name": "Python [default]", 586 | "language": "python", 587 | "name": "python3" 588 | }, 589 | "language_info": { 590 | "codemirror_mode": { 591 | "name": "ipython", 592 | "version": 3 593 | }, 594 | "file_extension": ".py", 595 | "mimetype": "text/x-python", 596 | "name": "python", 597 | "nbconvert_exporter": "python", 598 | "pygments_lexer": "ipython3", 599 | "version": "3.6.1" 600 | } 601 | }, 602 | "nbformat": 4, 603 | "nbformat_minor": 1 604 | } 605 | -------------------------------------------------------------------------------- /05 - More data - the adult dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib notebook\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "#import seaborn as sns\n", 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "plt.rcParams['figure.dpi'] = 100" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Using the adult dataset" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "adult = pd.read_csv(\"data/adult.csv\", index_col=0)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "adult.head()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "adult.income.value_counts()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "adult.income.value_counts().plot(kind=\"barh\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "adult.education.value_counts()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "adult.groupby(\"income\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "adult.groupby(\"income\")['education'].value_counts()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | 
"source": [ 99 | "education_counts = adult.groupby(\"income\")['education'].value_counts()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "education_counts.unstack(\"income\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "unstacked_education = education_counts.unstack(\"income\")\n", 118 | "unstacked_education.plot(kind=\"barh\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "(unstacked_education / unstacked_education.sum(axis=0)).plot(kind=\"barh\")" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "unstacked_education.columns" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "plt.figure()\n", 146 | "(unstacked_education[\" >50K\"] / unstacked_education.sum(axis=1)).plot(kind=\"barh\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# Exercise\n", 154 | "Group the data by gender, and compare the income distributions over genders.\n", 155 | "Do a similar plot for some of the other variables." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "# solution" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "source": [ 175 | "# Exercise\n", 176 | "Apply the basic machine learning workflow to the dataset.\n", 177 | "For simplicity you might want to drop the \"native-country\" column.\n", 178 | "Proceed as follows:\n", 179 | "- separate features and target\n", 180 | "- do dummy encoding of the categorical features\n", 181 | "- split data in training and test set\n", 182 | "- scale the data\n", 183 | "- apply a machine learning model. 
Start with ``sklearn.linear_model.LogisticRegression``, a linear classifier.\n", 184 | "- visualize the coefficients in a bar plot (if there are too many, only show the ones of largest magnitude)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python [conda root]", 200 | "language": "python", 201 | "name": "conda-root-py" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.6.1" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /06 - Closing comments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Version control and nbdime\n", 8 | "- Do version control\n", 9 | "- JSON is not fun to version control\n", 10 | "- nbdime eases the pain" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# Run all etc\n", 18 | "- make sure your notebooks are reproducible\n", 19 | "- avoid cells that modify state\n", 20 | "- use \"restart kernel and run all\" to check if your code actually works" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# Handling multiple kernels\n", 28 | "- Check out the docs at https://ipython.readthedocs.io/en/latest/install/kernel_install.html" 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python [conda root]", 35 | "language": "python", 36 | "name": "conda-root-py" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.6.1" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 2 53 | } 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Andreas Mueller 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Data analysis and machine learning in Jupyter 2 | ============================================= 3 | 4 | 5 | Instructor 6 | ----------- 7 | 8 | - [Andreas Mueller](http://amuller.github.io) [@amuellerml](https://twitter.com/amuellerml) - Columbia University; [Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do) 9 | 10 | 11 | Obtaining the Tutorial Material 12 | ------------------ 13 | 14 | 15 | If you have a GitHub account, it is probably most convenient if you clone or 16 | fork the GitHub repository. You can clone the repository by running: 17 | 18 | ```bash 19 | git clone https://github.com/amueller/jupytercon2017.git 20 | 21 | ``` 22 | 23 | If you are not familiar with git or don’t have a 24 | GitHub account, you can download the repository as a .zip file by heading over 25 | to the GitHub repository (https://github.com/amueller/jupytercon2017) in 26 | your browser and clicking the green “Download” button in the upper right. 27 | 28 | ![](images/download-repo.png) 29 | 30 | Please note that we may add and improve the material until shortly before the 31 | tutorial session, and we recommend updating your copy of the materials one 32 | day before the tutorial. If you have a GitHub account and cloned the 33 | repository via GitHub, you can sync your existing local repository with: 34 | 35 | ```bash 36 | git pull origin master 37 | ``` 38 | 39 | If you don’t have a GitHub account, you may have to re-download the .zip 40 | archive from GitHub. 41 | 42 | 43 | Installation Notes 44 | ------------------ 45 | 46 | This tutorial will require recent installations of 47 | 48 | - [NumPy](http://www.numpy.org) 49 | - [SciPy](http://www.scipy.org) 50 | - [matplotlib](http://matplotlib.org) 51 | - [pandas](http://pandas.pydata.org) 52 | - [pillow](https://python-pillow.org) 53 | - [scikit-learn](http://scikit-learn.org/stable/) 54 | - [seaborn](https://seaborn.pydata.org/) 55 | - [IPython](http://ipython.readthedocs.org/en/stable/) 56 | - [Jupyter Notebook](http://jupyter.org) 57 | 58 | 59 | The last one is important: you should be able to type 60 | 61 | jupyter notebook 62 | 63 | in your terminal window and see the notebook panel load in your web browser. 64 | Try opening and running a notebook from the material to check that it works. 65 | 66 | For users who do not yet have these packages installed, a relatively 67 | painless way to install all the requirements is to use a Python distribution 68 | such as [Anaconda CE](http://store.continuum.io/ "Anaconda CE"), which includes 69 | the most relevant Python packages for science, math, engineering, and 70 | data analysis; Anaconda can be downloaded and installed for free 71 | including commercial use and redistribution. 72 | The code examples in this tutorial should be compatible with Python 2.7 and 73 | Python 3.4-3.6. 74 | 75 | After obtaining the material, we **strongly recommend** opening and executing 76 | the Jupyter notebook `check_env.ipynb`, which is located at the 77 | top level of this repository.
Inside the repository, you can open the notebook 78 | by executing 79 | 80 | ```bash 81 | jupyter notebook check_env.ipynb 82 | ``` 83 | 84 | Inside the notebook, you can run the code cell by 85 | clicking on the "Run Cells" button as illustrated in the figure below: 86 | 87 | ![](images/check_env-1.png) 88 | 89 | 90 | Finally, if your environment satisfies the requirements for the tutorials, the 91 | executed code cell will produce an output message as shown below: 92 | 93 | ![](images/check_env-2.png) 94 | 95 | Although not required, we also recommend updating the installed Python 96 | packages to their latest versions to ensure best compatibility with the 97 | teaching material. Please upgrade already installed packages by executing 98 | 99 | - `pip install [package-name] --upgrade` 100 | - or `conda update [package-name]` 101 | 102 | 103 | 104 | 105 | Outline 106 | ======= 107 | t.b.a. 108 | 109 | -------------------------------------------------------------------------------- /bonus - Cross-validation and Grid Search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Cross-validation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "![cross-validation](images/cross_validation.svg)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "%matplotlib notebook\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "plt.rcParams[\"figure.dpi\"] = 200\n", 28 | "import numpy as np" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.datasets import load_digits\n", 40 | "from sklearn.model_selection import train_test_split\n", 41 | "digits = load_digits()\n", 42 | "X_train, X_test, y_train, y_test = train_test_split(\n", 43 | " digits.data, digits.target)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from sklearn.model_selection import cross_val_score\n", 55 | "from sklearn.neighbors import KNeighborsClassifier" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "cross_val_score(KNeighborsClassifier(),\n", 65 | " X_train, y_train, cv=5)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.model_selection import KFold, StratifiedKFold" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "cross_val_score(KNeighborsClassifier(),\n", 86 | " X_train, y_train, cv=KFold(n_splits=10, shuffle=True, random_state=42))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Grid Searches\n", 94 | "=================" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "![grid-search](images/grid_search_cross_validation.svg)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "Grid-Search with
built-in cross-validation" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "from sklearn.model_selection import GridSearchCV\n", 118 | "from sklearn.svm import SVC" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Define parameter grid:" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import numpy as np\n", 135 | "\n", 136 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n", 137 | "              'gamma': 10. ** np.arange(-5, 0)}\n", 138 | "\n", 139 | "np.set_printoptions(suppress=True)\n", 140 | "print(param_grid)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3, cv=5)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "A GridSearchCV object behaves just like a normal classifier." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "scrolled": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "grid_search.fit(X_train, y_train)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "scrolled": true 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "grid_search.predict(X_test)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "grid_search.score(X_test, y_test)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "grid_search.best_params_" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "grid_search.best_score_" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "grid_search.best_estimator_" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# We extract just the scores\n", 224 | "\n", 225 | "scores = grid_search.cv_results_['mean_test_score']\n", 226 | "scores = np.array(scores).reshape(6, 5)\n", 227 | "\n", 228 | "plt.matshow(scores)\n", 229 | "plt.xlabel('gamma')\n", 230 | "plt.ylabel('C')\n", 231 | "plt.colorbar()\n", 232 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 233 | "plt.yticks(np.arange(6), param_grid['C']);" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "# Exercises\n", 241 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier."
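,
    "\n",
    "\n",
    "A possible solution sketch (the `n_neighbors` range is our choice, not prescribed by the tutorial):\n",
    "\n",
    "```python\n",
    "param_grid = {'n_neighbors': range(1, 30, 2)}\n",
    "grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)\n",
    "grid.fit(X_train, y_train)\n",
    "print(grid.best_params_, grid.score(X_test, y_test))\n",
    "```"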
242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "anaconda-cloud": {}, 247 | "kernelspec": { 248 | "display_name": "Python [default]", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.1" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 1 267 | } 268 | -------------------------------------------------------------------------------- /bonus - Trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "%matplotlib inline\n", 14 | "plt.rcParams[\"figure.dpi\"] = 200\n", 15 | "np.set_printoptions(precision=3)\n", 16 | "import pandas as pd\n", 17 | "from sklearn.model_selection import train_test_split\n", 18 | "from sklearn.pipeline import make_pipeline\n", 19 | "from sklearn.preprocessing import scale, StandardScaler" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "from sklearn.datasets import load_breast_cancer\n", 31 | "cancer = load_breast_cancer()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "scrolled": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "print(cancer.DESCR)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "X_train, X_test, y_train, y_test = train_test_split(\n", 54 | "    cancer.data, cancer.target, stratify=cancer.target, random_state=0)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "source": [ 63 | "# Tree visualization" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from sklearn.tree import DecisionTreeClassifier\n", 73 | "tree = DecisionTreeClassifier(max_depth=2)\n", 74 | "tree.fit(X_train, y_train)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from sklearn.tree import export_graphviz\n", 84 | "tree_dot = export_graphviz(tree, out_file=None, feature_names=cancer.feature_names, filled=True)\n", 85 | "print(tree_dot)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "# import graphviz\n", 97 | "# graphviz.Source(tree_dot)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from tree_plotting import plot_tree\n", 107 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "# Parameter Tuning" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [],
122 | "source": [ 123 | "plt.figure(figsize=(20, 5))\n", 124 | "tree = DecisionTreeClassifier().fit(X_train, y_train)\n", 125 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "tree = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)\n", 135 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_train, y_train)\n", 145 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "tree = DecisionTreeClassifier(min_samples_split=50).fit(X_train, y_train)\n", 155 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "from sklearn.model_selection import GridSearchCV\n", 165 | "param_grid = {'max_depth':range(1, 7)}\n", 166 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=10)\n", 167 | "grid.fit(X_train, y_train)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "scrolled": true 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit\n", 179 | "param_grid = {'max_depth':range(1, 7)}\n", 180 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n", 181 | " cv=StratifiedShuffleSplit(100))\n", 182 | "grid.fit(X_train, y_train)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "scores = pd.DataFrame(grid.cv_results_)\n", 192 | "scores.plot(x='param_max_depth', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n", 193 | "plt.legend(loc=(1, 0))" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "from sklearn.model_selection import GridSearchCV\n", 203 | "param_grid = {'max_leaf_nodes':range(2, 20)}\n", 204 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=StratifiedShuffleSplit(100, random_state=1))\n", 205 | "grid.fit(X_train, y_train)\n", 206 | "\n", 207 | "scores = pd.DataFrame(grid.cv_results_)\n", 208 | "scores.plot(x='param_max_leaf_nodes', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n", 209 | "plt.legend(loc=(1, 0))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "scores = pd.DataFrame(grid.cv_results_)\n", 219 | "scores.plot(x='param_max_leaf_nodes', y='mean_train_score', yerr='std_train_score', ax=plt.gca())\n", 220 | "scores.plot(x='param_max_leaf_nodes', y='mean_test_score', yerr='std_test_score', ax=plt.gca())" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "grid.best_params_" 230 | ] 231 | }, 232 | { 233 | "cell_type": 
"code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "plot_tree(grid.best_estimator_, feature_names=cancer.feature_names, filled=True)\n" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "pd.Series(grid.best_estimator_.feature_importances_,\n", 250 | " index=cancer.feature_names).plot(kind=\"barh\")" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "# Exercise\n", 258 | "Apply a decision tree to the \"adult\" dataset and visualize it.\n", 259 | "\n", 260 | "Tune parameters with grid-search; use max_features, try at least max_leaf_nodes and max_depth, but separately.\n", 261 | "\n", 262 | "Visualize the resulting tree and it's feature importances." 263 | ] 264 | } 265 | ], 266 | "metadata": { 267 | "anaconda-cloud": {}, 268 | "kernelspec": { 269 | "display_name": "Python [conda root]", 270 | "language": "python", 271 | "name": "conda-root-py" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.6.1" 284 | } 285 | }, 286 | "nbformat": 4, 287 | "nbformat_minor": 2 288 | } 289 | -------------------------------------------------------------------------------- /check_env.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from __future__ import print_function\n", 12 | "from distutils.version import LooseVersion as Version\n", 13 | "import sys\n", 14 | "\n", 15 | "\n", 16 | "\n", 17 | "OK = '\\x1b[42m[ OK ]\\x1b[0m'\n", 18 | "FAIL = \"\\x1b[41m[FAIL]\\x1b[0m\"\n", 19 | "\n", 20 | "try:\n", 21 | " import importlib\n", 22 | "except ImportError:\n", 23 | " print(FAIL, \"Python version 3.4 (or 2.7) is required,\"\n", 24 | " \" but %s is installed.\" % sys.version)\n", 25 | "\n", 26 | " \n", 27 | "def import_version(pkg, min_ver, fail_msg=\"\"):\n", 28 | " mod = None\n", 29 | " try:\n", 30 | " mod = importlib.import_module(pkg)\n", 31 | " if pkg in {'PIL'}:\n", 32 | " ver = mod.VERSION\n", 33 | " else:\n", 34 | " ver = mod.__version__\n", 35 | " if Version(ver) < min_ver:\n", 36 | " print(FAIL, \"%s version %s or higher required, but %s installed.\"\n", 37 | " % (lib, min_ver, ver))\n", 38 | " else:\n", 39 | " print(OK, '%s version %s' % (pkg, ver))\n", 40 | " except ImportError:\n", 41 | " print(FAIL, '%s not installed. 
%s' % (pkg, fail_msg))\n", 42 | "    return mod\n", 43 | "\n", 44 | "\n", 45 | "# first check the python version\n", 46 | "print('Using python in', sys.prefix)\n", 47 | "print(sys.version)\n", 48 | "pyversion = Version(sys.version)\n", 49 | "if pyversion >= \"3\":\n", 50 | "    if pyversion < \"3.4\":\n", 51 | "        print(FAIL, \"Python version 3.4 (or 2.7) is required,\"\n", 52 | "              \" but %s is installed.\" % sys.version)\n", 53 | "elif pyversion >= \"2\":\n", 54 | "    if pyversion < \"2.7\":\n", 55 | "        print(FAIL, \"Python version 2.7 is required,\"\n", 56 | "              \" but %s is installed.\" % sys.version)\n", 57 | "else:\n", 58 | "    print(FAIL, \"Unknown Python version: %s\" % sys.version)\n", 59 | "\n", 60 | "print()\n", 61 | "requirements = {'numpy': \"1.6.1\", 'scipy': \"0.9\", 'matplotlib': \"1.0\",\n", 62 | "                'IPython': \"3.0\", 'sklearn': \"0.19\", 'pandas': \"0.18\",\n", 63 | "                'seaborn': \"0.5\", 'PIL': \"1.1.7\"}\n", 64 | "\n", 65 | "# now the dependencies\n", 66 | "for lib, required_version in list(requirements.items()):\n", 67 | "    import_version(lib, required_version)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "anaconda-cloud": {}, 82 | "kernelspec": { 83 | "display_name": "Python [default]", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.6.1" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 1 102 | } 103 | -------------------------------------------------------------------------------- /images/check_env-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/check_env-1.png -------------------------------------------------------------------------------- /images/check_env-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/check_env-2.png -------------------------------------------------------------------------------- /images/cross_validation.svg: -------------------------------------------------------------------------------- [SVG image: cross-validation diagram; recoverable text labels are Fold 1 through Fold 5, repeated for Split 1 through Split 5] -------------------------------------------------------------------------------- /images/download-repo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/download-repo.png -------------------------------------------------------------------------------- /images/supervised_workflow.svg:
-------------------------------------------------------------------------------- [SVG image: supervised learning workflow diagram; recoverable text labels are Training Data, Test Data, Training Labels, Model, Prediction, Test Labels, Evaluation, Training, Generalization] -------------------------------------------------------------------------------- /images/tab-help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/tab-help.png -------------------------------------------------------------------------------- /solutions/solutions.py: -------------------------------------------------------------------------------- 1 | ######### 02 2 | fig, axes = plt.subplots(1, 4) 3 | line = np.linspace(-3, 3, 20) 4 | for i, ax in enumerate(axes): 5 |     ax.plot(line ** (i + 1)) 6 | 7 | ######### 03 8 | housing.plot(x='latitude', y='longitude', kind='scatter', c='population') 9 | 10 | housing.plot(x='latitude', y='longitude', kind='scatter', c='population', alpha=.5, cmap='viridis') 11 | 12 | 13 | # could have also done kind='hexbin' but that doesn't show us a helpful docstring: 14 | # housing.plot(x='latitude', y='longitude', kind='hexbin', C='population', cmap='viridis') 15 | housing.plot.hexbin(x='latitude', y='longitude', C='population', cmap='viridis', linewidth=0) 16 | 17 | housing.plot.hexbin(x='latitude', y='longitude', C='population', cmap='viridis', reduce_C_function=sum, linewidth=0) 18 | 19 | 20 | 21 | fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks': ()}) 22 | for column, ax in zip(housing.columns[2:-2], axes.ravel()): 23 |     if column in ['housing_median_age', 'median_income']: 24 |         reduce = np.mean 25 |     else: 26 |         reduce = np.sum 27 |     housing.plot.hexbin(x='latitude', y='longitude', C=column, cmap='viridis', reduce_C_function=reduce, linewidth=0, ax=ax) 28 |     ax.set_title(column) 29 | plt.tight_layout() 30 | 31 | 32 | housing.plot.hexbin(x='latitude', y='longitude', C='median_house_value', cmap='viridis', linewidth=0) 33 | 34 | 35 | # two outliers: 36 | print((housing_nonull.population > 20000).sum()) 37 | pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2], c=housing_nonull.population > 20000, cmap='tab10'); 38 | plt.figure() 39 | plt.scatter(housing_nonull.latitude, housing_nonull.longitude, c=plt.cm.tab10((housing_nonull.population > 20000).astype(int)), s=3) 40 | 41 | # vs dependent variable: 42 | fig, axes = plt.subplots(4, 2) 43 | for ax, column in zip(axes.ravel(), continuous_dependent): 44 |     ax.scatter(housing_nonull[column], housing_nonull['median_house_value'], alpha=.01) 45 |     ax.set_title(column) 46 | plt.tight_layout() 47 | 48 | # vs dependent variable with seaborn 49 | 50 | sns.pairplot(housing_nonull, x_vars=continuous_dependent, y_vars=["median_house_value"], 51 |              kind="scatter", plot_kws={'alpha': .01, 'edgecolor': None}); 52 | # we'll see a nice way in the next notebook 53 | 54 | ####### 4 55 | # for the housing data 56 | housing = pd.read_csv("data/housing.csv") 57 | 58 | housing.head() 59 | 60 | housing_dummies = pd.get_dummies(housing) 61 | housing_dummies.head() 62 | 63 | # ridge regression on housing 64 | from sklearn.linear_model import Ridge 65 | 66 | housing = pd.read_csv("data/housing.csv") 67 | housing = housing.dropna(axis=0) 68 | housing_dummies = pd.get_dummies(housing) 69 | housing_dummies.head() 70 | y = housing_dummies.pop("median_house_value") 71 | X = housing_dummies 72 | print(X.head()) 73 | 74 | X_train, X_test,
y_train, y_test = train_test_split(X, y) 75 | scaler = StandardScaler().fit(X_train) 76 | X_train_scaled = scaler.transform(X_train) 77 | X_test_scaled = scaler.transform(X_test) 78 | ridge = Ridge() 79 | 80 | ridge.fit(X_train_scaled, y_train) 81 | ridge.score(X_test_scaled, y_test) 82 | -------------------------------------------------------------------------------- /tree_plotting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numbers import Integral 3 | 4 | from sklearn.externals import six 5 | from sklearn.tree.export import _color_brew, _criterion, _tree 6 | 7 | 8 | def plot_tree(decision_tree, max_depth=None, feature_names=None, 9 |               class_names=None, label='all', filled=False, 10 |               leaves_parallel=False, impurity=True, node_ids=False, 11 |               proportion=False, rotate=False, rounded=False, 12 |               special_characters=False, precision=3, ax=None, fontsize=None): 13 |     """Plot a decision tree. 14 | 15 |     The sample counts that are shown are weighted with any sample_weights that 16 |     might be present. 17 | 18 |     Parameters 19 |     ---------- 20 |     decision_tree : decision tree classifier 21 |         The decision tree to be plotted. 22 | 23 |     max_depth : int, optional (default=None) 24 |         The maximum depth of the representation. If None, the tree is fully 25 |         generated. 26 | 27 |     feature_names : list of strings, optional (default=None) 28 |         Names of each of the features. 29 | 30 |     class_names : list of strings, bool or None, optional (default=None) 31 |         Names of each of the target classes in ascending numerical order. 32 |         Only relevant for classification and not supported for multi-output. 33 |         If ``True``, shows a symbolic representation of the class name. 34 | 35 |     label : {'all', 'root', 'none'}, optional (default='all') 36 |         Whether to show informative labels for impurity, etc. 37 |         Options include 'all' to show at every node, 'root' to show only at 38 |         the top root node, or 'none' to not show at any node. 39 | 40 |     filled : bool, optional (default=False) 41 |         When set to ``True``, paint nodes to indicate majority class for 42 |         classification, extremity of values for regression, or purity of node 43 |         for multi-output. 44 | 45 |     leaves_parallel : bool, optional (default=False) 46 |         When set to ``True``, draw all leaf nodes at the bottom of the tree. 47 | 48 |     impurity : bool, optional (default=True) 49 |         When set to ``True``, show the impurity at each node. 50 | 51 |     node_ids : bool, optional (default=False) 52 |         When set to ``True``, show the ID number on each node. 53 | 54 |     proportion : bool, optional (default=False) 55 |         When set to ``True``, change the display of 'values' and/or 'samples' 56 |         to be proportions and percentages respectively. 57 | 58 |     rotate : bool, optional (default=False) 59 |         When set to ``True``, orient tree left to right rather than top-down. 60 | 61 |     rounded : bool, optional (default=False) 62 |         When set to ``True``, draw node boxes with rounded corners and use 63 |         Helvetica fonts instead of Times-Roman. 64 | 65 |     special_characters : bool, optional (default=False) 66 |         When set to ``False``, ignore special characters for PostScript 67 |         compatibility. 68 | 69 |     precision : int, optional (default=3) 70 |         Number of digits of precision for floating point in the values of 71 |         impurity, threshold and value attributes of each node. 72 | 73 |     ax : matplotlib axis, optional (default=None) 74 |         Axes to plot to. If None, use the current axis.
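    fontsize : int, optional (default=None)
        Size of the text font. If None, the font size is determined
        automatically so that the node annotations fit within the figure.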
75 | 76 |     Examples 77 |     -------- 78 |     >>> from sklearn.datasets import load_iris
    >>> from sklearn import tree
79 | 80 |     >>> clf = tree.DecisionTreeClassifier() 81 |     >>> iris = load_iris() 82 | 83 |     >>> clf = clf.fit(iris.data, iris.target) 84 |     >>> plot_tree(clf)  # doctest: +SKIP 85 | 86 |     """ 87 |     exporter = _MPLTreeExporter( 88 |         max_depth=max_depth, feature_names=feature_names, 89 |         class_names=class_names, label=label, filled=filled, 90 |         leaves_parallel=leaves_parallel, impurity=impurity, node_ids=node_ids, 91 |         proportion=proportion, rotate=rotate, rounded=rounded, 92 |         special_characters=special_characters, precision=precision, 93 |         fontsize=fontsize) 94 |     exporter.export(decision_tree, ax=ax) 95 | 96 | 97 | class _BaseTreeExporter(object): 98 |     def get_color(self, value): 99 |         # Find the appropriate color & intensity for a node 100 |         if self.colors['bounds'] is None: 101 |             # Classification tree 102 |             color = list(self.colors['rgb'][np.argmax(value)]) 103 |             sorted_values = sorted(value, reverse=True) 104 |             if len(sorted_values) == 1: 105 |                 alpha = 0 106 |             else: 107 |                 alpha = ((sorted_values[0] - sorted_values[1]) 108 |                          / (1 - sorted_values[1])) 109 |         else: 110 |             # Regression tree or multi-output 111 |             color = list(self.colors['rgb'][0]) 112 |             alpha = ((value - self.colors['bounds'][0]) / 113 |                      (self.colors['bounds'][1] - self.colors['bounds'][0])) 114 |         # unpack numpy scalars 115 |         alpha = float(alpha) 116 |         # compute the color as alpha against white 117 |         color = [int(round(alpha * c + (1 - alpha) * 255, 0)) for c in color] 118 |         # Return html color code in #RRGGBB format 119 |         hex_codes = [str(i) for i in range(10)] 120 |         hex_codes.extend(['a', 'b', 'c', 'd', 'e', 'f']) 121 |         color = [hex_codes[c // 16] + hex_codes[c % 16] for c in color] 122 | 123 |         return '#' + ''.join(color) 124 | 125 |     def get_fill_color(self, tree, node_id): 126 |         # Fetch appropriate color for node 127 |         if 'rgb' not in self.colors: 128 |             # Initialize colors and bounds if required 129 |             self.colors['rgb'] = _color_brew(tree.n_classes[0]) 130 |             if tree.n_outputs != 1: 131 |                 # Find max and min impurities for multi-output 132 |                 self.colors['bounds'] = (np.min(-tree.impurity), 133 |                                          np.max(-tree.impurity)) 134 |             elif (tree.n_classes[0] == 1 and 135 |                   len(np.unique(tree.value)) != 1): 136 |                 # Find max and min values in leaf nodes for regression 137 |                 self.colors['bounds'] = (np.min(tree.value), 138 |                                          np.max(tree.value)) 139 |         if tree.n_outputs == 1: 140 |             node_val = (tree.value[node_id][0, :] / 141 |                         tree.weighted_n_node_samples[node_id]) 142 |             if tree.n_classes[0] == 1: 143 |                 # Regression 144 |                 node_val = tree.value[node_id][0, :] 145 |         else: 146 |             # If multi-output color node by impurity 147 |             node_val = -tree.impurity[node_id] 148 |         return self.get_color(node_val) 149 | 150 |     def node_to_str(self, tree, node_id, criterion): 151 |         # Generate the node content string 152 |         if tree.n_outputs == 1: 153 |             value = tree.value[node_id][0, :] 154 |         else: 155 |             value = tree.value[node_id] 156 | 157 |         # Should labels be shown?
158 | labels = (self.label == 'root' and node_id == 0) or self.label == 'all' 159 | 160 | characters = self.characters 161 | node_string = characters[-1] 162 | 163 | # Write node ID 164 | if self.node_ids: 165 | if labels: 166 | node_string += 'node ' 167 | node_string += characters[0] + str(node_id) + characters[4] 168 | 169 | # Write decision criteria 170 | if tree.children_left[node_id] != _tree.TREE_LEAF: 171 | # Always write node decision criteria, except for leaves 172 | if self.feature_names is not None: 173 | feature = self.feature_names[tree.feature[node_id]] 174 | else: 175 | feature = "X%s%s%s" % (characters[1], 176 | tree.feature[node_id], 177 | characters[2]) 178 | node_string += '%s %s %s%s' % (feature, 179 | characters[3], 180 | round(tree.threshold[node_id], 181 | self.precision), 182 | characters[4]) 183 | 184 | # Write impurity 185 | if self.impurity: 186 | if isinstance(criterion, _criterion.FriedmanMSE): 187 | criterion = "friedman_mse" 188 | elif not isinstance(criterion, six.string_types): 189 | criterion = "impurity" 190 | if labels: 191 | node_string += '%s = ' % criterion 192 | node_string += (str(round(tree.impurity[node_id], self.precision)) 193 | + characters[4]) 194 | 195 | # Write node sample count 196 | if labels: 197 | node_string += 'samples = ' 198 | if self.proportion: 199 | percent = (100. * tree.n_node_samples[node_id] / 200 | float(tree.n_node_samples[0])) 201 | node_string += (str(round(percent, 1)) + '%' + 202 | characters[4]) 203 | else: 204 | node_string += (str(tree.n_node_samples[node_id]) + 205 | characters[4]) 206 | 207 | # Write node class distribution / regression value 208 | if self.proportion and tree.n_classes[0] != 1: 209 | # For classification this will show the proportion of samples 210 | value = value / tree.weighted_n_node_samples[node_id] 211 | if labels: 212 | node_string += 'value = ' 213 | if tree.n_classes[0] == 1: 214 | # Regression 215 | value_text = np.around(value, self.precision) 216 | elif self.proportion: 217 | # Classification 218 | value_text = np.around(value, self.precision) 219 | elif np.all(np.equal(np.mod(value, 1), 0)): 220 | # Classification without floating-point weights 221 | value_text = value.astype(int) 222 | else: 223 | # Classification with floating-point weights 224 | value_text = np.around(value, self.precision) 225 | # Strip whitespace 226 | value_text = str(value_text.astype('S32')).replace("b'", "'") 227 | value_text = value_text.replace("' '", ", ").replace("'", "") 228 | if tree.n_classes[0] == 1 and tree.n_outputs == 1: 229 | value_text = value_text.replace("[", "").replace("]", "") 230 | value_text = value_text.replace("\n ", characters[4]) 231 | node_string += value_text + characters[4] 232 | 233 | # Write node majority class 234 | if (self.class_names is not None and 235 | tree.n_classes[0] != 1 and 236 | tree.n_outputs == 1): 237 | # Only done for single-output classification trees 238 | if labels: 239 | node_string += 'class = ' 240 | if self.class_names is not True: 241 | class_name = self.class_names[np.argmax(value)] 242 | else: 243 | class_name = "y%s%s%s" % (characters[1], 244 | np.argmax(value), 245 | characters[2]) 246 | node_string += class_name 247 | 248 | # Clean up any trailing newlines 249 | if node_string.endswith(characters[4]): 250 | node_string = node_string[:-len(characters[4])] 251 | 252 | return node_string + characters[5] 253 | 254 | 255 | class _MPLTreeExporter(_BaseTreeExporter): 256 | def __init__(self, max_depth=None, feature_names=None, 257 | class_names=None, 
label='all', filled=False, 258 | leaves_parallel=False, impurity=True, node_ids=False, 259 | proportion=False, rotate=False, rounded=False, 260 | special_characters=False, precision=3, fontsize=None): 261 | self.max_depth = max_depth 262 | self.feature_names = feature_names 263 | self.class_names = class_names 264 | self.label = label 265 | self.filled = filled 266 | self.leaves_parallel = leaves_parallel 267 | self.impurity = impurity 268 | self.node_ids = node_ids 269 | self.proportion = proportion 270 | self.rotate = rotate 271 | self.rounded = rounded 272 | self.special_characters = special_characters 273 | self.precision = precision 274 | self.fontsize = fontsize 275 | self._scaley = 10 276 | 277 | # validate 278 | if isinstance(precision, Integral): 279 | if precision < 0: 280 | raise ValueError("'precision' should be greater or equal to 0." 281 | " Got {} instead.".format(precision)) 282 | else: 283 | raise ValueError("'precision' should be an integer. Got {}" 284 | " instead.".format(type(precision))) 285 | 286 | # The depth of each node for plotting with 'leaf' option 287 | self.ranks = {'leaves': []} 288 | # The colors to render each node with 289 | self.colors = {'bounds': None} 290 | 291 | self.characters = ['#', '[', ']', '<=', '\n', '', ''] 292 | 293 | self.bbox_args = dict(fc='w') 294 | if self.rounded: 295 | self.bbox_args['boxstyle'] = "round" 296 | self.arrow_args = dict(arrowstyle="<-") 297 | 298 | def _make_tree(self, node_id, et): 299 | # traverses _tree.Tree recursively, builds intermediate 300 | # "_reingold_tilford.Tree" object 301 | name = self.node_to_str(et, node_id, criterion='entropy') 302 | if (et.children_left[node_id] != et.children_right[node_id]): 303 | children = [self._make_tree(et.children_left[node_id], et), 304 | self._make_tree(et.children_right[node_id], et)] 305 | else: 306 | return Tree(name, node_id) 307 | return Tree(name, node_id, *children) 308 | 309 | def export(self, decision_tree, ax=None): 310 | import matplotlib.pyplot as plt 311 | from matplotlib.text import Annotation 312 | 313 | if ax is None: 314 | ax = plt.gca() 315 | ax.set_axis_off() 316 | my_tree = self._make_tree(0, decision_tree.tree_) 317 | dt = buchheim(my_tree) 318 | self._scalex = 1 319 | self.recurse(dt, decision_tree.tree_, ax) 320 | 321 | anns = [ann for ann in ax.get_children() 322 | if isinstance(ann, Annotation)] 323 | 324 | # get all the annotated points 325 | xys = [ann.xyann for ann in anns] 326 | 327 | mins = np.min(xys, axis=0) 328 | maxs = np.max(xys, axis=0) 329 | 330 | ax.set_xlim(mins[0], maxs[0]) 331 | ax.set_ylim(maxs[1], mins[1]) 332 | 333 | if self.fontsize is None: 334 | # get figure to data transform 335 | inv = ax.transData.inverted() 336 | renderer = ax.figure.canvas.get_renderer() 337 | # update sizes of all bboxes 338 | for ann in anns: 339 | ann.update_bbox_position_size(renderer) 340 | # get max box width 341 | widths = [inv.get_matrix()[0, 0] 342 | * ann.get_bbox_patch().get_window_extent().width 343 | for ann in anns] 344 | # get minimum max size to not be too big. 
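# (clamping the maximum width at 1 below means the font is only ever
# scaled down, never enlarged: the layout places sibling nodes roughly
# one data unit apart, so a box wider than 1 would overlap its neighbour)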
345 | max_width = max(max(widths), 1) 346 | # adjust fontsize to avoid overlap 347 | # width should be around 1 in data coordinates 348 | size = anns[0].get_fontsize() / max_width 349 | for ann in anns: 350 | ann.set_fontsize(size) 351 | 352 | def recurse(self, node, tree, ax, depth=0): 353 | kwargs = dict(bbox=self.bbox_args, ha='center', va='center', 354 | zorder=100 - 10 * depth) 355 | 356 | if self.fontsize is not None: 357 | kwargs['fontsize'] = self.fontsize 358 | 359 | xy = (node.x * self._scalex, node.y * self._scaley) 360 | 361 | if self.max_depth is None or depth <= self.max_depth: 362 | if self.filled: 363 | kwargs['bbox']['fc'] = self.get_fill_color(tree, 364 | node.tree.node_id) 365 | if node.parent is None: 366 | # root 367 | ax.annotate(node.tree.node, xy, **kwargs) 368 | else: 369 | xy_parent = (node.parent.x * self._scalex, 370 | node.parent.y * self._scaley) 371 | kwargs["arrowprops"] = self.arrow_args 372 | ax.annotate(node.tree.node, xy_parent, xy, **kwargs) 373 | for child in node.children: 374 | self.recurse(child, tree, ax, depth=depth + 1) 375 | 376 | else: 377 | xy_parent = (node.parent.x * self._scalex, node.parent.y * 378 | self._scaley) 379 | kwargs["arrowprops"] = self.arrow_args 380 | kwargs['bbox']['fc'] = 'grey' 381 | ax.annotate("\n (...) \n", xy_parent, xy, **kwargs) 382 | 383 | 384 | class DrawTree(object): 385 | def __init__(self, tree, parent=None, depth=0, number=1): 386 | self.x = -1. 387 | self.y = depth 388 | self.tree = tree 389 | self.children = [DrawTree(c, self, depth + 1, i + 1) 390 | for i, c 391 | in enumerate(tree.children)] 392 | self.parent = parent 393 | self.thread = None 394 | self.mod = 0 395 | self.ancestor = self 396 | self.change = self.shift = 0 397 | self._lmost_sibling = None 398 | # this is the number of the node in its group of siblings 1..n 399 | self.number = number 400 | 401 | def left(self): 402 | return self.thread or len(self.children) and self.children[0] 403 | 404 | def right(self): 405 | return self.thread or len(self.children) and self.children[-1] 406 | 407 | def lbrother(self): 408 | n = None 409 | if self.parent: 410 | for node in self.parent.children: 411 | if node == self: 412 | return n 413 | else: 414 | n = node 415 | return n 416 | 417 | def get_lmost_sibling(self): 418 | if not self._lmost_sibling and self.parent and self != \ 419 | self.parent.children[0]: 420 | self._lmost_sibling = self.parent.children[0] 421 | return self._lmost_sibling 422 | lmost_sibling = property(get_lmost_sibling) 423 | 424 | def __str__(self): 425 | return "%s: x=%s mod=%s" % (self.tree, self.x, self.mod) 426 | 427 | def __repr__(self): 428 | return self.__str__() 429 | 430 | 431 | def buchheim(tree): 432 | dt = firstwalk(DrawTree(tree)) 433 | min = second_walk(dt) 434 | if min < 0: 435 | third_walk(dt, -min) 436 | return dt 437 | 438 | 439 | def third_walk(tree, n): 440 | tree.x += n 441 | for c in tree.children: 442 | third_walk(c, n) 443 | 444 | 445 | def firstwalk(v, distance=1.): 446 | if len(v.children) == 0: 447 | if v.lmost_sibling: 448 | v.x = v.lbrother().x + distance 449 | else: 450 | v.x = 0. 
451 | else: 452 | default_ancestor = v.children[0] 453 | for w in v.children: 454 | firstwalk(w) 455 | default_ancestor = apportion(w, default_ancestor, distance) 456 | # print("finished v =", v.tree, "children") 457 | execute_shifts(v) 458 | 459 | midpoint = (v.children[0].x + v.children[-1].x) / 2 460 | 461 | w = v.lbrother() 462 | if w: 463 | v.x = w.x + distance 464 | v.mod = v.x - midpoint 465 | else: 466 | v.x = midpoint 467 | return v 468 | 469 | 470 | def apportion(v, default_ancestor, distance): 471 | w = v.lbrother() 472 | if w is not None: 473 | # in buchheim notation: 474 | # i == inner; o == outer; r == right; l == left; r = +; l = - 475 | vir = vor = v 476 | vil = w 477 | vol = v.lmost_sibling 478 | sir = sor = v.mod 479 | sil = vil.mod 480 | sol = vol.mod 481 | while vil.right() and vir.left(): 482 | vil = vil.right() 483 | vir = vir.left() 484 | vol = vol.left() 485 | vor = vor.right() 486 | vor.ancestor = v 487 | shift = (vil.x + sil) - (vir.x + sir) + distance 488 | if shift > 0: 489 | move_subtree(ancestor(vil, v, default_ancestor), v, shift) 490 | sir = sir + shift 491 | sor = sor + shift 492 | sil += vil.mod 493 | sir += vir.mod 494 | sol += vol.mod 495 | sor += vor.mod 496 | if vil.right() and not vor.right(): 497 | vor.thread = vil.right() 498 | vor.mod += sil - sor 499 | else: 500 | if vir.left() and not vol.left(): 501 | vol.thread = vir.left() 502 | vol.mod += sir - sol 503 | default_ancestor = v 504 | return default_ancestor 505 | 506 | 507 | def move_subtree(wl, wr, shift): 508 | subtrees = wr.number - wl.number 509 | # print(wl.tree, "is conflicted with", wr.tree, 'moving', subtrees, 510 | # 'shift', shift) 511 | # print wl, wr, wr.number, wl.number, shift, subtrees, shift/subtrees 512 | wr.change -= shift / subtrees 513 | wr.shift += shift 514 | wl.change += shift / subtrees 515 | wr.x += shift 516 | wr.mod += shift 517 | 518 | 519 | def execute_shifts(v): 520 | shift = change = 0 521 | for w in v.children[::-1]: 522 | # print("shift:", w, shift, w.change) 523 | w.x += shift 524 | w.mod += shift 525 | change += w.change 526 | shift += w.shift + change 527 | 528 | 529 | def ancestor(vil, v, default_ancestor): 530 | # the relevant text is at the bottom of page 7 of 531 | # "Improving Walker's Algorithm to Run in Linear Time" by Buchheim et al, 532 | # (2002) 533 | # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.16.8757&rep=rep1&type=pdf 534 | if vil.ancestor in v.parent.children: 535 | return vil.ancestor 536 | else: 537 | return default_ancestor 538 | 539 | 540 | def second_walk(v, m=0, depth=0, min=None): 541 | v.x += m 542 | v.y = depth 543 | 544 | if min is None or v.x < min: 545 | min = v.x 546 | 547 | for w in v.children: 548 | min = second_walk(w, m + v.mod, depth + 1, min) 549 | 550 | return min 551 | 552 | 553 | class Tree(object): 554 | def __init__(self, node="", node_id=-1, *children): 555 | self.node = node 556 | self.width = len(node) 557 | self.node_id = node_id 558 | if children: 559 | self.children = children 560 | else: 561 | self.children = [] 562 | --------------------------------------------------------------------------------