├── .gitignore
├── 01 - Jupyter Notebooks, Prerequisits.ipynb
├── 02 - Matplotlib Refersher.ipynb
├── 03 - Exploration with Pandas, Matplotlib and Seaborn.ipynb
├── 04 - Machine Learning with Scikit-learn.ipynb
├── 05 - More data - the adult dataset.ipynb
├── 06 - Closing comments.ipynb
├── LICENSE
├── README.md
├── bonus - Cross-validation and Grid Search.ipynb
├── bonus - Trees.ipynb
├── check_env.ipynb
├── data
│   ├── adult.csv
│   └── housing.csv
├── images
│   ├── check_env-1.png
│   ├── check_env-2.png
│   ├── cross_validation.svg
│   ├── data_representation.svg
│   ├── download-repo.png
│   ├── grid_search_cross_validation.svg
│   ├── supervised_workflow.svg
│   ├── tab-help.png
│   └── train_test_split_matrix.svg
├── solutions
│   └── solutions.py
└── tree_plotting.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/01 - Jupyter Notebooks, Prerequisits.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Jupyter notebooks\n",
8 | "Important shortcuts:\n",
9 | "\n",
10 | "- run and move to next cell: shift + return\n",
11 | "- run and stay on same cell: alt + return\n",
12 | "- insert cell below: ctrl + m, then b (or then a for \"above\")\n",
13 | "\n",
14 | "Two modes:\n",
15 | "- insert mode\n",
16 | "- edit mode"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Getting Help\n",
24 | "- Shortcuts: press h in edit mode\n",
25 | "- press tab inside method calls (press tab again to see more):\n",
26 | "\n",
27 | "- use \"?\" and run cell"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "import numpy as np"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "np.bincount()"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "collapsed": true
57 | },
58 | "outputs": [],
59 | "source": [
60 | "np.bincount?"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "## Exercise:\n",
68 | "Use the help to find out what the options to ``np.unique`` are.\n",
69 | "Use np.unique to convert the array ``['one', 'two', 'three', 'one', 'two', 'three']`` into the array ``[0, 2, 1, 0, 2, 1]``."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "collapsed": true
77 | },
78 | "outputs": [],
79 | "source": [
80 | "ar = ['one', 'two', 'three', 'one', 'two', 'three']\n",
81 | "# your solution here"
82 | ]
83 | },
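   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "One possible solution sketch, using the ``return_inverse`` option of ``np.unique``:"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "# return_inverse gives, for each element, the index of its sorted unique value\n",
   | "unique_values, encoded = np.unique(ar, return_inverse=True)\n",
   | "print(unique_values)  # ['one' 'three' 'two']\n",
   | "print(encoded)  # [0 2 1 0 2 1]"
   | ]
   | },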
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Plotting with matplotlib\n",
89 | "Need to use either \n",
90 | "```\n",
91 | "% matplotlib inline\n",
92 | "```\n",
93 | "or\n",
94 | "```\n",
95 | "% matplotlib notebook\n",
96 | "```\n",
97 | "Only one in each notebook!\n",
98 | "using ``inline`` will just sent ``png`` images to browser, using ``notebook`` will provide\n",
99 | "interactivity and allow updating old figures.\n",
100 | "With ``notebook`` you need to make sure to create a new figure before plotting, otherwise the last one will be updated!"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "%matplotlib notebook\n",
112 | "import matplotlib.pyplot as plt\n",
113 | "\n",
114 | "X = np.random.normal(size=(12, 2))\n",
115 | "plt.scatter(X[:, 0], X[:, 1])"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "collapsed": true
123 | },
124 | "outputs": [],
125 | "source": [
126 | "plt.plot(X[:, 0])"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "# create a new figure\n",
138 | "plt.figure()\n",
139 | "plt.plot(X[:, 0])"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "## Exercise\n",
147 | "Create a new figure and plot a sin wave. You can use ``np.linspace`` to create equally spaced numbers in a given range."
148 | ]
149 | },
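   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "A possible solution sketch:"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "# create a new figure, so the scatter figure above is not updated\n",
   | "plt.figure()\n",
   | "x = np.linspace(-10, 10, 200)\n",
   | "plt.plot(x, np.sin(x))"
   | ]
   | },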
150 | {
151 | "cell_type": "markdown",
152 | "metadata": {},
153 | "source": [
154 | "## Best practices for data analysis in Jupyter\n",
155 | "- use standard imports\n",
156 | "- don't ``import *``\n",
157 | "- be mindful of the state in the notebook!"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {
164 | "collapsed": true
165 | },
166 | "outputs": [],
167 | "source": [
168 | "x = 1"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "x = x + 1"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {
186 | "collapsed": true
187 | },
188 | "outputs": [],
189 | "source": [
190 | "print(x)"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "Avoid cells you can't run again:"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {
204 | "collapsed": true
205 | },
206 | "outputs": [],
207 | "source": [
208 | "data = {'a': [1, 2, 3], 'b': [999, 1, 2]}"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": null,
214 | "metadata": {
215 | "collapsed": true
216 | },
217 | "outputs": [],
218 | "source": [
219 | "column_a = data.pop('a')"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {
226 | "collapsed": true
227 | },
228 | "outputs": [],
229 | "source": [
230 | "print(column_a)\n",
231 | "print(data)"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "## Not mutating variables helps"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {
245 | "collapsed": true
246 | },
247 | "outputs": [],
248 | "source": [
249 | "x = 1"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "collapsed": true
257 | },
258 | "outputs": [],
259 | "source": [
260 | "x2 = x + 1"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {
267 | "collapsed": true
268 | },
269 | "outputs": [],
270 | "source": [
271 | "print(x2)"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "## Exercise\n",
279 | "Rewrite the code for the ``data`` dict above so that you don't mutate ``data``, but that the ``print`` stays the same."
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "collapsed": true
287 | },
288 | "outputs": [],
289 | "source": [
290 | "# solution Here"
291 | ]
292 | }
293 | ],
294 | "metadata": {
295 | "kernelspec": {
296 | "display_name": "Python [default]",
297 | "language": "python",
298 | "name": "python3"
299 | },
300 | "language_info": {
301 | "codemirror_mode": {
302 | "name": "ipython",
303 | "version": 3
304 | },
305 | "file_extension": ".py",
306 | "mimetype": "text/x-python",
307 | "name": "python",
308 | "nbconvert_exporter": "python",
309 | "pygments_lexer": "ipython3",
310 | "version": "3.6.1"
311 | }
312 | },
313 | "nbformat": 4,
314 | "nbformat_minor": 2
315 | }
316 |
--------------------------------------------------------------------------------
/02 - Matplotlib Refersher.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Matplotlib API refresher"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "% matplotlib notebook\n",
19 | "import matplotlib.pyplot as plt"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Matplotlib \"stateful\" api\n",
27 | "Modifies \"current figure\""
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "plt.plot(range(10))\n",
39 | "plt.plot(range(10, 0, -1))"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "collapsed": true
47 | },
48 | "outputs": [],
49 | "source": [
50 | "import numpy as np\n",
51 | "plt.plot(np.sin(np.linspace(-3, 3, 20)))"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": [
58 | "## Works also with subplot"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "collapsed": true
66 | },
67 | "outputs": [],
68 | "source": [
69 | "plt.figure()\n",
70 | "# create a subplot by specifying grid width, grid height and index:\n",
71 | "# 2x2 grid, first plot (one-indexed)\n",
72 | "plt.subplot(2, 2, 1)\n",
73 | "# plt.title changes \"current axes\"\n",
74 | "plt.title(\"first plot\")\n",
75 | "plt.plot(np.random.uniform(size=10))\n",
76 | "\n",
77 | "plt.subplot(2, 2, 2)\n",
78 | "# now subplot 2 is current\n",
79 | "plt.title(\"second plot\")\n",
80 | "plt.plot(np.random.uniform(size=10), 'o')\n",
81 | "\n",
82 | "plt.subplot(2, 2, 3)\n",
83 | "plt.title(\"third plot\")\n",
84 | "plt.barh(range(10), np.random.uniform(size=10))\n",
85 | "\n",
86 | "plt.subplot(2, 2, 4)\n",
87 | "plt.title(\"fourth plot\")\n",
88 | "plt.imshow(np.random.uniform(size=(10, 10)))\n",
89 | "plt.tight_layout()"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## Object oriented / Axis oriented API is more powerful\n",
97 | "Have an object per axes, plot directly to axes.\n",
98 | "\n",
99 | "methods modifying the axes have ``set_``!"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": true
107 | },
108 | "outputs": [],
109 | "source": [
110 | "plt.figure()\n",
111 | "ax11 = plt.subplot(2, 2, 1)\n",
112 | "ax21 = plt.subplot(2, 2, 2)\n",
113 | "ax12 = plt.subplot(2, 2, 3)\n",
114 | "ax22 = plt.subplot(2, 2, 4)\n",
115 | "\n",
116 | "ax11.set_title(\"ax11\")\n",
117 | "ax21.set_title(\"ax21\")\n",
118 | "ax12.set_title(\"ax12\")\n",
119 | "ax22.set_title(\"ax22\")\n",
120 | "\n",
121 | "ax21.plot(np.random.randn(10))\n",
122 | "plt.tight_layout()"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {
129 | "collapsed": true
130 | },
131 | "outputs": [],
132 | "source": [
133 | "## My favorite interface: plt.subplots!"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "collapsed": true
141 | },
142 | "outputs": [],
143 | "source": [
144 | "fig, axes = plt.subplots(2, 2)\n",
145 | "ax11, ax21, ax12, ax22 = axes.ravel()\n",
146 | "ax11.set_title(\"ax11\")\n",
147 | "ax21.set_title(\"ax21\")\n",
148 | "ax12.set_title(\"ax12\")\n",
149 | "ax22.set_title(\"ax22\")"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "## Exercise\n",
157 | "Create a grid plot with one row and four columns where the first entry plots the function ``f(x) = x``, the second ``f(x)=x ** 2``, the third ``f(x)=x ** 3`` and the fourth ``f(x)=x**4``."
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "metadata": {
164 | "collapsed": true
165 | },
166 | "outputs": [],
167 | "source": [
168 | "# Your solution\n"
169 | ]
170 | },
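   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "One possible solution sketch using ``plt.subplots``:"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "fig, axes = plt.subplots(1, 4, figsize=(10, 3))\n",
   | "x = np.linspace(-2, 2, 50)\n",
   | "for i, ax in enumerate(axes):\n",
   | "    ax.plot(x, x ** (i + 1))\n",
   | "    ax.set_title(\"f(x) = x ** {}\".format(i + 1))\n",
   | "plt.tight_layout()"
   | ]
   | },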
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "## More fun with subplots!"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "collapsed": true
183 | },
184 | "outputs": [],
185 | "source": [
186 | "import numpy as np\n",
187 | "sin = np.sin(np.linspace(-4, 4, 100))\n",
188 | "fig, axes = plt.subplots(2, 2)\n",
189 | "plt.plot(sin)"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {
196 | "collapsed": true
197 | },
198 | "outputs": [],
199 | "source": [
200 | "fig, axes = plt.subplots(2, 2)\n",
201 | "axes[0, 0].plot(sin)"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "collapsed": true
209 | },
210 | "outputs": [],
211 | "source": [
212 | "asdf = plt.gca()\n",
213 | "asdf.plot(sin, c='k')"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## More on plotting commands and styling"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {
227 | "collapsed": true
228 | },
229 | "outputs": [],
230 | "source": [
231 | "fig, ax = plt.subplots(2, 4, figsize=(10, 5))\n",
232 | "ax[0, 0].plot(sin)\n",
233 | "ax[0, 1].plot(range(100), sin) # same as above\n",
234 | "ax[0, 2].plot(np.linspace(-4, 4, 100), sin)\n",
235 | "ax[0, 3].plot(sin[::10], 'o')\n",
236 | "ax[1, 0].plot(sin, c='r')\n",
237 | "ax[1, 1].plot(sin, '--')\n",
238 | "ax[1, 2].plot(sin, lw=3)\n",
239 | "ax[1, 3].plot(sin[::10], '--o')\n",
240 | "plt.tight_layout() # makes stuff fit - usually works"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "# Exercise\n",
248 | "See how many lines you can put in a plot an still distinguish them (using the styles described above).\n",
249 | "How many can you distinguish if you don't use color?\n",
250 | "See the [lines bars and markers](https://matplotlib.org/gallery.html#lines_bars_and_markers) section of the matplotlib examples for more different styles"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {
257 | "collapsed": true
258 | },
259 | "outputs": [],
260 | "source": [
261 | "# solution"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "metadata": {},
267 | "source": [
268 | "### Scatter vs plot\n",
269 | "Scatter allows modifying individual points, plot only allows modifying them all the same way:"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": [
280 | "x = np.random.uniform(size=50)\n",
281 | "y = x + np.random.normal(0, .1, size=50)\n",
282 | "\n",
283 | "fig, ax = plt.subplots(2, 2, figsize=(5, 5),\n",
284 | " subplot_kw={'xticks': (), 'yticks': ()})\n",
285 | "ax[0, 0].scatter(x, y)\n",
286 | "ax[0, 0].set_title(\"scatter\")\n",
287 | "ax[0, 1].plot(x, y, 'o')\n",
288 | "ax[0, 1].set_title(\"plot\")\n",
289 | "ax[1, 0].scatter(x, y, c=x-y, cmap='bwr', edgecolor='k')\n",
290 | "ax[1, 1].scatter(x, y, c=x-y, s=np.abs(np.random.normal(scale=20, size=50)), cmap='bwr', edgecolor='k')\n",
291 | "plt.tight_layout()"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "# Imshow, interpolation, colormaps\n",
299 | "- three important kinds of color maps: sequential, diverging, qualitative\n",
300 | "- default colormap: viridis\n",
301 | "- default qualitative colormap: tab10"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {
308 | "collapsed": true
309 | },
310 | "outputs": [],
311 | "source": [
312 | "from matplotlib.cbook import get_sample_data\n",
313 | "f = get_sample_data(\"axes_grid/bivariate_normal.npy\", asfileobj=False)\n",
314 | "arr = np.load(f)"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {
321 | "collapsed": true
322 | },
323 | "outputs": [],
324 | "source": [
325 | "fig, ax = plt.subplots(2, 2)\n",
326 | "im1 = ax[0, 0].imshow(arr)\n",
327 | "ax[0, 1].imshow(arr, interpolation='bilinear')\n",
328 | "im3 = ax[1, 0].imshow(arr, cmap='gray')\n",
329 | "im4 = ax[1, 1].imshow(arr, cmap='bwr', vmin=-1.5, vmax=1.5)\n",
330 | "plt.colorbar(im1, ax=ax[0, 0])\n",
331 | "plt.colorbar(im3, ax=ax[1, 0])\n",
332 | "plt.colorbar(im4, ax=ax[1, 1])"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "## The problem of overplotting"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "x1, y1 = 1 / np.random.uniform(-1000, 100, size=(2, 10000))\n",
351 | "x2, y2 = np.dot(np.random.uniform(size=(2, 2)), np.random.normal(size=(2, 1000)))\n",
352 | "x = np.hstack([x1, x2])\n",
353 | "y = np.hstack([y1, y2])"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "metadata": {
360 | "collapsed": true
361 | },
362 | "outputs": [],
363 | "source": [
364 | "plt.figure()\n",
365 | "plt.xlim(-1, 1)\n",
366 | "plt.ylim(-1, 1)\n",
367 | "plt.scatter(x, y)"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {
374 | "collapsed": true
375 | },
376 | "outputs": [],
377 | "source": [
378 | "fig, ax = plt.subplots(1, 3, figsize=(10, 4),\n",
379 | " subplot_kw={'xlim': (-1, 1),\n",
380 | " 'ylim': (-1, 1)})\n",
381 | "ax[0].scatter(x, y)\n",
382 | "ax[1].scatter(x, y, alpha=.1)\n",
383 | "ax[2].scatter(x, y, alpha=.01)"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "metadata": {
390 | "collapsed": true
391 | },
392 | "outputs": [],
393 | "source": [
394 | "plt.figure()\n",
395 | "plt.hexbin(x, y, bins='log', extent=(-1, 1, -1, 1), gridsize=50, linewidths=0)\n",
396 | "plt.colorbar()"
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "# Twinx"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {
410 | "collapsed": true
411 | },
412 | "outputs": [],
413 | "source": [
414 | "df = pd.DataFrame({'Math PhDs awareded (US)': {'2000': 1050,\n",
415 | " '2001': 1010,\n",
416 | " '2002': 919,\n",
417 | " '2003': 993,\n",
418 | " '2004': 1076,\n",
419 | " '2005': 1205,\n",
420 | " '2006': 1325,\n",
421 | " '2007': 1393,\n",
422 | " '2008': 1399,\n",
423 | " '2009': 1554},\n",
424 | " 'Total revenue by arcades (US)': {'2000': 1196000000,\n",
425 | " '2001': 1176000000,\n",
426 | " '2002': 1269000000,\n",
427 | " '2003': 1240000000,\n",
428 | " '2004': 1307000000,\n",
429 | " '2005': 1435000000,\n",
430 | " '2006': 1601000000,\n",
431 | " '2007': 1654000000,\n",
432 | " '2008': 1803000000,\n",
433 | " '2009': 1734000000}})"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": null,
439 | "metadata": {
440 | "collapsed": true
441 | },
442 | "outputs": [],
443 | "source": [
444 | "# could also do df.plot()\n",
445 | "phds = df['Math PhDs awareded (US)']\n",
446 | "revenue = df['Total revenue by arcades (US)']\n",
447 | "years = df.index\n",
448 | "\n",
449 | "plt.figure()\n",
450 | "ax1 = plt.gca()\n",
451 | "line1, = ax1.plot(years, phds)\n",
452 | "line2, = ax1.plot(years, revenue, c='r')\n",
453 | "plt.legend((line1, line2), (\"math PhDs awarded\", \"revenue by arcades\"))"
454 | ]
455 | },
456 | {
457 | "cell_type": "code",
458 | "execution_count": null,
459 | "metadata": {
460 | "collapsed": true
461 | },
462 | "outputs": [],
463 | "source": [
464 | "plt.figure()\n",
465 | "ax1 = plt.gca()\n",
466 | "line1, = ax1.plot(years, phds)\n",
467 | "ax2 = ax1.twinx()\n",
468 | "line2, = ax2.plot(years, revenue, c='r')\n",
469 | "plt.legend((line1, line2), (\"math PhDs awarded\", \"revenue by arcades\"))\n",
470 | "ax1.set_ylabel(\"Math PhDs awarded\")\n",
471 | "ax2.set_ylabel(\"revenue by arcades\")"
472 | ]
473 | },
474 | {
475 | "cell_type": "markdown",
476 | "metadata": {},
477 | "source": [
478 | "# Exercise\n",
479 | "Get another dataset from http://www.tylervigen.com/spurious-correlations and plot it using ``twinx``."
480 | ]
481 | }
482 | ],
483 | "metadata": {
484 | "anaconda-cloud": {},
485 | "kernelspec": {
486 | "display_name": "Python [conda root]",
487 | "language": "python",
488 | "name": "conda-root-py"
489 | },
490 | "language_info": {
491 | "codemirror_mode": {
492 | "name": "ipython",
493 | "version": 3
494 | },
495 | "file_extension": ".py",
496 | "mimetype": "text/x-python",
497 | "name": "python",
498 | "nbconvert_exporter": "python",
499 | "pygments_lexer": "ipython3",
500 | "version": "3.6.1"
501 | }
502 | },
503 | "nbformat": 4,
504 | "nbformat_minor": 2
505 | }
506 |
--------------------------------------------------------------------------------
/03 - Exploration with Pandas, Matplotlib and Seaborn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib notebook\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "#import seaborn as sns\n",
14 | "import pandas as pd\n",
15 | "import numpy as np\n",
16 | "plt.rcParams['figure.dpi'] = 100"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Data Loading"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "# Loading the california housing dataset CSV\n",
31 | "\n",
32 | "```\n",
33 | "We collected information on the variables using all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically co mpact area. Naturally, the geographical area included varies inversely with the population density. W e computed distances among the centroids of each block group as measured in latitude and longitude. W e excluded all the block groups reporting zero entries for the independent and dependent variables. T he final data contained 20,640 observations on 9 variables. The dependent variable is ln(median house value).\n",
34 | "```"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": [
45 | "housing = pd.read_csv(\"data/housing.csv\")"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "housing.head()"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "housing.shape"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "housing.ocean_proximity.value_counts()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "continuous_dependent = housing.columns[:-2]"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "housing[continuous_dependent].hist();"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "housing[continuous_dependent].hist(bins=\"auto\")\n",
102 | "plt.tight_layout()"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "housing.population.sort_values().tail()"
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "metadata": {},
117 | "source": [
118 | "# Exercise\n",
119 | "Come up with a way to visualize the population data which can help us inspect the outliers."
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "collapsed": true
127 | },
128 | "outputs": [],
129 | "source": [
130 | "# your solution here"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {
137 | "collapsed": true
138 | },
139 | "outputs": [],
140 | "source": [
141 | "# my solution\n",
142 | "# . . . "
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "# Exercise\n",
150 | "We can do better!"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "collapsed": true
158 | },
159 | "outputs": [],
160 | "source": [
161 | "# your solution here"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "collapsed": true
169 | },
170 | "outputs": [],
171 | "source": [
172 | "# my solution\n",
173 | "# . . . "
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "# Exercise\n",
181 | "Create a 2 x 3 subplot grid showing housing_median_age, total_rooms, total_bedrooms, population, households and median_income\n",
182 | "each as a hexbin."
183 | ]
184 | },
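   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "One possible solution sketch, assuming each variable is plotted against ``median_house_value``:"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "columns = ['housing_median_age', 'total_rooms', 'total_bedrooms',\n",
   | "           'population', 'households', 'median_income']\n",
   | "h = housing.dropna()  # total_bedrooms contains missing values\n",
   | "fig, axes = plt.subplots(2, 3, figsize=(10, 6))\n",
   | "for col, ax in zip(columns, axes.ravel()):\n",
   | "    ax.hexbin(h[col], h.median_house_value, bins='log', gridsize=30, linewidths=0)\n",
   | "    ax.set_xlabel(col)\n",
   | "plt.tight_layout()"
   | ]
   | },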
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "# Dealing with missing values a little bit"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {},
196 | "outputs": [],
197 | "source": [
198 | "housing.isnull().sum()"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {
205 | "collapsed": true
206 | },
207 | "outputs": [],
208 | "source": [
209 | "housing_nonull = housing.dropna().copy()"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "# scatter matrix / pair plot"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {
223 | "scrolled": false
224 | },
225 | "outputs": [],
226 | "source": [
227 | "pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2]);"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "scrolled": true
235 | },
236 | "outputs": [],
237 | "source": [
238 | "pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2], c=housing.ocean_proximity);\n",
239 | "# error"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {},
246 | "outputs": [],
247 | "source": [
248 | "unique_proximity, ocean_proximity_int = np.unique(housing_nonull.ocean_proximity, return_inverse=True)\n",
249 | "ocean_proximity_int"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "unique_proximity"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "scrolled": false
266 | },
267 | "outputs": [],
268 | "source": [
269 | "pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2], c=ocean_proximity_int);"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "plt.matshow(np.arange(4).reshape(1, 4))\n",
279 | "plt.xticks(range(4), unique_proximity)"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "scrolled": false
287 | },
288 | "outputs": [],
289 | "source": [
290 | "import seaborn.apionly as sns\n",
291 | "sns.pairplot(housing_nonull.iloc[:, 2:], hue='ocean_proximity')"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "# Exercise\n",
299 | "- Confirm the coloring of the scatter matrix makes sense by plotting latitude vs longitude colored by ocean distance.\n",
300 | "- Are the two outliers the same for all the plots?"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {
307 | "collapsed": true
308 | },
309 | "outputs": [],
310 | "source": [
311 | "# solution here"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "## Exploring the target (dependent variable)"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "housing_nonull.plot('median_income', 'median_house_value', kind='scatter', alpha=.1)"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {},
333 | "source": [
334 | "## Exercise\n",
335 | "Do a scatter plot of all the continuous dependent variables against the median house value."
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {
342 | "collapsed": true
343 | },
344 | "outputs": [],
345 | "source": [
346 | "# solution"
347 | ]
348 | }
349 | ],
350 | "metadata": {
351 | "kernelspec": {
352 | "display_name": "Python [conda root]",
353 | "language": "python",
354 | "name": "conda-root-py"
355 | },
356 | "language_info": {
357 | "codemirror_mode": {
358 | "name": "ipython",
359 | "version": 3
360 | },
361 | "file_extension": ".py",
362 | "mimetype": "text/x-python",
363 | "name": "python",
364 | "nbconvert_exporter": "python",
365 | "pygments_lexer": "ipython3",
366 | "version": "3.6.1"
367 | }
368 | },
369 | "nbformat": 4,
370 | "nbformat_minor": 2
371 | }
372 |
--------------------------------------------------------------------------------
/04 - Machine Learning with Scikit-learn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Basics of Supervised Learning\n",
8 | "- Learn from (input, output) pairs\n",
9 | "- Generalize to new input, predict unknown output\n",
10 | "\n",
11 | ""
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "import matplotlib.pyplot as plt\n",
23 | "import numpy as np\n",
24 | "%matplotlib notebook"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": null,
30 | "metadata": {
31 | "collapsed": true
32 | },
33 | "outputs": [],
34 | "source": [
35 | "from sklearn.datasets import load_digits\n",
36 | "import numpy as np\n",
37 | "digits = load_digits()\n",
38 | "digits.keys()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "digits.data.shape"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "collapsed": true
57 | },
58 | "outputs": [],
59 | "source": [
60 | "digits.target.shape"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "metadata": {
67 | "collapsed": true
68 | },
69 | "outputs": [],
70 | "source": [
71 | "digits.target"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": [
82 | "np.bincount(digits.target)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": true
90 | },
91 | "outputs": [],
92 | "source": [
93 | "plt.matshow(digits.data[0].reshape(8, 8), cmap=plt.cm.Greys)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": true
101 | },
102 | "outputs": [],
103 | "source": [
104 | "digits.target[0]"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "fig, axes = plt.subplots(4, 4)\n",
116 | "for x, y, ax in zip(digits.data, digits.target, axes.ravel()):\n",
117 | " ax.set_title(y)\n",
118 | " ax.imshow(x.reshape(8, 8), cmap=\"gray_r\")\n",
119 | " ax.set_xticks(())\n",
120 | " ax.set_yticks(())\n",
121 | "plt.tight_layout()"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | ""
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {
135 | "collapsed": true
136 | },
137 | "outputs": [],
138 | "source": [
139 | "from sklearn.model_selection import train_test_split\n",
140 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n",
141 | " digits.target)"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "Really Simple API\n",
149 | "-------------------\n",
150 | "0) Import your model class"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "collapsed": true
158 | },
159 | "outputs": [],
160 | "source": [
161 | "from sklearn.svm import LinearSVC"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "1) Instantiate an object and set the parameters"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "svm = LinearSVC()"
180 | ]
181 | },
182 | {
183 | "cell_type": "markdown",
184 | "metadata": {},
185 | "source": [
186 | "2) Fit the model"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {
193 | "collapsed": true
194 | },
195 | "outputs": [],
196 | "source": [
197 | "svm.fit(X_train, y_train)"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "3) Apply / evaluate"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": [
215 | "print(svm.predict(X_train))\n",
216 | "print(y_train)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {
223 | "collapsed": true
224 | },
225 | "outputs": [],
226 | "source": [
227 | "svm.score(X_train, y_train)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "collapsed": true
235 | },
236 | "outputs": [],
237 | "source": [
238 | "svm.score(X_test, y_test)"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "And again\n",
246 | "---------"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {
253 | "collapsed": true
254 | },
255 | "outputs": [],
256 | "source": [
257 | "from sklearn.ensemble import RandomForestClassifier"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {
264 | "collapsed": true
265 | },
266 | "outputs": [],
267 | "source": [
268 | "rf = RandomForestClassifier(n_estimators=50)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "collapsed": true
276 | },
277 | "outputs": [],
278 | "source": [
279 | "rf.fit(X_train, y_train)"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "collapsed": true
287 | },
288 | "outputs": [],
289 | "source": [
290 | "rf.score(X_test, y_test)"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "# Exercise\n",
298 | "Load the iris dataset from seaborn using\n",
299 | "\n",
300 | "```python\n",
301 | "iris = sns.load_dataset(\"iris\")\n",
302 | "```\n",
303 | "Visualize the dataset. Extract the features (independent variables) and the target (dependent variable).\n",
304 | "Split it into training and test set using ``train_test_split``.\n",
305 | "\n",
306 | "\n",
307 | "Then train an evaluate a classifier of your choice. Try ``sklearn.neighbors.KNeighborsClassifier`` or ``sklearn.ensemble.RandomForestClassifier`` for example.\n"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "collapsed": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "# your solution"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "# Dummy encoding of categorical variables"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {
332 | "collapsed": true
333 | },
334 | "outputs": [],
335 | "source": [
336 | "import pandas as pd\n",
337 | "df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],\n",
338 | " 'boro': ['Manhatten', 'Queens', 'Manhatten', 'Brooklyn', 'Brooklyn', 'Bronx']})\n",
339 | "df"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "pd.get_dummies(df)"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {
357 | "collapsed": true
358 | },
359 | "outputs": [],
360 | "source": [
361 | "df = pd.DataFrame({'salary': [103, 89, 142, 54, 63, 219],\n",
362 | " 'boro': [0, 1,0, 2, 2, 3]})\n",
363 | "df"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {
370 | "collapsed": true
371 | },
372 | "outputs": [],
373 | "source": [
374 | "pd.get_dummies(df)"
375 | ]
376 | },
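   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "Note that ``pd.get_dummies`` only encodes columns of ``object`` (or ``category``) dtype by default, so the integer-coded ``boro`` above was left unchanged. You can force the encoding with the ``columns`` argument:"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "pd.get_dummies(df, columns=['boro'])"
   | ]
   | },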
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "# Exercise\n",
382 | "Load the california housing data from data/housing.csv and apply dummy encoding."
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {
389 | "collapsed": true
390 | },
391 | "outputs": [],
392 | "source": [
393 | "# solution"
394 | ]
395 | },
396 | {
397 | "cell_type": "markdown",
398 | "metadata": {},
399 | "source": [
400 | "# Scaling data"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {
407 | "collapsed": true
408 | },
409 | "outputs": [],
410 | "source": [
411 | "import seaborn.apionly as sns\n",
412 | "iris = sns.load_dataset(\"iris\")\n",
413 | "iris.head()"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": null,
419 | "metadata": {
420 | "collapsed": true
421 | },
422 | "outputs": [],
423 | "source": [
424 | "X = iris.iloc[:, :-1] # could do iris.pop(\"species\") but that is changing \"iris\"\n",
425 | "y = iris.species\n",
426 | "X.shape"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {
433 | "collapsed": true
434 | },
435 | "outputs": [],
436 | "source": [
437 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {
444 | "collapsed": true
445 | },
446 | "outputs": [],
447 | "source": [
448 | "from sklearn.preprocessing import StandardScaler\n",
449 | "scaler = StandardScaler()\n",
450 | "scaler.fit(X_train)\n",
451 | "X_train_scaled = scaler.transform(X_train)"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": null,
457 | "metadata": {
458 | "collapsed": true
459 | },
460 | "outputs": [],
461 | "source": [
462 | "X_train_scaled[:10]"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": null,
468 | "metadata": {
469 | "collapsed": true
470 | },
471 | "outputs": [],
472 | "source": [
473 | "X_test_scaled = scaler.transform(X_test)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {
480 | "collapsed": true
481 | },
482 | "outputs": [],
483 | "source": [
484 | "svm = LinearSVC()\n",
485 | "svm.fit(X_train_scaled, y_train)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": null,
491 | "metadata": {
492 | "collapsed": true
493 | },
494 | "outputs": [],
495 | "source": [
496 | "svm.predict(X_test_scaled)"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": null,
502 | "metadata": {
503 | "collapsed": true
504 | },
505 | "outputs": [],
506 | "source": [
507 | "svm.score(X_test_scaled, y_test)"
508 | ]
509 | },
510 | {
511 | "cell_type": "markdown",
512 | "metadata": {},
513 | "source": [
514 | "# Exercise\n",
515 | "- load the california housing data and drop columns with missing values\n",
516 | "- Separate features and target in the california housing dataset (with dummy encoding)\n",
517 | "- use train_test_split to split it into training and test data\n",
518 | "- use the StandardScaler to scale training and test data\n",
519 | "- Fit the sklearn.linear_modle.Ridge model (ridge regression, a linear regression model) and evaluate it on the test data.\n",
520 | "\n",
521 | "Note: the score method computes the R^2 for regression problems"
522 | ]
523 | },
524 | {
525 | "cell_type": "code",
526 | "execution_count": null,
527 | "metadata": {
528 | "collapsed": true
529 | },
530 | "outputs": [],
531 | "source": [
532 | "# solution here"
533 | ]
534 | },
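   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "One possible solution sketch (the column with missing values is ``total_bedrooms``; the variable names are our own choice):"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "from sklearn.linear_model import Ridge\n",
   | "\n",
   | "housing = pd.read_csv(\"data/housing.csv\").dropna(axis=1)  # drops total_bedrooms\n",
   | "housing_dummies = pd.get_dummies(housing)\n",
   | "X = housing_dummies.drop(\"median_house_value\", axis=1)\n",
   | "y = housing_dummies.median_house_value\n",
   | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
   | "scaler = StandardScaler().fit(X_train)\n",
   | "ridge = Ridge().fit(scaler.transform(X_train), y_train)\n",
   | "ridge.score(scaler.transform(X_test), y_test)"
   | ]
   | },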
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "metadata": {
539 | "collapsed": true
540 | },
541 | "outputs": [],
542 | "source": [
543 | "# Inspecting the ridge model"
544 | ]
545 | },
546 | {
547 | "cell_type": "code",
548 | "execution_count": null,
549 | "metadata": {
550 | "collapsed": true
551 | },
552 | "outputs": [],
553 | "source": [
554 | "X_train.columns"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": null,
560 | "metadata": {
561 | "collapsed": true
562 | },
563 | "outputs": [],
564 | "source": [
565 | "ridge.coef_"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": null,
571 | "metadata": {
572 | "collapsed": true
573 | },
574 | "outputs": [],
575 | "source": [
576 | "plt.figure()\n",
577 | "plt.barh(range(len(ridge.coef_)), ridge.coef_)\n",
578 | "plt.yticks(range(len(ridge.coef_)), X_train.columns);"
579 | ]
580 | }
581 | ],
582 | "metadata": {
583 | "anaconda-cloud": {},
584 | "kernelspec": {
585 | "display_name": "Python [default]",
586 | "language": "python",
587 | "name": "python3"
588 | },
589 | "language_info": {
590 | "codemirror_mode": {
591 | "name": "ipython",
592 | "version": 3
593 | },
594 | "file_extension": ".py",
595 | "mimetype": "text/x-python",
596 | "name": "python",
597 | "nbconvert_exporter": "python",
598 | "pygments_lexer": "ipython3",
599 | "version": "3.6.1"
600 | }
601 | },
602 | "nbformat": 4,
603 | "nbformat_minor": 1
604 | }
605 |
--------------------------------------------------------------------------------
/05 - More data - the adult dataset.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib notebook\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "#import seaborn as sns\n",
14 | "import pandas as pd\n",
15 | "import numpy as np\n",
16 | "plt.rcParams['figure.dpi'] = 100"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "# Using the adult dataset"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "collapsed": true
31 | },
32 | "outputs": [],
33 | "source": [
34 | "adult = pd.read_csv(\"data/adult.csv\", index_col=0)"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "adult.head()"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "adult.income.value_counts()"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "adult.income.value_counts().plot(kind=\"barh\")"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "adult.education.value_counts()"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "adult.groupby(\"income\")"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "adult.groupby(\"income\")['education'].value_counts()"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "collapsed": true
96 | },
97 | "outputs": [],
98 | "source": [
99 | "education_counts = adult.groupby(\"income\")['education'].value_counts()"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {},
106 | "outputs": [],
107 | "source": [
108 | "education_counts.unstack(\"income\")"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "unstacked_education = education_counts.unstack(\"income\")\n",
118 | "unstacked_education.plot(kind=\"barh\")"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "(unstacked_education / unstacked_education.sum(axis=0)).plot(kind=\"barh\")"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "unstacked_education.columns"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "plt.figure()\n",
146 | "(unstacked_education[\" >50K\"] / unstacked_education.sum(axis=1)).plot(kind=\"barh\")"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "# Exercise\n",
154 | "Group the data by gender, and compare the income distributions over genders.\n",
155 | "Do a similar plot for some of the other variables."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "collapsed": true
163 | },
164 | "outputs": [],
165 | "source": [
166 | "# solution"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "collapsed": true
173 | },
174 | "source": [
175 | "# Exercise\n",
176 | "Apply the basic machine learning workflow to the dataset.\n",
177 | "For simplicity you might want to drop the \"native-country\" column.\n",
178 | "Proceed as follows:\n",
179 | "- separate features and target\n",
180 | "- do dummy encoding of the categorical features\n",
181 | "- split data in training and test set\n",
182 | "- scale the data\n",
183 | "- apply a machine learning model. Start with ``sklearn.linear_model.LogisticRegression``, a linear classifier.\n",
184 | "- visualize the coefficients in a bar-plot (if there are too many, only show the ones of larges magnitude)"
185 | ]
186 | },
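   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "One possible solution sketch for the workflow above (the empty cell below is left for your own attempt):"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "from sklearn.linear_model import LogisticRegression\n",
   | "from sklearn.model_selection import train_test_split\n",
   | "from sklearn.preprocessing import StandardScaler\n",
   | "\n",
   | "X = pd.get_dummies(adult.drop([\"income\", \"native-country\"], axis=1))\n",
   | "y = adult.income\n",
   | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n",
   | "scaler = StandardScaler().fit(X_train)\n",
   | "logreg = LogisticRegression().fit(scaler.transform(X_train), y_train)\n",
   | "print(logreg.score(scaler.transform(X_test), y_test))\n",
   | "\n",
   | "# show the ten coefficients of largest absolute value\n",
   | "coef = pd.Series(logreg.coef_.ravel(), index=X.columns)\n",
   | "plt.figure()\n",
   | "coef[coef.abs().sort_values().index[-10:]].plot(kind=\"barh\")"
   | ]
   | },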
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "collapsed": true
192 | },
193 | "outputs": [],
194 | "source": []
195 | }
196 | ],
197 | "metadata": {
198 | "kernelspec": {
199 | "display_name": "Python [conda root]",
200 | "language": "python",
201 | "name": "conda-root-py"
202 | },
203 | "language_info": {
204 | "codemirror_mode": {
205 | "name": "ipython",
206 | "version": 3
207 | },
208 | "file_extension": ".py",
209 | "mimetype": "text/x-python",
210 | "name": "python",
211 | "nbconvert_exporter": "python",
212 | "pygments_lexer": "ipython3",
213 | "version": "3.6.1"
214 | }
215 | },
216 | "nbformat": 4,
217 | "nbformat_minor": 2
218 | }
219 |
--------------------------------------------------------------------------------
/06 - Closing comments.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Version control and nbdime\n",
8 | "- Do version control\n",
9 | "- JSON is not fun to version control\n",
10 | "- nbdime eases the pain"
11 | ]
12 | },
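   | {
   | "cell_type": "markdown",
   | "metadata": {},
   | "source": [
   | "A minimal sketch of the nbdime workflow (``old.ipynb`` and ``new.ipynb`` are placeholder names):"
   | ]
   | },
   | {
   | "cell_type": "code",
   | "execution_count": null,
   | "metadata": {
   | "collapsed": true
   | },
   | "outputs": [],
   | "source": [
   | "# nbdiff old.ipynb new.ipynb      -> content-aware diff in the terminal\n",
   | "# nbdiff-web old.ipynb new.ipynb  -> rich diff in the browser\n",
   | "# one-time setup so git uses nbdime for .ipynb files:\n",
   | "!nbdime config-git --enable"
   | ]
   | },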
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "# Run all etc\n",
18 | "- make sure your notebooks are reproducible\n",
19 | "- avoid cells that modify state\n",
20 | "- use \"restart kernel and run all\" to check if your code actually works"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "# Handling multiple kernels\n",
28 | "- Check out the docs at https://ipython.readthedocs.io/en/latest/install/kernel_install.html"
29 | ]
30 | }
31 | ],
32 | "metadata": {
33 | "kernelspec": {
34 | "display_name": "Python [conda root]",
35 | "language": "python",
36 | "name": "conda-root-py"
37 | },
38 | "language_info": {
39 | "codemirror_mode": {
40 | "name": "ipython",
41 | "version": 3
42 | },
43 | "file_extension": ".py",
44 | "mimetype": "text/x-python",
45 | "name": "python",
46 | "nbconvert_exporter": "python",
47 | "pygments_lexer": "ipython3",
48 | "version": "3.6.1"
49 | }
50 | },
51 | "nbformat": 4,
52 | "nbformat_minor": 2
53 | }
54 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Andreas Mueller
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Data analysis and machine learning in Jupyter
2 | =============================================
3 |
4 |
5 | Instructor
6 | -----------
7 |
8 | - [Andreas Mueller](http://amuller.github.io) [@amuellerml](https://twitter.com/amuellerml) - Columbia University; [Introduction to Machine Learning with Python](http://shop.oreilly.com/product/0636920030515.do)
9 |
10 |
11 | Obtaining the Tutorial Material
12 | ------------------
13 |
14 |
15 | If you have a GitHub account, it is probably most convenient if you clone or
16 | fork the GitHub repository. You can clone the repository by running:
17 |
18 | ```bash
19 | git clone https://github.com/amueller/jupytercon2017.git
20 |
21 | ```
22 |
23 | If you are not familiar with git or don’t have a
24 | GitHub account, you can download the repository as a .zip file by heading over
25 | to the GitHub repository (https://github.com/amueller/jupytercon2017) in
26 | your browser and clicking the green “Download” button in the upper right.
27 |
28 | 
29 |
30 | Please note that we may add and improve the material until shortly before the
31 | tutorial session, and we recommend you update your copy of the materials one
32 | day before the tutorial. If you have a GitHub account and cloned the
33 | repository via GitHub, you can sync your existing local repository with:
34 |
35 | ```bash
36 | git pull origin master
37 | ```
38 |
39 | If you don’t have a GitHub account, you may have to re-download the .zip
40 | archive from GitHub.
41 |
42 |
43 | Installation Notes
44 | ------------------
45 |
46 | This tutorial will require recent installations of
47 |
48 | - [NumPy](http://www.numpy.org)
49 | - [SciPy](http://www.scipy.org)
50 | - [matplotlib](http://matplotlib.org)
51 | - [pandas](http://pandas.pydata.org)
52 | - [pillow](https://python-pillow.org)
53 | - [scikit-learn](http://scikit-learn.org/stable/)
54 | - [seaborn](https://seaborn.pydata.org/)
55 | - [IPython](http://ipython.readthedocs.org/en/stable/)
56 | - [Jupyter Notebook](http://jupyter.org)
57 |
58 |
59 | The last one is important; you should be able to type:
60 |
61 | jupyter notebook
62 |
63 | in your terminal window and see the notebook panel load in your web browser.
64 | Try opening and running a notebook from the material to check that it works.
65 |
66 | For users who do not yet have these packages installed, a relatively
67 | painless way to install all the requirements is to use a Python distribution
68 | such as [Anaconda CE](http://store.continuum.io/ "Anaconda CE"), which includes
69 | the most relevant Python packages for science, math, engineering, and
70 | data analysis; Anaconda can be downloaded and installed for free,
71 | including for commercial use and redistribution.
72 | The code examples in this tutorial should be compatible with Python 2.7 and
73 | Python 3.4-3.6.
74 |
75 | After obtaining the material, we **strongly recommend** that you open and execute
76 | the Jupyter notebook `check_env.ipynb` that is located at the
77 | top level of this repository. You can open the notebook
78 | by executing
79 |
80 | ```bash
81 | jupyter notebook check_env.ipynb
82 | ```
83 |
84 | inside this repository. Inside the notebook, you can run the code cell by
85 | clicking on the "Run Cells" button as illustrated in the figure below:
86 |
87 | 
88 |
89 |
90 | Finally, if your environment satisfies the requirements for the tutorials, the
91 | executed code cell will produce an output message as shown below:
92 |
93 | 
94 |
95 | Although not required, we also recommend updating the required Python
96 | packages to their latest versions to ensure the best compatibility with the
97 | teaching material. Please upgrade already installed packages by executing
98 |
99 | - `pip install [package-name] --upgrade`
100 | - or `conda update [package-name]`
101 |
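For example, to upgrade scikit-learn (the same pattern applies to each of the
packages listed above):

```bash
pip install scikit-learn --upgrade
# or, in a conda environment:
conda update scikit-learn
```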
102 |
103 |
104 |
105 | Outline
106 | =======
107 | t.b.a.
108 |
109 |
--------------------------------------------------------------------------------
/bonus - Cross-validation and Grid Search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Cross-validation"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | ""
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "%matplotlib notebook\n",
26 | "import matplotlib.pyplot as plt\n",
27 | "plt.rcParams[\"figure.dpi\"] = 200\n",
28 | "import numpy as np"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "collapsed": true
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_digits\n",
40 | "from sklearn.model_selection import train_test_split\n",
41 | "digits = load_digits()\n",
42 | "X_train, X_test, y_train, y_test = train_test_split(\n",
43 | " digits.data, digits.target)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "collapsed": true
51 | },
52 | "outputs": [],
53 | "source": [
54 | "from sklearn.model_selection import cross_val_score\n",
55 | "from sklearn.neighbors import KNeighborsClassifier"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "cross_val_score(KNeighborsClassifier(),\n",
65 | " X_train, y_train, cv=5)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {
72 | "collapsed": true
73 | },
74 | "outputs": [],
75 | "source": [
76 | "from sklearn.model_selection import KFold, StratifiedKFold"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "cross_val_score(KNeighborsClassifier(),\n",
86 | " X_train, y_train, cv=KFold(n_splits=10, shuffle=True, random_state=42))"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Grid Searches\n",
94 | "================="
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | ""
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "Grid-Search with build-in cross validation"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "from sklearn.model_selection import GridSearchCV\n",
118 | "from sklearn.svm import SVC"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "Define parameter grid:"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "import numpy as np\n",
135 | "\n",
136 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n",
137 | " 'gamma' : 10. ** np.arange(-5, 0)}\n",
138 | "\n",
139 | "np.set_printoptions(suppress=True)\n",
140 | "print(param_grid)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3, cv=5)"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "A GridSearchCV object behaves just like a normal classifier."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "scrolled": true
164 | },
165 | "outputs": [],
166 | "source": [
167 | "grid_search.fit(X_train, y_train)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "scrolled": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "grid_search.predict(X_test)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "grid_search.score(X_test, y_test)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {},
194 | "outputs": [],
195 | "source": [
196 | "grid_search.best_params_"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": null,
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "grid_search.best_score_"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [],
213 | "source": [
214 | "grid_search.best_estimator_"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {},
221 | "outputs": [],
222 | "source": [
223 | "# We extract just the scores\n",
224 | "\n",
225 | "scores = grid_search.cv_results_['mean_test_score']\n",
226 | "scores = np.array(scores).reshape(6, 5)\n",
227 | "\n",
228 | "plt.matshow(scores)\n",
229 | "plt.xlabel('gamma')\n",
230 | "plt.ylabel('C')\n",
231 | "plt.colorbar()\n",
232 | "plt.xticks(np.arange(5), param_grid['gamma'])\n",
233 | "plt.yticks(np.arange(6), param_grid['C']);"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "# Exercises\n",
241 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier."
242 | ]
243 | }
244 | ],
245 | "metadata": {
246 | "anaconda-cloud": {},
247 | "kernelspec": {
248 | "display_name": "Python [default]",
249 | "language": "python",
250 | "name": "python3"
251 | },
252 | "language_info": {
253 | "codemirror_mode": {
254 | "name": "ipython",
255 | "version": 3
256 | },
257 | "file_extension": ".py",
258 | "mimetype": "text/x-python",
259 | "name": "python",
260 | "nbconvert_exporter": "python",
261 | "pygments_lexer": "ipython3",
262 | "version": "3.6.1"
263 | }
264 | },
265 | "nbformat": 4,
266 | "nbformat_minor": 1
267 | }
268 |
--------------------------------------------------------------------------------
/bonus - Trees.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import numpy as np\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "% matplotlib inline\n",
14 | "plt.rcParams[\"figure.dpi\"] = 200\n",
15 | "np.set_printoptions(precision=3)\n",
16 | "import pandas as pd\n",
17 | "from sklearn.model_selection import train_test_split\n",
18 | "from sklearn.pipeline import make_pipeline\n",
19 | "from sklearn.preprocessing import scale, StandardScaler"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "collapsed": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "from sklearn.datasets import load_breast_cancer\n",
31 | "cancer = load_breast_cancer()"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "scrolled": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "print(cancer.DESCR)"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "X_train, X_test, y_train, y_test = train_test_split(\n",
54 | " cancer.data, cancer.target, stratify=cancer.target, random_state=0)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "source": [
63 | "# tree visualization"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "from sklearn.tree import DecisionTreeClassifier\n",
73 | "tree = DecisionTreeClassifier(max_depth=2)\n",
74 | "tree.fit(X_train, y_train)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "from sklearn.tree import export_graphviz\n",
84 | "tree_dot = export_graphviz(tree, out_file=None, feature_names=cancer.feature_names, filled=True)\n",
85 | "print(tree_dot)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": true
93 | },
94 | "outputs": [],
95 | "source": [
96 | "# import graphviz\n",
97 | "# graphviz.Source(tree_dot)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from tree_plotting import plot_tree\n",
107 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "# Parameter Tuning"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "plt.figure(figsize=(20, 5))\n",
124 | "tree = DecisionTreeClassifier().fit(X_train, y_train)\n",
125 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)\n"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "tree = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)\n",
135 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_train, y_train)\n",
145 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "tree = DecisionTreeClassifier(min_samples_split=50).fit(X_train, y_train)\n",
155 | "plot_tree(tree, feature_names=cancer.feature_names, filled=True)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "from sklearn.model_selection import GridSearchCV\n",
165 | "param_grid = {'max_depth':range(1, 7)}\n",
166 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=10)\n",
167 | "grid.fit(X_train, y_train)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "scrolled": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit\n",
179 | "param_grid = {'max_depth':range(1, 7)}\n",
180 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,\n",
181 | " cv=StratifiedShuffleSplit(100))\n",
182 | "grid.fit(X_train, y_train)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "scores = pd.DataFrame(grid.cv_results_)\n",
192 | "scores.plot(x='param_max_depth', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n",
193 | "plt.legend(loc=(1, 0))"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "from sklearn.model_selection import GridSearchCV\n",
203 | "param_grid = {'max_leaf_nodes':range(2, 20)}\n",
204 | "grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=StratifiedShuffleSplit(100, random_state=1))\n",
205 | "grid.fit(X_train, y_train)\n",
206 | "\n",
207 | "scores = pd.DataFrame(grid.cv_results_)\n",
208 | "scores.plot(x='param_max_leaf_nodes', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())\n",
209 | "plt.legend(loc=(1, 0))"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "scores = pd.DataFrame(grid.cv_results_)\n",
219 | "scores.plot(x='param_max_leaf_nodes', y='mean_train_score', yerr='std_train_score', ax=plt.gca())\n",
220 | "scores.plot(x='param_max_leaf_nodes', y='mean_test_score', yerr='std_test_score', ax=plt.gca())"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "grid.best_params_"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {},
236 | "outputs": [],
237 | "source": [
238 | "plot_tree(grid.best_estimator_, feature_names=cancer.feature_names, filled=True)\n"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {
245 | "collapsed": true
246 | },
247 | "outputs": [],
248 | "source": [
249 | "pd.Series(grid.best_estimator_.feature_importances_,\n",
250 | " index=cancer.feature_names).plot(kind=\"barh\")"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "# Exercise\n",
258 | "Apply a decision tree to the \"adult\" dataset and visualize it.\n",
259 | "\n",
260 | "Tune parameters with grid-search; use max_features, try at least max_leaf_nodes and max_depth, but separately.\n",
261 | "\n",
262 | "Visualize the resulting tree and it's feature importances."
263 | ]
264 | }
265 | ],
266 | "metadata": {
267 | "anaconda-cloud": {},
268 | "kernelspec": {
269 | "display_name": "Python [conda root]",
270 | "language": "python",
271 | "name": "conda-root-py"
272 | },
273 | "language_info": {
274 | "codemirror_mode": {
275 | "name": "ipython",
276 | "version": 3
277 | },
278 | "file_extension": ".py",
279 | "mimetype": "text/x-python",
280 | "name": "python",
281 | "nbconvert_exporter": "python",
282 | "pygments_lexer": "ipython3",
283 | "version": "3.6.1"
284 | }
285 | },
286 | "nbformat": 4,
287 | "nbformat_minor": 2
288 | }
289 |
--------------------------------------------------------------------------------
/check_env.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from __future__ import print_function\n",
12 | "from distutils.version import LooseVersion as Version\n",
13 | "import sys\n",
14 | "\n",
15 | "\n",
16 | "\n",
17 | "OK = '\\x1b[42m[ OK ]\\x1b[0m'\n",
18 | "FAIL = \"\\x1b[41m[FAIL]\\x1b[0m\"\n",
19 | "\n",
20 | "try:\n",
21 | " import importlib\n",
22 | "except ImportError:\n",
23 | " print(FAIL, \"Python version 3.4 (or 2.7) is required,\"\n",
24 | " \" but %s is installed.\" % sys.version)\n",
25 | "\n",
26 | " \n",
27 | "def import_version(pkg, min_ver, fail_msg=\"\"):\n",
28 | " mod = None\n",
29 | " try:\n",
30 | " mod = importlib.import_module(pkg)\n",
31 | " if pkg in {'PIL'}:\n",
32 | " ver = mod.VERSION\n",
33 | " else:\n",
34 | " ver = mod.__version__\n",
35 | " if Version(ver) < min_ver:\n",
36 | " print(FAIL, \"%s version %s or higher required, but %s installed.\"\n",
37 | " % (lib, min_ver, ver))\n",
38 | " else:\n",
39 | " print(OK, '%s version %s' % (pkg, ver))\n",
40 | " except ImportError:\n",
41 | " print(FAIL, '%s not installed. %s' % (pkg, fail_msg))\n",
42 | " return mod\n",
43 | "\n",
44 | "\n",
45 | "# first check the python version\n",
46 | "print('Using python in', sys.prefix)\n",
47 | "print(sys.version)\n",
48 | "pyversion = Version(sys.version)\n",
49 | "if pyversion >= \"3\":\n",
50 | " if pyversion < \"3.4\":\n",
51 | " print(FAIL, \"Python version 3.4 (or 2.7) is required,\"\n",
52 | " \" but %s is installed.\" % sys.version)\n",
53 | "elif pyversion >= \"2\":\n",
54 | " if pyversion < \"2.7\":\n",
55 | " print(FAIL, \"Python version 2.7 is required,\"\n",
56 | " \" but %s is installed.\" % sys.version)\n",
57 | "else:\n",
58 | " print(FAIL, \"Unknown Python version: %s\" % sys.version)\n",
59 | "\n",
60 | "print()\n",
61 | "requirements = {'numpy': \"1.6.1\", 'scipy': \"0.9\", 'matplotlib': \"1.0\",\n",
62 | " 'IPython': \"3.0\", 'sklearn': \"0.19\", 'pandas': \"0.18\",\n",
63 | " 'seaborn': \"0.5\", 'PIL': \"1.1.7\"}\n",
64 | "\n",
65 | "# now the dependencies\n",
66 | "for lib, required_version in list(requirements.items()):\n",
67 | " import_version(lib, required_version)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "anaconda-cloud": {},
82 | "kernelspec": {
83 | "display_name": "Python [default]",
84 | "language": "python",
85 | "name": "python3"
86 | },
87 | "language_info": {
88 | "codemirror_mode": {
89 | "name": "ipython",
90 | "version": 3
91 | },
92 | "file_extension": ".py",
93 | "mimetype": "text/x-python",
94 | "name": "python",
95 | "nbconvert_exporter": "python",
96 | "pygments_lexer": "ipython3",
97 | "version": "3.6.1"
98 | }
99 | },
100 | "nbformat": 4,
101 | "nbformat_minor": 1
102 | }
103 |
--------------------------------------------------------------------------------
/images/check_env-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/check_env-1.png
--------------------------------------------------------------------------------
/images/check_env-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/check_env-2.png
--------------------------------------------------------------------------------
/images/cross_validation.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/images/download-repo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/download-repo.png
--------------------------------------------------------------------------------
/images/supervised_workflow.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/images/tab-help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/jupytercon2017/923b29822fab2a91f0b97d5ba4bd4a56b6e88880/images/tab-help.png
--------------------------------------------------------------------------------
/solutions/solutions.py:
--------------------------------------------------------------------------------
1 | ######### 02
2 | fig, axes = plt.subplots(1, 4)
3 | line = np.linspace(-3, 3, 20)
4 | for i, ax in enumerate(axes):
5 | ax.plot(line ** (i + 1))
6 |
7 | ######### 03
8 | housing.plot(x='latitude', y='longitude', kind='scatter', c='population')
9 |
10 | housing.plot(x='latitude', y='longitude', kind='scatter', c='population', alpha=.5, cmap='viridis')
11 |
12 |
13 | # could also have used kind='hexbin', but that form doesn't show us a helpful docstring:
14 | # housing.plot(x='latitude', y='longitude', kind='hexbin', C='population', cmap='viridis')
15 | housing.plot.hexbin(x='latitude', y='longitude', C='population', cmap='viridis', linewidth=0)
16 |
17 | housing.plot.hexbin(x='latitude', y='longitude', C='population', cmap='viridis', reduce_C_function=sum, linewidth=0)
18 |
19 |
20 |
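# plot each remaining column on the latitude/longitude grid; sum the raw
# counts, but average the per-district statistics (median age, median income):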
21 | fig, axes = plt.subplots(2, 3, subplot_kw={'xticks': (), 'yticks': ()})
22 | for column, ax in zip(housing.columns[2:-2], axes.ravel()):
23 | if column in ['housing_median_age', 'median_income']:
24 | reduce = np.mean
25 | else:
26 | reduce = np.sum
27 | housing.plot.hexbin(x='latitude', y='longitude', C=column, cmap='viridis', reduce_C_function=reduce, linewidth=0, ax=ax)
28 | ax.set_title(column)
29 | plt.tight_layout()
30 |
31 |
32 | housing.plot.hexbin(x='latitude', y='longitude', C='median_house_value', cmap='viridis', linewidth=0)
33 |
34 |
35 | # two outliers:
36 | print((housing_nonull.population > 20000).sum())
37 | pd.plotting.scatter_matrix(housing_nonull.iloc[:, 2:-2], c=housing_nonull.population > 20000, cmap='tab10');
38 | plt.figure()
39 | plt.scatter(housing_nonull.latitude, housing_nonull.longitude, c=plt.cm.tab10((housing_nonull.population > 20000).astype(int)), s=3)
40 |
41 | # vs dependent variable:
42 | fig, axes = plt.subplots(4, 2)
43 | for ax, column in zip(axes.ravel(), continuous_dependent):
44 | ax.scatter(housing_nonull[column], housing_nonull['median_house_value'], alpha=.01)
45 | ax.set_title(column)
46 | plt.tight_layout()
47 |
48 | # vs dependent variable with seaborn
49 |
50 | sns.pairplot(housing_nonull, x_vars=continuous_dependent, y_vars=["median_house_value"],
51 | kind="scatter", plot_kws={'alpha': .01, 'edgecolor': None});
52 | # we'll see a nice way in the next notebook
53 |
54 | ######### 04
55 | # for the housing data
56 | housing = pd.read_csv("data/housing.csv")
57 |
58 | housing.head()
59 |
60 | housing_dummies = pd.get_dummies(housing)
61 | housing_dummies.head()
62 |
63 | # ridge regression on housing
64 | from sklearn.linear_model import Ridge
65 |
66 | housing = pd.read_csv("data/housing.csv")
67 | housing = housing.dropna(axis=0)
68 | housing_dummies = pd.get_dummies(housing)
69 | housing_dummies.head()
70 | y = housing_dummies.pop("median_house_value")
71 | X = housing_dummies
72 | print(X.head())
73 |
74 | X_train, X_test, y_train, y_test = train_test_split(X, y)
75 | scaler = StandardScaler().fit(X_train)
76 | X_train_scaled = scaler.transform(X_train)
77 | X_test_scaled = scaler.transform(X_test)
78 | ridge = Ridge()
79 |
80 | ridge.fit(X_train_scaled, y_train)
81 | ridge.score(X_test_scaled, y_test)
82 |
--------------------------------------------------------------------------------
/tree_plotting.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numbers import Integral
3 |
4 | from sklearn.externals import six
5 | from sklearn.tree.export import _color_brew, _criterion, _tree
6 |
7 |
8 | def plot_tree(decision_tree, max_depth=None, feature_names=None,
9 | class_names=None, label='all', filled=False,
10 | leaves_parallel=False, impurity=True, node_ids=False,
11 | proportion=False, rotate=False, rounded=False,
12 | special_characters=False, precision=3, ax=None, fontsize=None):
13 | """Plot a decision tree.
14 |
15 | The sample counts that are shown are weighted with any sample_weights that
16 | might be present.
17 |
18 | Parameters
19 | ----------
20 | decision_tree : decision tree classifier
21 | The decision tree to be plotted.
22 |
23 | max_depth : int, optional (default=None)
24 | The maximum depth of the representation. If None, the tree is fully
25 | generated.
26 |
27 | feature_names : list of strings, optional (default=None)
28 | Names of each of the features.
29 |
30 | class_names : list of strings, bool or None, optional (default=None)
31 | Names of each of the target classes in ascending numerical order.
32 | Only relevant for classification and not supported for multi-output.
33 | If ``True``, shows a symbolic representation of the class name.
34 |
35 | label : {'all', 'root', 'none'}, optional (default='all')
36 | Whether to show informative labels for impurity, etc.
37 | Options include 'all' to show at every node, 'root' to show only at
38 | the top root node, or 'none' to not show at any node.
39 |
40 | filled : bool, optional (default=False)
41 | When set to ``True``, paint nodes to indicate majority class for
42 | classification, extremity of values for regression, or purity of node
43 | for multi-output.
44 |
45 | leaves_parallel : bool, optional (default=False)
46 | When set to ``True``, draw all leaf nodes at the bottom of the tree.
47 |
48 | impurity : bool, optional (default=True)
49 | When set to ``True``, show the impurity at each node.
50 |
51 | node_ids : bool, optional (default=False)
52 | When set to ``True``, show the ID number on each node.
53 |
54 | proportion : bool, optional (default=False)
55 | When set to ``True``, change the display of 'values' and/or 'samples'
56 | to be proportions and percentages respectively.
57 |
58 | rotate : bool, optional (default=False)
59 | When set to ``True``, orient tree left to right rather than top-down.
60 |
61 | rounded : bool, optional (default=False)
62 | When set to ``True``, draw node boxes with rounded corners and use
63 | Helvetica fonts instead of Times-Roman.
64 |
65 | special_characters : bool, optional (default=False)
66 | When set to ``False``, ignore special characters for PostScript
67 | compatibility.
68 |
69 | precision : int, optional (default=3)
70 | Number of digits of precision for floating point in the values of
71 | impurity, threshold and value attributes of each node.
72 |
73 | ax : matplotlib axis, optional (default=None)
74 | Axes to plot to. If None, use current axis.
75 |
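fontsize : int, optional (default=None)
    Font size for the node text. If None, the font size is determined
    automatically so that the node boxes do not overlap.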
76 | Examples
77 | --------
78 | >>> from sklearn.datasets import load_iris
79 | >>> from sklearn import tree
80 | >>> clf = tree.DecisionTreeClassifier()
81 | >>> iris = load_iris()
82 |
83 | >>> clf = clf.fit(iris.data, iris.target)
84 | >>> plot_tree(clf) # doctest: +SKIP
85 |
86 | """
87 | exporter = _MPLTreeExporter(
88 | max_depth=max_depth, feature_names=feature_names,
89 | class_names=class_names, label=label, filled=filled,
90 | leaves_parallel=leaves_parallel, impurity=impurity, node_ids=node_ids,
91 | proportion=proportion, rotate=rotate, rounded=rounded,
92 | special_characters=special_characters, precision=precision,
93 | fontsize=fontsize)
94 | exporter.export(decision_tree, ax=ax)
95 |
96 |
97 | class _BaseTreeExporter(object):
98 | def get_color(self, value):
99 | # Find the appropriate color & intensity for a node
100 | if self.colors['bounds'] is None:
101 | # Classification tree
102 | color = list(self.colors['rgb'][np.argmax(value)])
103 | sorted_values = sorted(value, reverse=True)
104 | if len(sorted_values) == 1:
105 | alpha = 0
106 | else:
107 | alpha = ((sorted_values[0] - sorted_values[1])
108 | / (1 - sorted_values[1]))
109 | else:
110 | # Regression tree or multi-output
111 | color = list(self.colors['rgb'][0])
112 | alpha = ((value - self.colors['bounds'][0]) /
113 | (self.colors['bounds'][1] - self.colors['bounds'][0]))
114 | # unpack numpy scalars
115 | alpha = float(alpha)
116 | # compute the color as alpha against white
117 | color = [int(round(alpha * c + (1 - alpha) * 255, 0)) for c in color]
118 | # Return html color code in #RRGGBB format
119 | hex_codes = [str(i) for i in range(10)]
120 | hex_codes.extend(['a', 'b', 'c', 'd', 'e', 'f'])
121 | color = [hex_codes[c // 16] + hex_codes[c % 16] for c in color]
122 |
123 | return '#' + ''.join(color)
124 |
125 | def get_fill_color(self, tree, node_id):
126 | # Fetch appropriate color for node
127 | if 'rgb' not in self.colors:
128 | # Initialize colors and bounds if required
129 | self.colors['rgb'] = _color_brew(tree.n_classes[0])
130 | if tree.n_outputs != 1:
131 | # Find max and min impurities for multi-output
132 | self.colors['bounds'] = (np.min(-tree.impurity),
133 | np.max(-tree.impurity))
134 | elif (tree.n_classes[0] == 1 and
135 | len(np.unique(tree.value)) != 1):
136 | # Find max and min values in leaf nodes for regression
137 | self.colors['bounds'] = (np.min(tree.value),
138 | np.max(tree.value))
139 | if tree.n_outputs == 1:
140 | node_val = (tree.value[node_id][0, :] /
141 | tree.weighted_n_node_samples[node_id])
142 | if tree.n_classes[0] == 1:
143 | # Regression
144 | node_val = tree.value[node_id][0, :]
145 | else:
146 | # If multi-output color node by impurity
147 | node_val = -tree.impurity[node_id]
148 | return self.get_color(node_val)
149 |
150 | def node_to_str(self, tree, node_id, criterion):
151 | # Generate the node content string
152 | if tree.n_outputs == 1:
153 | value = tree.value[node_id][0, :]
154 | else:
155 | value = tree.value[node_id]
156 |
157 | # Should labels be shown?
158 | labels = (self.label == 'root' and node_id == 0) or self.label == 'all'
159 |
160 | characters = self.characters
161 | node_string = characters[-1]
162 |
163 | # Write node ID
164 | if self.node_ids:
165 | if labels:
166 | node_string += 'node '
167 | node_string += characters[0] + str(node_id) + characters[4]
168 |
169 | # Write decision criteria
170 | if tree.children_left[node_id] != _tree.TREE_LEAF:
171 | # Always write node decision criteria, except for leaves
172 | if self.feature_names is not None:
173 | feature = self.feature_names[tree.feature[node_id]]
174 | else:
175 | feature = "X%s%s%s" % (characters[1],
176 | tree.feature[node_id],
177 | characters[2])
178 | node_string += '%s %s %s%s' % (feature,
179 | characters[3],
180 | round(tree.threshold[node_id],
181 | self.precision),
182 | characters[4])
183 |
184 | # Write impurity
185 | if self.impurity:
186 | if isinstance(criterion, _criterion.FriedmanMSE):
187 | criterion = "friedman_mse"
188 | elif not isinstance(criterion, six.string_types):
189 | criterion = "impurity"
190 | if labels:
191 | node_string += '%s = ' % criterion
192 | node_string += (str(round(tree.impurity[node_id], self.precision))
193 | + characters[4])
194 |
195 | # Write node sample count
196 | if labels:
197 | node_string += 'samples = '
198 | if self.proportion:
199 | percent = (100. * tree.n_node_samples[node_id] /
200 | float(tree.n_node_samples[0]))
201 | node_string += (str(round(percent, 1)) + '%' +
202 | characters[4])
203 | else:
204 | node_string += (str(tree.n_node_samples[node_id]) +
205 | characters[4])
206 |
207 | # Write node class distribution / regression value
208 | if self.proportion and tree.n_classes[0] != 1:
209 | # For classification this will show the proportion of samples
210 | value = value / tree.weighted_n_node_samples[node_id]
211 | if labels:
212 | node_string += 'value = '
213 | if tree.n_classes[0] == 1:
214 | # Regression
215 | value_text = np.around(value, self.precision)
216 | elif self.proportion:
217 | # Classification
218 | value_text = np.around(value, self.precision)
219 | elif np.all(np.equal(np.mod(value, 1), 0)):
220 | # Classification without floating-point weights
221 | value_text = value.astype(int)
222 | else:
223 | # Classification with floating-point weights
224 | value_text = np.around(value, self.precision)
225 | # Strip whitespace
226 | value_text = str(value_text.astype('S32')).replace("b'", "'")
227 | value_text = value_text.replace("' '", ", ").replace("'", "")
228 | if tree.n_classes[0] == 1 and tree.n_outputs == 1:
229 | value_text = value_text.replace("[", "").replace("]", "")
230 | value_text = value_text.replace("\n ", characters[4])
231 | node_string += value_text + characters[4]
232 |
233 | # Write node majority class
234 | if (self.class_names is not None and
235 | tree.n_classes[0] != 1 and
236 | tree.n_outputs == 1):
237 | # Only done for single-output classification trees
238 | if labels:
239 | node_string += 'class = '
240 | if self.class_names is not True:
241 | class_name = self.class_names[np.argmax(value)]
242 | else:
243 | class_name = "y%s%s%s" % (characters[1],
244 | np.argmax(value),
245 | characters[2])
246 | node_string += class_name
247 |
248 | # Clean up any trailing newlines
249 | if node_string.endswith(characters[4]):
250 | node_string = node_string[:-len(characters[4])]
251 |
252 | return node_string + characters[5]
253 |
254 |
255 | class _MPLTreeExporter(_BaseTreeExporter):
256 | def __init__(self, max_depth=None, feature_names=None,
257 | class_names=None, label='all', filled=False,
258 | leaves_parallel=False, impurity=True, node_ids=False,
259 | proportion=False, rotate=False, rounded=False,
260 | special_characters=False, precision=3, fontsize=None):
261 | self.max_depth = max_depth
262 | self.feature_names = feature_names
263 | self.class_names = class_names
264 | self.label = label
265 | self.filled = filled
266 | self.leaves_parallel = leaves_parallel
267 | self.impurity = impurity
268 | self.node_ids = node_ids
269 | self.proportion = proportion
270 | self.rotate = rotate
271 | self.rounded = rounded
272 | self.special_characters = special_characters
273 | self.precision = precision
274 | self.fontsize = fontsize
275 | self._scaley = 10
276 |
277 | # validate
278 | if isinstance(precision, Integral):
279 | if precision < 0:
280 | raise ValueError("'precision' should be greater or equal to 0."
281 | " Got {} instead.".format(precision))
282 | else:
283 | raise ValueError("'precision' should be an integer. Got {}"
284 | " instead.".format(type(precision)))
285 |
286 | # The depth of each node for plotting with 'leaf' option
287 | self.ranks = {'leaves': []}
288 | # The colors to render each node with
289 | self.colors = {'bounds': None}
290 |
291 | self.characters = ['#', '[', ']', '<=', '\n', '', '']
292 |
293 | self.bbox_args = dict(fc='w')
294 | if self.rounded:
295 | self.bbox_args['boxstyle'] = "round"
296 | self.arrow_args = dict(arrowstyle="<-")
297 |
298 | def _make_tree(self, node_id, et):
299 | # traverses _tree.Tree recursively, builds intermediate
300 | # "_reingold_tilford.Tree" object
301 | name = self.node_to_str(et, node_id, criterion='entropy')
302 | if (et.children_left[node_id] != et.children_right[node_id]):
303 | children = [self._make_tree(et.children_left[node_id], et),
304 | self._make_tree(et.children_right[node_id], et)]
305 | else:
306 | return Tree(name, node_id)
307 | return Tree(name, node_id, *children)
308 |
309 | def export(self, decision_tree, ax=None):
310 | import matplotlib.pyplot as plt
311 | from matplotlib.text import Annotation
312 |
313 | if ax is None:
314 | ax = plt.gca()
315 | ax.set_axis_off()
316 | my_tree = self._make_tree(0, decision_tree.tree_)
317 | dt = buchheim(my_tree)
318 | self._scalex = 1
319 | self.recurse(dt, decision_tree.tree_, ax)
320 |
321 | anns = [ann for ann in ax.get_children()
322 | if isinstance(ann, Annotation)]
323 |
324 | # get all the annotated points
325 | xys = [ann.xyann for ann in anns]
326 |
327 | mins = np.min(xys, axis=0)
328 | maxs = np.max(xys, axis=0)
329 |
330 | ax.set_xlim(mins[0], maxs[0])
331 | ax.set_ylim(maxs[1], mins[1])
332 |
333 | if self.fontsize is None:
334 | # get figure to data transform
335 | inv = ax.transData.inverted()
336 | renderer = ax.figure.canvas.get_renderer()
337 | # update sizes of all bboxes
338 | for ann in anns:
339 | ann.update_bbox_position_size(renderer)
340 | # get max box width
341 | widths = [inv.get_matrix()[0, 0]
342 | * ann.get_bbox_patch().get_window_extent().width
343 | for ann in anns]
344 | # keep max_width at least 1 so the font size is never scaled up
345 | max_width = max(max(widths), 1)
346 | # adjust fontsize to avoid overlap
347 | # width should be around 1 in data coordinates
348 | size = anns[0].get_fontsize() / max_width
349 | for ann in anns:
350 | ann.set_fontsize(size)
351 |
352 | def recurse(self, node, tree, ax, depth=0):
353 | kwargs = dict(bbox=self.bbox_args, ha='center', va='center',
354 | zorder=100 - 10 * depth)
355 |
356 | if self.fontsize is not None:
357 | kwargs['fontsize'] = self.fontsize
358 |
359 | xy = (node.x * self._scalex, node.y * self._scaley)
360 |
361 | if self.max_depth is None or depth <= self.max_depth:
362 | if self.filled:
363 | kwargs['bbox']['fc'] = self.get_fill_color(tree,
364 | node.tree.node_id)
365 | if node.parent is None:
366 | # root
367 | ax.annotate(node.tree.node, xy, **kwargs)
368 | else:
369 | xy_parent = (node.parent.x * self._scalex,
370 | node.parent.y * self._scaley)
371 | kwargs["arrowprops"] = self.arrow_args
372 | ax.annotate(node.tree.node, xy_parent, xy, **kwargs)
373 | for child in node.children:
374 | self.recurse(child, tree, ax, depth=depth + 1)
375 |
376 | else:
377 | xy_parent = (node.parent.x * self._scalex, node.parent.y *
378 | self._scaley)
379 | kwargs["arrowprops"] = self.arrow_args
380 | kwargs['bbox']['fc'] = 'grey'
381 | ax.annotate("\n (...) \n", xy_parent, xy, **kwargs)
382 |
383 |
384 | class DrawTree(object):
385 | def __init__(self, tree, parent=None, depth=0, number=1):
386 | self.x = -1.
387 | self.y = depth
388 | self.tree = tree
389 | self.children = [DrawTree(c, self, depth + 1, i + 1)
390 | for i, c
391 | in enumerate(tree.children)]
392 | self.parent = parent
393 | self.thread = None
394 | self.mod = 0
395 | self.ancestor = self
396 | self.change = self.shift = 0
397 | self._lmost_sibling = None
398 | # this is the number of the node in its group of siblings 1..n
399 | self.number = number
400 |
401 | def left(self):
402 | return self.thread or len(self.children) and self.children[0]
403 |
404 | def right(self):
405 | return self.thread or len(self.children) and self.children[-1]
406 |
407 | def lbrother(self):
408 | n = None
409 | if self.parent:
410 | for node in self.parent.children:
411 | if node == self:
412 | return n
413 | else:
414 | n = node
415 | return n
416 |
417 | def get_lmost_sibling(self):
418 | if not self._lmost_sibling and self.parent and self != \
419 | self.parent.children[0]:
420 | self._lmost_sibling = self.parent.children[0]
421 | return self._lmost_sibling
422 | lmost_sibling = property(get_lmost_sibling)
423 |
424 | def __str__(self):
425 | return "%s: x=%s mod=%s" % (self.tree, self.x, self.mod)
426 |
427 | def __repr__(self):
428 | return self.__str__()
429 |
430 |
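# Lay out the tree using the linear-time algorithm of Buchheim, Juenger and
# Leipert ("Improving Walker's Algorithm to Run in Linear Time", 2002); see
# the reference in `ancestor` below.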
431 | def buchheim(tree):
432 | dt = firstwalk(DrawTree(tree))
433 | min = second_walk(dt)
434 | if min < 0:
435 | third_walk(dt, -min)
436 | return dt
437 |
438 |
439 | def third_walk(tree, n):
440 | tree.x += n
441 | for c in tree.children:
442 | third_walk(c, n)
443 |
444 |
445 | def firstwalk(v, distance=1.):
446 | if len(v.children) == 0:
447 | if v.lmost_sibling:
448 | v.x = v.lbrother().x + distance
449 | else:
450 | v.x = 0.
451 | else:
452 | default_ancestor = v.children[0]
453 | for w in v.children:
454 | firstwalk(w)
455 | default_ancestor = apportion(w, default_ancestor, distance)
456 | # print("finished v =", v.tree, "children")
457 | execute_shifts(v)
458 |
459 | midpoint = (v.children[0].x + v.children[-1].x) / 2
460 |
461 | w = v.lbrother()
462 | if w:
463 | v.x = w.x + distance
464 | v.mod = v.x - midpoint
465 | else:
466 | v.x = midpoint
467 | return v
468 |
469 |
470 | def apportion(v, default_ancestor, distance):
471 | w = v.lbrother()
472 | if w is not None:
473 | # in buchheim notation:
474 | # i == inner; o == outer; r == right; l == left; r = +; l = -
475 | vir = vor = v
476 | vil = w
477 | vol = v.lmost_sibling
478 | sir = sor = v.mod
479 | sil = vil.mod
480 | sol = vol.mod
481 | while vil.right() and vir.left():
482 | vil = vil.right()
483 | vir = vir.left()
484 | vol = vol.left()
485 | vor = vor.right()
486 | vor.ancestor = v
487 | shift = (vil.x + sil) - (vir.x + sir) + distance
488 | if shift > 0:
489 | move_subtree(ancestor(vil, v, default_ancestor), v, shift)
490 | sir = sir + shift
491 | sor = sor + shift
492 | sil += vil.mod
493 | sir += vir.mod
494 | sol += vol.mod
495 | sor += vor.mod
496 | if vil.right() and not vor.right():
497 | vor.thread = vil.right()
498 | vor.mod += sil - sor
499 | else:
500 | if vir.left() and not vol.left():
501 | vol.thread = vir.left()
502 | vol.mod += sir - sol
503 | default_ancestor = v
504 | return default_ancestor
505 |
506 |
507 | def move_subtree(wl, wr, shift):
508 | subtrees = wr.number - wl.number
509 | # print(wl.tree, "is conflicted with", wr.tree, 'moving', subtrees,
510 | # 'shift', shift)
511 | # print wl, wr, wr.number, wl.number, shift, subtrees, shift/subtrees
512 | wr.change -= shift / subtrees
513 | wr.shift += shift
514 | wl.change += shift / subtrees
515 | wr.x += shift
516 | wr.mod += shift
517 |
518 |
519 | def execute_shifts(v):
520 | shift = change = 0
521 | for w in v.children[::-1]:
522 | # print("shift:", w, shift, w.change)
523 | w.x += shift
524 | w.mod += shift
525 | change += w.change
526 | shift += w.shift + change
527 |
528 |
529 | def ancestor(vil, v, default_ancestor):
530 | # the relevant text is at the bottom of page 7 of
531 | # "Improving Walker's Algorithm to Run in Linear Time" by Buchheim et al,
532 | # (2002)
533 | # http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.16.8757&rep=rep1&type=pdf
534 | if vil.ancestor in v.parent.children:
535 | return vil.ancestor
536 | else:
537 | return default_ancestor
538 |
539 |
540 | def second_walk(v, m=0, depth=0, min=None):
541 | v.x += m
542 | v.y = depth
543 |
544 | if min is None or v.x < min:
545 | min = v.x
546 |
547 | for w in v.children:
548 | min = second_walk(w, m + v.mod, depth + 1, min)
549 |
550 | return min
551 |
552 |
553 | class Tree(object):
554 | def __init__(self, node="", node_id=-1, *children):
555 | self.node = node
556 | self.width = len(node)
557 | self.node_id = node_id
558 | if children:
559 | self.children = children
560 | else:
561 | self.children = []
562 |
--------------------------------------------------------------------------------