├── .gitignore
├── 01-introduction.ipynb
├── 02-supervised-learning.ipynb
├── 03-unsupervised-learning.ipynb
├── 04-representing-data-feature-engineering.ipynb
├── 05-model-evaluation-and-improvement.ipynb
├── 06-algorithm-chains-and-pipelines.ipynb
├── 07-working-with-text-data.ipynb
├── 08-conclusion.ipynb
├── README.md
├── cover.jpg
├── data
│   ├── adult.data
│   ├── citibike.csv
│   └── ram_price.csv
├── environment.yml
├── images
│   ├── 05_gridsearch_overview.png
│   ├── api_table.png
│   ├── bag_of_words.png
│   ├── bag_of_words.svg
│   ├── classifier_comparison.png
│   ├── dendrogram.png
│   ├── iris_petal_sepal.png
│   ├── iris_petal_sepal.svg
│   ├── overfitting_underfitting_cartoon.png
│   ├── overfitting_underfitting_cartoon.svg
│   ├── pipeline.png
│   └── pipeline.svg
├── mglearn
│   ├── __init__.py
│   ├── datasets.py
│   ├── make_blobs.py
│   ├── plot_2d_separator.py
│   ├── plot_agglomerative.py
│   ├── plot_animal_tree.py
│   ├── plot_cross_validation.py
│   ├── plot_dbscan.py
│   ├── plot_decomposition.py
│   ├── plot_grid_search.py
│   ├── plot_helpers.py
│   ├── plot_improper_preprocessing.py
│   ├── plot_interactive_tree.py
│   ├── plot_kmeans.py
│   ├── plot_kneighbors_regularization.py
│   ├── plot_knn_classification.py
│   ├── plot_knn_regression.py
│   ├── plot_linear_regression.py
│   ├── plot_linear_svc_regularization.py
│   ├── plot_metrics.py
│   ├── plot_nmf.py
│   ├── plot_nn_graphs.py
│   ├── plot_pca.py
│   ├── plot_rbf_svm_parameters.py
│   ├── plot_ridge.py
│   ├── plot_scaling.py
│   ├── plot_tree_nonmonotonous.py
│   ├── plots.py
│   └── tools.py
└── preamble.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | .ipynb_checkpoints/
4 |
--------------------------------------------------------------------------------
/08-conclusion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "hide_input": false
7 | },
8 | "source": [
9 | "## Outlook\n",
10 | "### Approaching a machine learning problem\n",
11 | "### Humans in the loop"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "### From prototype to production"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Testing production systems"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Building your own estimator"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from sklearn.base import BaseEstimator, TransformerMixin\n",
42 | "\n",
43 | "class MyTransformer(BaseEstimator, TransformerMixin):\n",
44 | " def __init__(self, first_paramter=1, second_parameter=2):\n",
45 | " # all parameters must be specified in the __init__ function\n",
46 | " self.first_paramter = 1\n",
47 | " self.second_parameter = 2\n",
48 | " \n",
49 | " def fit(self, X, y=None):\n",
50 | " # fit should only take X and y as parameters\n",
51 | " # even if your model is unsupervised, you need to accept a y argument!\n",
52 | " \n",
53 | " # Model fitting code goes here\n",
54 | " print(\"fitting the model right here\")\n",
55 | " # fit returns self\n",
56 | " return self\n",
57 | " \n",
58 | " def transform(self, X):\n",
59 | " # transform takes as parameter only X\n",
60 | " \n",
61 | " # apply some transformation to X:\n",
62 | " X_transformed = X + 1\n",
63 | " return X_transformed"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### Where to go from here\n",
71 | "#### Theory\n",
72 | "#### Other machine learning frameworks and packages\n",
73 | "#### Ranking, recommender systems, time series, and other kinds of learning\n",
74 | "#### Probabilistic modeling, inference and probabilistic programming"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "#### Neural Networks"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "#### Scaling to larger datasets"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "metadata": {},
94 | "source": [
95 | "#### Honing your skills"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "#### Conclusion"
103 | ]
104 | }
105 | ],
106 | "metadata": {
107 | "anaconda-cloud": {},
108 | "kernelspec": {
109 | "display_name": "Python [conda env:root] *",
110 | "language": "python",
111 | "name": "conda-root-py"
112 | },
113 | "language_info": {
114 | "codemirror_mode": {
115 | "name": "ipython",
116 | "version": 3
117 | },
118 | "file_extension": ".py",
119 | "mimetype": "text/x-python",
120 | "name": "python",
121 | "nbconvert_exporter": "python",
122 | "pygments_lexer": "ipython3",
123 | "version": "3.7.6"
124 | },
125 | "toc-autonumbering": false
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 4
129 | }
130 |
--------------------------------------------------------------------------------
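A minimal usage sketch for the estimator cell above, assuming the ``MyTransformer`` class from the notebook is defined; the data and step names are illustrative. ``TransformerMixin`` supplies ``fit_transform``, and ``BaseEstimator`` supplies ``get_params``/``set_params``, which is what lets the class participate in ``Pipeline`` and ``GridSearchCV``:

```python
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

X = np.arange(10, dtype=float).reshape(-1, 1)
y = np.arange(10, dtype=float)

# TransformerMixin provides fit_transform (fit followed by transform)
trans = MyTransformer(first_parameter=5)
X_new = trans.fit_transform(X, y)  # prints "fitting the model right here"

# BaseEstimator provides get_params, built from the __init__ signature
print(trans.get_params())  # {'first_parameter': 5, 'second_parameter': 2}

# together they make the transformer usable inside a Pipeline
pipe = Pipeline([("transform", MyTransformer()), ("ridge", Ridge())])
pipe.fit(X, y)
```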
/README.md:
--------------------------------------------------------------------------------
1 | [](https://mybinder.org/v2/gh/amueller/introduction_to_ml_with_python/master)
2 |
3 | # Introduction to Machine Learning with Python
4 |
5 | This repository holds the code for the book "Introduction to Machine
6 | Learning with Python" by [Andreas Mueller](http://amueller.io) and [Sarah Guido](https://twitter.com/sarah_guido).
7 | You can find details about the book on the [O'Reilly website](http://shop.oreilly.com/product/0636920030515.do).
8 |
9 | The book requires the current stable version of scikit-learn, that is,
10 | 0.20.0. Most of the book also works with previous versions of
11 | scikit-learn, though you need to adjust the imports for everything from the
12 | ``model_selection`` module, mostly ``cross_val_score``, ``train_test_split``
13 | and ``GridSearchCV``.
14 |
15 |
16 | This repository provides the notebooks from which the book is created, together
17 | with the ``mglearn`` library of helper functions to create figures and
18 | datasets.
19 |
20 | For the curious, the cover depicts a [hellbender](https://en.wikipedia.org/wiki/Hellbender).
21 |
22 | All datasets are included in the repository, with the exception of the aclImdb dataset, which you can download from
23 | the page of [Andrew Maas](http://ai.stanford.edu/~amaas/data/sentiment/). See the book for details.
24 |
25 | If you get ``ImportError: No module named mglearn``, you can install mglearn into your Python environment by running
26 | ``pip install mglearn`` in your terminal, or ``!pip install mglearn`` in a Jupyter notebook.
27 |
28 |
29 | ## Errata
30 | Please note that the first print of the book is missing the following line when listing the assumed imports:
31 |
32 | ```python
33 | from IPython.display import display
34 | ```
35 | Please add this line if you see an error involving ``display``.
36 |
37 |
38 | The first print of the book used a function called ``plot_label_kfold``.
39 | This has been renamed to ``plot_group_kfold`` because of a rename in
40 | scikit-learn.
41 |
42 | ## Setup
43 |
44 | To run the code, you need the packages ``numpy``, ``scipy``, ``scikit-learn``, ``matplotlib``, ``pandas`` and ``pillow``.
45 | Some of the visualizations of decision trees and neural network structures also require ``graphviz``. The chapter
46 | on text processing also requires ``nltk`` and ``spacy``.
47 |
48 | The easiest way to set up an environment is by installing [Anaconda](https://www.continuum.io/downloads).
49 |
50 | ### Installing packages with conda
51 | If you already have a Python environment set up, and you are using the ``conda`` package manager, you can get all packages by running
52 |
53 | conda install numpy scipy scikit-learn matplotlib pandas pillow graphviz python-graphviz
54 |
55 | For the chapter on text processing you also need to install ``nltk`` and ``spacy``:
56 |
57 | conda install nltk spacy
58 |
59 |
60 | ### Installing packages with pip
61 | If you already have a Python environment and are using pip to install packages, you need to run
62 |
63 | pip install numpy scipy scikit-learn matplotlib pandas pillow graphviz
64 |
65 | You also need to install the graphviz C library, which is easiest to do with a package manager.
66 | If you are using OS X and Homebrew, you can run ``brew install graphviz``. If you are on Ubuntu or Debian, you can run ``apt-get install graphviz``.
67 | Installing graphviz on Windows can be tricky, so using conda / Anaconda is recommended.
68 | For the chapter on text processing you also need to install ``nltk`` and ``spacy``:
69 |
70 | pip install nltk spacy
71 |
72 | ### Downloading English language model
73 | For the text processing chapter, you need to download the English language model for spacy using
74 |
75 | python -m spacy download en
76 |
77 | ## Submitting Errata
78 |
79 | If you have errata for the (e-)book, please submit them via the [O'Reilly Website](http://www.oreilly.com/catalog/errata.csp?isbn=0636920030515).
80 | You can submit fixes to the code as pull-requests here, but I'd appreciate it if you would also submit them there, as this repository doesn't hold the
81 | "master notebooks".
82 |
83 | 
84 |
--------------------------------------------------------------------------------
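The scikit-learn version note in the README above refers to the 0.18 reorganization that moved these helpers into ``model_selection``; a sketch of the two import styles (the pre-0.18 module names are shown commented out, since they no longer exist in current releases):

```python
# scikit-learn >= 0.18: everything lives in model_selection
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)

# scikit-learn < 0.18 used separate modules:
# from sklearn.cross_validation import cross_val_score, train_test_split
# from sklearn.grid_search import GridSearchCV
```

Similarly, after running ``python -m spacy download en`` you can check that the language model loads; the ``en`` shortcut matches the spacy 2.x releases the book targets:

```python
import spacy

nlp = spacy.load("en")  # spacy 3.x would use "en_core_web_sm" instead
print([token.text for token in nlp("Machine learning is fun")])
```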
/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/cover.jpg
--------------------------------------------------------------------------------
/data/ram_price.csv:
--------------------------------------------------------------------------------
1 | ,date,price
2 | 0,1957.0,411041792.0
3 | 1,1959.0,67947725.0
4 | 2,1960.0,5242880.0
5 | 3,1965.0,2642412.0
6 | 4,1970.0,734003.0
7 | 5,1973.0,399360.0
8 | 6,1974.0,314573.0
9 | 7,1975.0,421888.0
10 | 8,1975.08,180224.0
11 | 9,1975.25,67584.0
12 | 10,1975.75,49920.0
13 | 11,1976.0,40704.0
14 | 12,1976.17,48960.0
15 | 13,1976.42,23040.0
16 | 14,1976.58,32000.0
17 | 15,1977.08,36800.0
18 | 16,1978.17,28000.0
19 | 17,1978.25,29440.0
20 | 18,1978.33,19200.0
21 | 19,1978.5,24000.0
22 | 20,1978.58,16000.0
23 | 21,1978.75,15200.0
24 | 22,1979.0,10528.0
25 | 23,1979.75,6704.0
26 | 24,1980.0,6480.0
27 | 25,1981.0,8800.0
28 | 26,1981.58,4479.0
29 | 27,1982.0,3520.0
30 | 28,1982.17,4464.0
31 | 29,1982.67,1980.0
32 | 30,1983.0,2396.0
33 | 31,1983.67,1980.0
34 | 32,1984.0,1379.0
35 | 33,1984.58,1331.0
36 | 34,1985.0,880.0
37 | 35,1985.33,720.0
38 | 36,1985.42,550.0
39 | 37,1985.5,420.0
40 | 38,1985.58,350.0
41 | 39,1985.67,300.0
42 | 40,1985.83,300.0
43 | 41,1985.92,300.0
44 | 42,1986.0,300.0
45 | 43,1986.08,300.0
46 | 44,1986.17,300.0
47 | 45,1986.25,300.0
48 | 46,1986.33,190.0
49 | 47,1986.42,190.0
50 | 48,1986.5,190.0
51 | 49,1986.58,190.0
52 | 50,1986.67,190.0
53 | 51,1986.75,190.0
54 | 52,1986.92,190.0
55 | 53,1987.0,176.0
56 | 54,1987.08,176.0
57 | 55,1987.17,157.0
58 | 56,1987.25,154.0
59 | 57,1987.33,154.0
60 | 58,1987.42,154.0
61 | 59,1987.5,154.0
62 | 60,1987.58,154.0
63 | 61,1987.67,163.0
64 | 62,1987.75,133.0
65 | 63,1987.83,163.0
66 | 64,1987.92,163.0
67 | 65,1988.0,163.0
68 | 66,1988.08,182.0
69 | 67,1988.17,199.0
70 | 68,1988.33,199.0
71 | 69,1988.42,199.0
72 | 70,1988.5,505.0
73 | 71,1988.58,505.0
74 | 72,1988.67,505.0
75 | 73,1988.75,505.0
76 | 74,1988.83,505.0
77 | 75,1988.92,505.0
78 | 76,1989.0,505.0
79 | 77,1989.08,505.0
80 | 78,1989.17,505.0
81 | 79,1989.25,505.0
82 | 80,1989.42,344.0
83 | 81,1989.5,197.0
84 | 82,1989.58,188.0
85 | 83,1989.67,188.0
86 | 84,1989.75,128.0
87 | 85,1989.83,117.0
88 | 86,1989.92,113.0
89 | 87,1990.0,106.0
90 | 88,1990.17,98.3
91 | 89,1990.33,98.3
92 | 90,1990.42,89.5
93 | 91,1990.5,82.8
94 | 92,1990.58,81.1
95 | 93,1990.67,71.5
96 | 94,1990.75,59.0
97 | 95,1990.83,51.0
98 | 96,1990.92,45.5
99 | 97,1991.0,44.5
100 | 98,1991.08,44.5
101 | 99,1991.17,45.0
102 | 100,1991.25,45.0
103 | 101,1991.33,45.0
104 | 102,1991.42,43.8
105 | 103,1991.5,43.8
106 | 104,1991.58,41.3
107 | 105,1991.67,46.3
108 | 106,1991.75,45.0
109 | 107,1991.83,39.8
110 | 108,1991.92,39.8
111 | 109,1992.0,36.3
112 | 110,1992.08,36.3
113 | 111,1992.17,36.3
114 | 112,1992.25,34.8
115 | 113,1992.33,30.0
116 | 114,1992.42,32.5
117 | 115,1992.5,33.5
118 | 116,1992.58,31.0
119 | 117,1992.67,27.5
120 | 118,1992.75,26.3
121 | 119,1992.83,26.3
122 | 120,1992.92,26.3
123 | 121,1993.0,33.1
124 | 122,1993.08,27.5
125 | 123,1993.17,27.5
126 | 124,1993.25,27.5
127 | 125,1993.33,27.5
128 | 126,1993.42,30.0
129 | 127,1993.5,30.0
130 | 128,1993.58,30.0
131 | 129,1993.67,30.0
132 | 130,1993.75,36.0
133 | 131,1993.83,39.8
134 | 132,1993.92,35.8
135 | 133,1994.0,35.8
136 | 134,1994.08,35.8
137 | 135,1994.17,36.0
138 | 136,1994.25,37.3
139 | 137,1994.33,37.3
140 | 138,1994.42,37.3
141 | 139,1994.5,38.5
142 | 140,1994.58,37.0
143 | 141,1994.67,34.0
144 | 142,1994.75,33.5
145 | 143,1994.83,32.3
146 | 144,1994.92,32.3
147 | 145,1995.0,32.3
148 | 146,1995.08,32.0
149 | 147,1995.17,32.0
150 | 148,1995.25,31.2
151 | 149,1995.33,31.2
152 | 150,1995.42,31.1
153 | 151,1995.5,31.2
154 | 152,1995.58,30.6
155 | 153,1995.67,33.1
156 | 154,1995.75,33.1
157 | 155,1995.83,30.9
158 | 156,1995.92,30.9
159 | 157,1996.0,29.9
160 | 158,1996.08,28.8
161 | 159,1996.17,26.1
162 | 160,1996.25,24.7
163 | 161,1996.33,17.2
164 | 162,1996.42,14.9
165 | 163,1996.5,11.3
166 | 164,1996.58,9.06
167 | 165,1996.67,8.44
168 | 166,1996.75,8.0
169 | 167,1996.83,5.25
170 | 168,1996.92,5.25
171 | 169,1997.0,4.63
172 | 170,1997.08,3.63
173 | 171,1997.17,3.0
174 | 172,1997.25,3.0
175 | 173,1997.33,3.0
176 | 174,1997.42,3.69
177 | 175,1997.5,4.0
178 | 176,1997.58,4.13
179 | 177,1997.67,3.63
180 | 178,1997.75,3.41
181 | 179,1997.83,3.25
182 | 180,1997.92,2.16
183 | 181,1998.0,2.16
184 | 182,1998.08,0.91
185 | 183,1998.17,0.97
186 | 184,1998.25,1.22
187 | 185,1998.33,1.19
188 | 186,1998.42,0.97
189 | 187,1998.58,1.03
190 | 188,1998.67,0.97
191 | 189,1998.75,1.16
192 | 190,1998.83,0.84
193 | 191,1998.92,0.84
194 | 192,1999.08,1.44
195 | 193,1999.13,0.84
196 | 194,1999.17,1.25
197 | 195,1999.25,1.25
198 | 196,1999.33,0.86
199 | 197,1999.5,0.78
200 | 198,1999.67,0.87
201 | 199,1999.75,1.04
202 | 200,1999.83,1.34
203 | 201,1999.92,2.35
204 | 202,2000.0,1.56
205 | 203,2000.08,1.48
206 | 204,2000.17,1.08
207 | 205,2000.25,0.84
208 | 206,2000.33,0.7
209 | 207,2000.42,0.9
210 | 208,2000.5,0.77
211 | 209,2000.58,0.84
212 | 210,2000.67,1.07
213 | 211,2000.75,1.12
214 | 212,2000.83,1.12
215 | 213,2000.92,0.9
216 | 214,2001.0,0.75
217 | 215,2001.08,0.464
218 | 216,2001.17,0.464
219 | 217,2001.25,0.383
220 | 218,2001.33,0.387
221 | 219,2001.42,0.305
222 | 220,2001.5,0.352
223 | 221,2001.5,0.27
224 | 222,2001.58,0.191
225 | 223,2001.67,0.191
226 | 224,2001.75,0.169
227 | 225,2001.77,0.148
228 | 226,2002.08,0.134
229 | 227,2002.08,0.207
230 | 228,2002.25,0.193
231 | 229,2002.33,0.193
232 | 230,2002.42,0.33
233 | 231,2002.58,0.193
234 | 232,2002.75,0.193
235 | 233,2003.17,0.176
236 | 234,2003.25,0.076
237 | 235,2003.33,0.126
238 | 236,2003.42,0.115
239 | 237,2003.5,0.133
240 | 238,2003.58,0.129
241 | 239,2003.67,0.143
242 | 240,2003.75,0.148
243 | 241,2003.83,0.16
244 | 242,2003.99,0.166
245 | 243,2004.0,0.174
246 | 244,2004.08,0.148
247 | 245,2004.17,0.146
248 | 246,2004.33,0.156
249 | 247,2004.42,0.203
250 | 248,2004.5,0.176
251 | 249,2005.25,0.185
252 | 250,2005.42,0.149
253 | 251,2005.83,0.116
254 | 252,2005.92,0.185
255 | 253,2006.17,0.112
256 | 254,2006.33,0.073
257 | 255,2006.5,0.082
258 | 256,2006.67,0.073
259 | 257,2006.75,0.088
260 | 258,2006.83,0.098
261 | 259,2006.99,0.092
262 | 260,2007.0,0.082
263 | 261,2007.08,0.078
264 | 262,2007.17,0.066
265 | 263,2007.33,0.0464
266 | 264,2007.5,0.0386
267 | 265,2007.67,0.0351
268 | 266,2007.75,0.0322
269 | 267,2007.83,0.0244
270 | 268,2007.92,0.0244
271 | 269,2008.0,0.0232
272 | 270,2008.08,0.022
273 | 271,2008.33,0.022
274 | 272,2008.5,0.0207
275 | 273,2008.58,0.0176
276 | 274,2008.67,0.0146
277 | 275,2008.83,0.011
278 | 276,2008.92,0.0098
279 | 277,2009.0,0.0098
280 | 278,2009.08,0.0107
281 | 279,2009.25,0.0105
282 | 280,2009.42,0.0115
283 | 281,2009.5,0.011
284 | 282,2009.58,0.0127
285 | 283,2009.75,0.0183
286 | 284,2009.92,0.0205
287 | 285,2010.0,0.019
288 | 286,2010.08,0.0202
289 | 287,2010.17,0.0195
290 | 288,2010.33,0.0242
291 | 289,2010.5,0.021
292 | 290,2010.58,0.022
293 | 291,2010.75,0.0171
294 | 292,2010.83,0.0146
295 | 293,2010.92,0.0122
296 | 294,2011.0,0.01
297 | 295,2011.08,0.0103
298 | 296,2011.33,0.01
299 | 297,2011.42,0.0085
300 | 298,2011.67,0.0054
301 | 299,2011.75,0.0051
302 | 300,2012.0,0.0049
303 | 301,2012.08,0.0049
304 | 302,2012.25,0.005
305 | 303,2012.33,0.0049
306 | 304,2012.58,0.0048
307 | 305,2012.67,0.004
308 | 306,2012.83,0.0037
309 | 307,2013.0,0.0043
310 | 308,2013.08,0.0054
311 | 309,2013.33,0.0067
312 | 310,2013.42,0.0061
313 | 311,2013.58,0.0073
314 | 312,2013.67,0.0065
315 | 313,2013.75,0.0082
316 | 314,2013.83,0.0085
317 | 315,2013.92,0.0079
318 | 316,2014.08,0.0095
319 | 317,2014.17,0.0079
320 | 318,2014.25,0.0073
321 | 319,2014.42,0.0079
322 | 320,2014.58,0.0085
323 | 321,2014.67,0.0085
324 | 322,2014.83,0.0085
325 | 323,2015.0,0.0078
326 | 324,2015.08,0.0073
327 | 325,2015.25,0.0061
328 | 326,2015.33,0.0056
329 | 327,2015.5,0.0049
330 | 328,2015.58,0.0045
331 | 329,2015.67,0.0043
332 | 330,2015.75,0.0042
333 | 331,2015.83,0.0038
334 | 332,2015.92,0.0037
335 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: python-ml
2 | dependencies:
3 | - numpy
4 | - scipy
5 | - scikit-learn
6 | - matplotlib
7 | - pandas
8 | - pillow
9 | - graphviz
10 | - python-graphviz
11 | - imageio
12 | - joblib
13 |
--------------------------------------------------------------------------------
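The environment above can be created with ``conda env create -f environment.yml`` and activated with ``conda activate python-ml``. A quick sanity check that everything resolved; note that ``pillow`` is imported as ``PIL`` and ``scikit-learn`` as ``sklearn``:

```python
# verify that all packages listed in environment.yml import cleanly
import graphviz
import imageio
import joblib
import matplotlib
import numpy
import pandas
import PIL
import scipy
import sklearn

print("scikit-learn", sklearn.__version__)
```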
/images/05_gridsearch_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/05_gridsearch_overview.png
--------------------------------------------------------------------------------
/images/api_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/api_table.png
--------------------------------------------------------------------------------
/images/bag_of_words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/bag_of_words.png
--------------------------------------------------------------------------------
/images/classifier_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/classifier_comparison.png
--------------------------------------------------------------------------------
/images/dendrogram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/dendrogram.png
--------------------------------------------------------------------------------
/images/iris_petal_sepal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/iris_petal_sepal.png
--------------------------------------------------------------------------------
/images/overfitting_underfitting_cartoon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/overfitting_underfitting_cartoon.png
--------------------------------------------------------------------------------
/images/overfitting_underfitting_cartoon.svg:
--------------------------------------------------------------------------------
(SVG source omitted in this dump)
--------------------------------------------------------------------------------
/images/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/pipeline.png
--------------------------------------------------------------------------------
/images/pipeline.svg:
--------------------------------------------------------------------------------
(SVG source omitted in this dump; the original file has 855 lines)
--------------------------------------------------------------------------------
/mglearn/__init__.py:
--------------------------------------------------------------------------------
1 | from . import plots
2 | from . import tools
3 | from .plots import cm3, cm2
4 | from .tools import discrete_scatter
5 | from .plot_helpers import ReBl
6 |
7 | __version__ = "0.2.0"
8 |
9 | __all__ = ['tools', 'plots', 'cm3', 'cm2', 'discrete_scatter', 'ReBl']
10 |
--------------------------------------------------------------------------------
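A quick sketch of the public API re-exported by ``mglearn/__init__.py`` above, assuming the package and its dependencies are installed:

```python
import numpy as np
import matplotlib.pyplot as plt
import mglearn

rng = np.random.RandomState(0)
X = rng.normal(size=(30, 2))
y = np.repeat([0, 1], 15)

# discrete_scatter draws one marker style and color per class label
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
plt.legend(["Class 0", "Class 1"])
plt.show()
```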
/mglearn/datasets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import os
4 | from scipy import signal
5 | from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
6 | from sklearn.datasets import make_blobs
7 | from sklearn.utils import Bunch
8 |
9 | DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
10 |
11 |
12 | def make_forge():
13 |     # a carefully hand-designed dataset
14 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
15 | y[np.array([7, 27])] = 0
16 | mask = np.ones(len(X), dtype=bool)
17 | mask[np.array([0, 1, 5, 26])] = 0
18 | X, y = X[mask], y[mask]
19 | return X, y
20 |
21 |
22 | def make_wave(n_samples=100):
23 | rnd = np.random.RandomState(42)
24 | x = rnd.uniform(-3, 3, size=n_samples)
25 | y_no_noise = (np.sin(4 * x) + x)
26 | y = (y_no_noise + rnd.normal(size=len(x))) / 2
27 | return x.reshape(-1, 1), y
28 |
29 |
30 | def load_boston():
31 | try:
32 | from sklearn.datasets import load_boston
33 | return load_boston()
34 | except ImportError:
35 | pass
36 | data_url = "http://lib.stat.cmu.edu/datasets/boston"
37 |     raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
38 | data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
39 | target = raw_df.values[1::2, 2]
40 | return Bunch(data=data, target=target)
41 |
42 |
43 | def load_extended_boston():
44 | boston = load_boston()
45 |     # rescale features to the [0, 1] range, then add degree-2
46 |     # polynomial interaction features
47 |     X = MinMaxScaler().fit_transform(boston.data)
48 | X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
49 | return X, boston.target
50 |
51 |
52 | def load_citibike():
53 | data_mine = pd.read_csv(os.path.join(DATA_PATH, "citibike.csv"))
54 | data_mine['one'] = 1
55 | data_mine['starttime'] = pd.to_datetime(data_mine.starttime)
56 | data_starttime = data_mine.set_index("starttime")
57 | data_resampled = data_starttime.resample("3h").sum().fillna(0)
58 | return data_resampled.one
59 |
60 |
61 | def make_signals():
62 | # fix a random state seed
63 | rng = np.random.RandomState(42)
64 | n_samples = 2000
65 | time = np.linspace(0, 8, n_samples)
66 | # create three signals
67 | s1 = np.sin(2 * time) # Signal 1 : sinusoidal signal
68 | s2 = np.sign(np.sin(3 * time)) # Signal 2 : square signal
69 | s3 = signal.sawtooth(2 * np.pi * time) # Signal 3: saw tooth signal
70 |
71 | # concatenate the signals, add noise
72 | S = np.c_[s1, s2, s3]
73 | S += 0.2 * rng.normal(size=S.shape)
74 |
75 | S /= S.std(axis=0) # Standardize data
76 | S -= S.min()
77 | return S
78 |
--------------------------------------------------------------------------------
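Example calls for the dataset helpers defined above; the shapes in the comments follow directly from the code:

```python
from mglearn.datasets import make_forge, make_wave, load_citibike

X, y = make_forge()             # 26 points in 2D with binary labels
print(X.shape, y.shape)         # (26, 2) (26,)

X, y = make_wave(n_samples=40)  # noisy 1D regression toy data
print(X.shape, y.shape)         # (40, 1) (40,)

citibike = load_citibike()      # pandas Series of rentals per 3-hour window
print(citibike.head())
```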
/mglearn/make_blobs.py:
--------------------------------------------------------------------------------
1 | import numbers
2 | import numpy as np
3 |
4 | from sklearn.utils import check_array, check_random_state
5 | from sklearn.utils import shuffle as shuffle_
6 | from sklearn.utils.deprecation import deprecated
7 |
8 |
9 | @deprecated("Please import make_blobs directly from scikit-learn")
10 | def make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=1.0,
11 | center_box=(-10.0, 10.0), shuffle=True, random_state=None):
12 | """Generate isotropic Gaussian blobs for clustering.
13 |
14 |     Read more in the :ref:`User Guide <sample_generators>`.
15 |
16 | Parameters
17 | ----------
18 | n_samples : int, or tuple, optional (default=100)
19 | The total number of points equally divided among clusters.
20 |
21 | n_features : int, optional (default=2)
22 | The number of features for each sample.
23 |
24 | centers : int or array of shape [n_centers, n_features], optional
25 |         (default=2)
26 | The number of centers to generate, or the fixed center locations.
27 |
28 |     cluster_std : float or sequence of floats, optional (default=1.0)
29 | The standard deviation of the clusters.
30 |
31 |     center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
32 | The bounding box for each cluster center when centers are
33 | generated at random.
34 |
35 | shuffle : boolean, optional (default=True)
36 | Shuffle the samples.
37 |
38 | random_state : int, RandomState instance or None, optional (default=None)
39 | If int, random_state is the seed used by the random number generator;
40 | If RandomState instance, random_state is the random number generator;
41 | If None, the random number generator is the RandomState instance used
42 | by `np.random`.
43 |
44 | Returns
45 | -------
46 | X : array of shape [n_samples, n_features]
47 | The generated samples.
48 |
49 | y : array of shape [n_samples]
50 | The integer labels for cluster membership of each sample.
51 |
52 | Examples
53 | --------
54 |     >>> from sklearn.datasets import make_blobs
55 | >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
56 | ... random_state=0)
57 | >>> print(X.shape)
58 | (10, 2)
59 | >>> y
60 | array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])
61 |
62 | See also
63 | --------
64 | make_classification: a more intricate variant
65 | """
66 | generator = check_random_state(random_state)
67 |
68 | if isinstance(centers, numbers.Integral):
69 | centers = generator.uniform(center_box[0], center_box[1],
70 | size=(centers, n_features))
71 | else:
72 | centers = check_array(centers)
73 | n_features = centers.shape[1]
74 |
75 | if isinstance(cluster_std, numbers.Real):
76 | cluster_std = np.ones(len(centers)) * cluster_std
77 |
78 | X = []
79 | y = []
80 |
81 | n_centers = centers.shape[0]
82 | if isinstance(n_samples, numbers.Integral):
83 | n_samples_per_center = [int(n_samples // n_centers)] * n_centers
84 | for i in range(n_samples % n_centers):
85 | n_samples_per_center[i] += 1
86 | else:
87 | n_samples_per_center = n_samples
88 |
89 | for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
90 | X.append(centers[i] + generator.normal(scale=std,
91 | size=(n, n_features)))
92 | y += [i] * n
93 |
94 | X = np.concatenate(X)
95 | y = np.array(y)
96 |
97 | if shuffle:
98 | X, y = shuffle_(X, y, random_state=generator)
99 |
100 | return X, y
101 |
--------------------------------------------------------------------------------
/mglearn/plot_2d_separator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from .plot_helpers import cm2, cm3, discrete_scatter
4 |
5 |
6 | def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None,
7 | alpha=1, cm=cm3):
8 | # multiclass
9 | if eps is None:
10 | eps = X.std() / 2.
11 |
12 | if ax is None:
13 | ax = plt.gca()
14 |
15 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
16 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
17 | xx = np.linspace(x_min, x_max, 1000)
18 | yy = np.linspace(y_min, y_max, 1000)
19 |
20 | X1, X2 = np.meshgrid(xx, yy)
21 | X_grid = np.c_[X1.ravel(), X2.ravel()]
22 | decision_values = classifier.predict(X_grid)
23 | ax.imshow(decision_values.reshape(X1.shape), extent=(x_min, x_max,
24 | y_min, y_max),
25 | aspect='auto', origin='lower', alpha=alpha, cmap=cm)
26 | ax.set_xlim(x_min, x_max)
27 | ax.set_ylim(y_min, y_max)
28 | ax.set_xticks(())
29 | ax.set_yticks(())
30 |
31 |
32 | def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm="viridis",
33 | function=None):
34 | # binary with fill
35 | if eps is None:
36 | eps = X.std() / 2.
37 |
38 | if ax is None:
39 | ax = plt.gca()
40 |
41 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
42 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
43 | xx = np.linspace(x_min, x_max, 100)
44 | yy = np.linspace(y_min, y_max, 100)
45 |
46 | X1, X2 = np.meshgrid(xx, yy)
47 | X_grid = np.c_[X1.ravel(), X2.ravel()]
48 | if function is None:
49 | function = getattr(classifier, "decision_function",
50 | getattr(classifier, "predict_proba"))
51 | else:
52 | function = getattr(classifier, function)
53 | decision_values = function(X_grid)
54 | if decision_values.ndim > 1 and decision_values.shape[1] > 1:
55 | # predict_proba
56 | decision_values = decision_values[:, 1]
57 | grr = ax.imshow(decision_values.reshape(X1.shape),
58 | extent=(x_min, x_max, y_min, y_max), aspect='auto',
59 | origin='lower', alpha=alpha, cmap=cm)
60 |
61 | ax.set_xlim(x_min, x_max)
62 | ax.set_ylim(y_min, y_max)
63 | ax.set_xticks(())
64 | ax.set_yticks(())
65 | return grr
66 |
67 |
68 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1,
69 | cm=cm2, linewidth=None, threshold=None,
70 | linestyle="solid"):
71 | # binary?
72 | if eps is None:
73 | eps = X.std() / 2.
74 |
75 | if ax is None:
76 | ax = plt.gca()
77 |
78 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
79 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
80 | xx = np.linspace(x_min, x_max, 1000)
81 | yy = np.linspace(y_min, y_max, 1000)
82 |
83 | X1, X2 = np.meshgrid(xx, yy)
84 | X_grid = np.c_[X1.ravel(), X2.ravel()]
85 | try:
86 | decision_values = classifier.decision_function(X_grid)
87 | levels = [0] if threshold is None else [threshold]
88 | fill_levels = [decision_values.min()] + levels + [
89 | decision_values.max()]
90 | except AttributeError:
91 | # no decision_function
92 | decision_values = classifier.predict_proba(X_grid)[:, 1]
93 | levels = [.5] if threshold is None else [threshold]
94 | fill_levels = [0] + levels + [1]
95 | if fill:
96 | ax.contourf(X1, X2, decision_values.reshape(X1.shape),
97 | levels=fill_levels, alpha=alpha, cmap=cm)
98 | else:
99 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels,
100 | colors="black", alpha=alpha, linewidths=linewidth,
101 | linestyles=linestyle, zorder=5)
102 |
103 | ax.set_xlim(x_min, x_max)
104 | ax.set_ylim(y_min, y_max)
105 | ax.set_xticks(())
106 | ax.set_yticks(())
--------------------------------------------------------------------------------
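A usage sketch for the boundary-plotting helper above; any fitted classifier exposing ``decision_function`` or ``predict_proba`` works:

```python
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from mglearn.datasets import make_forge
from mglearn.plot_2d_separator import plot_2d_separator
from mglearn.plot_helpers import discrete_scatter

X, y = make_forge()
clf = LogisticRegression().fit(X, y)

# shade the two decision regions, then overlay the training points
plot_2d_separator(clf, X, fill=True, alpha=.4)
discrete_scatter(X[:, 0], X[:, 1], y)
plt.show()
```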
/mglearn/plot_agglomerative.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.datasets import make_blobs
4 | from sklearn.cluster import AgglomerativeClustering
5 | from sklearn.neighbors import KernelDensity
6 |
7 |
8 | def plot_agglomerative_algorithm():
9 | # generate synthetic two-dimensional data
10 | X, y = make_blobs(random_state=0, n_samples=12)
11 |
12 | agg = AgglomerativeClustering(n_clusters=X.shape[0], compute_full_tree=True).fit(X)
13 |
14 | fig, axes = plt.subplots(X.shape[0] // 5, 5, subplot_kw={'xticks': (),
15 | 'yticks': ()},
16 | figsize=(20, 8))
17 |
18 | eps = X.std() / 2
19 |
20 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
21 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
22 |
23 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
24 | gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)]
25 |
26 | for i, ax in enumerate(axes.ravel()):
27 | ax.set_xlim(x_min, x_max)
28 | ax.set_ylim(y_min, y_max)
29 | agg.n_clusters = X.shape[0] - i
30 | agg.fit(X)
31 | ax.set_title("Step %d" % i)
32 | ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
33 | bins = np.bincount(agg.labels_)
34 | for cluster in range(agg.n_clusters):
35 | if bins[cluster] > 1:
36 | points = X[agg.labels_ == cluster]
37 | other_points = X[agg.labels_ != cluster]
38 |
39 | kde = KernelDensity(bandwidth=.5).fit(points)
40 | scores = kde.score_samples(gridpoints)
41 | score_inside = np.min(kde.score_samples(points))
42 | score_outside = np.max(kde.score_samples(other_points))
43 | levels = .8 * score_inside + .2 * score_outside
44 | ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels],
45 | colors='k', linestyles='solid', linewidths=2)
46 |
47 | axes[0, 0].set_title("Initialization")
48 |
49 |
50 | def plot_agglomerative():
51 | X, y = make_blobs(random_state=0, n_samples=12)
52 | agg = AgglomerativeClustering(n_clusters=3)
53 |
54 | eps = X.std() / 2.
55 |
56 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
57 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
58 |
59 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
60 | gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)]
61 |
62 | ax = plt.gca()
63 | for i, x in enumerate(X):
64 | ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left', verticalalignment='center')
65 |
66 | ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
67 | ax.set_xticks(())
68 | ax.set_yticks(())
69 |
70 | for i in range(11):
71 | agg.n_clusters = X.shape[0] - i
72 | agg.fit(X)
73 |
74 | bins = np.bincount(agg.labels_)
75 | for cluster in range(agg.n_clusters):
76 | if bins[cluster] > 1:
77 | points = X[agg.labels_ == cluster]
78 | other_points = X[agg.labels_ != cluster]
79 |
80 | kde = KernelDensity(bandwidth=.5).fit(points)
81 | scores = kde.score_samples(gridpoints)
82 | score_inside = np.min(kde.score_samples(points))
83 | score_outside = np.max(kde.score_samples(other_points))
84 | levels = .8 * score_inside + .2 * score_outside
85 | ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels],
86 | colors='k', linestyles='solid', linewidths=1)
87 |
88 | ax.set_xlim(x_min, x_max)
89 | ax.set_ylim(y_min, y_max)
90 |
--------------------------------------------------------------------------------
/mglearn/plot_animal_tree.py:
--------------------------------------------------------------------------------
1 | from imageio import imread
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | def plot_animal_tree(ax=None):
6 | import graphviz
7 | if ax is None:
8 | ax = plt.gca()
9 | mygraph = graphviz.Digraph(node_attr={'shape': 'box'},
10 | edge_attr={'labeldistance': "10.5"},
11 | format="png")
12 | mygraph.node("0", "Has feathers?")
13 | mygraph.node("1", "Can fly?")
14 | mygraph.node("2", "Has fins?")
15 | mygraph.node("3", "Hawk")
16 | mygraph.node("4", "Penguin")
17 | mygraph.node("5", "Dolphin")
18 | mygraph.node("6", "Bear")
19 | mygraph.edge("0", "1", label="True")
20 | mygraph.edge("0", "2", label="False")
21 | mygraph.edge("1", "3", label="True")
22 | mygraph.edge("1", "4", label="False")
23 | mygraph.edge("2", "5", label="True")
24 | mygraph.edge("2", "6", label="False")
25 | mygraph.render("tmp")
26 | ax.imshow(imread("tmp.png"))
27 | ax.set_axis_off()
28 |
--------------------------------------------------------------------------------
/mglearn/plot_cross_validation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | def plot_group_kfold():
6 | from sklearn.model_selection import GroupKFold
7 | groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
8 |
9 | plt.figure(figsize=(10, 2))
10 | plt.title("GroupKFold")
11 |
12 | axes = plt.gca()
13 | axes.set_frame_on(False)
14 |
15 | n_folds = 12
16 | n_samples = 12
17 | n_iter = 3
18 | n_samples_per_fold = 1
19 |
20 | cv = GroupKFold(n_splits=3)
21 | mask = np.zeros((n_iter, n_samples))
22 | for i, (train, test) in enumerate(cv.split(range(12), groups=groups)):
23 | mask[i, train] = 1
24 | mask[i, test] = 2
25 |
26 | for i in range(n_folds):
27 | # test is grey
28 | colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
29 | # not selected has no hatch
30 |
31 | boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
32 | left=i * n_samples_per_fold, height=.6, color=colors,
33 | hatch="//", edgecolor="k", align='edge')
34 | for j in np.where(mask[:, i] == 0)[0]:
35 | boxes[j].set_hatch("")
36 |
37 | axes.barh(y=[n_iter] * n_folds, width=[1 - 0.1] * n_folds,
38 | left=np.arange(n_folds) * n_samples_per_fold, height=.6,
39 | color="w", edgecolor='k', align="edge")
40 |
41 | for i in range(12):
42 | axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" %
43 | groups[i], horizontalalignment="center")
44 |
45 | axes.invert_yaxis()
46 | axes.set_xlim(0, n_samples + 1)
47 | axes.set_ylabel("CV iterations")
48 | axes.set_xlabel("Data points")
49 | axes.set_xticks(np.arange(n_samples) + .5)
50 | axes.set_xticklabels(np.arange(1, n_samples + 1))
51 | axes.set_yticks(np.arange(n_iter + 1) + .3)
52 | axes.set_yticklabels(
53 | ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"])
54 | plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3))
55 | plt.tight_layout()
56 |
57 |
58 | def plot_shuffle_split():
59 | from sklearn.model_selection import ShuffleSplit
60 | plt.figure(figsize=(10, 2))
61 | plt.title("ShuffleSplit with 10 points"
62 | ", train_size=5, test_size=2, n_splits=4")
63 |
64 | axes = plt.gca()
65 | axes.set_frame_on(False)
66 |
67 | n_folds = 10
68 | n_samples = 10
69 | n_iter = 4
70 | n_samples_per_fold = 1
71 |
72 | ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43)
73 | mask = np.zeros((n_iter, n_samples))
74 | for i, (train, test) in enumerate(ss.split(range(10))):
75 | mask[i, train] = 1
76 | mask[i, test] = 2
77 |
78 | for i in range(n_folds):
79 | # test is grey
80 | colors = ["grey" if x == 2 else "white" for x in mask[:, i]]
81 | # not selected has no hatch
82 |
83 | boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter,
84 | left=i * n_samples_per_fold, height=.6, color=colors,
85 | hatch="//", edgecolor='k', align='edge')
86 | for j in np.where(mask[:, i] == 0)[0]:
87 | boxes[j].set_hatch("")
88 |
89 | axes.invert_yaxis()
90 | axes.set_xlim(0, n_samples + 1)
91 | axes.set_ylabel("CV iterations")
92 | axes.set_xlabel("Data points")
93 | axes.set_xticks(np.arange(n_samples) + .5)
94 | axes.set_xticklabels(np.arange(1, n_samples + 1))
95 | axes.set_yticks(np.arange(n_iter) + .3)
96 | axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)])
97 | # legend hacked for this random state
98 | plt.legend([boxes[1], boxes[0], boxes[2]], [
99 | "Training set", "Test set", "Not selected"], loc=(1, .3))
100 | plt.tight_layout()
101 |
102 |
103 | def plot_stratified_cross_validation():
104 | fig, both_axes = plt.subplots(2, 1, figsize=(12, 5))
105 | # plt.title("cross_validation_not_stratified")
106 | axes = both_axes[0]
107 | axes.set_title("Standard cross-validation with sorted class labels")
108 |
109 | axes.set_frame_on(False)
110 |
111 | n_folds = 3
112 | n_samples = 150
113 |
114 | n_samples_per_fold = n_samples / float(n_folds)
115 |
116 | for i in range(n_folds):
117 | colors = ["w"] * n_folds
118 | colors[i] = "grey"
119 | axes.barh(y=range(n_folds), width=[n_samples_per_fold - 1] *
120 | n_folds, left=i * n_samples_per_fold, height=.6,
121 | color=colors, hatch="//", edgecolor='k', align='edge')
122 |
123 | axes.barh(y=[n_folds] * n_folds, width=[n_samples_per_fold - 1] *
124 | n_folds, left=np.arange(3) * n_samples_per_fold, height=.6,
125 | color="w", edgecolor='k', align='edge')
126 |
127 | axes.invert_yaxis()
128 | axes.set_xlim(0, n_samples + 1)
129 | axes.set_ylabel("CV iterations")
130 | axes.set_xlabel("Data points")
131 | axes.set_xticks(np.arange(n_samples_per_fold / 2.,
132 | n_samples, n_samples_per_fold))
133 | axes.set_xticklabels(["Fold %d" % x for x in range(1, n_folds + 1)])
134 | axes.set_yticks(np.arange(n_folds + 1) + .3)
135 | axes.set_yticklabels(
136 | ["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"])
137 | for i in range(3):
138 | axes.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" %
139 | i, horizontalalignment="center")
140 |
141 | ax = both_axes[1]
142 | ax.set_title("Stratified Cross-validation")
143 | ax.set_frame_on(False)
144 | ax.invert_yaxis()
145 | ax.set_xlim(0, n_samples + 1)
146 | ax.set_ylabel("CV iterations")
147 | ax.set_xlabel("Data points")
148 |
149 | ax.set_yticks(np.arange(n_folds + 1) + .3)
150 | ax.set_yticklabels(
151 | ["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"])
152 |
153 | n_subsplit = n_samples_per_fold / 3.
154 | for i in range(n_folds):
155 | test_bars = ax.barh(
156 | y=[i] * n_folds, width=[n_subsplit - 1] * n_folds,
157 | left=np.arange(n_folds) * n_samples_per_fold + i * n_subsplit,
158 | height=.6, color="grey", hatch="//", edgecolor='k', align='edge')
159 |
160 | w = 2 * n_subsplit - 1
161 | ax.barh(y=[0] * n_folds, width=[w] * n_folds, left=np.arange(n_folds)
162 | * n_samples_per_fold + (0 + 1) * n_subsplit, height=.6, color="w",
163 | hatch="//", edgecolor='k', align='edge')
164 | ax.barh(y=[1] * (n_folds + 1), width=[w / 2., w, w, w / 2.],
165 | left=np.maximum(0, np.arange(n_folds + 1) * n_samples_per_fold -
166 | n_subsplit), height=.6, color="w", hatch="//",
167 | edgecolor='k', align='edge')
168 | training_bars = ax.barh(y=[2] * n_folds, width=[w] * n_folds,
169 | left=np.arange(n_folds) * n_samples_per_fold,
170 | height=.6, color="w", hatch="//", edgecolor='k',
171 | align='edge')
172 |
173 | ax.barh(y=[n_folds] * n_folds, width=[n_samples_per_fold - 1] *
174 | n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6,
175 | color="w", edgecolor='k', align='edge')
176 |
177 | for i in range(3):
178 | ax.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" %
179 | i, horizontalalignment="center")
180 | ax.set_ylim(4, -0.1)
181 | plt.legend([training_bars[0], test_bars[0]], [
182 | 'Training data', 'Test data'], loc=(1.05, 1), frameon=False)
183 |
184 | fig.tight_layout()
185 |
186 |
187 | def plot_cross_validation():
188 | plt.figure(figsize=(12, 2))
189 | plt.title("cross_validation")
190 | axes = plt.gca()
191 | axes.set_frame_on(False)
192 |
193 | n_folds = 5
194 | n_samples = 25
195 |
196 | n_samples_per_fold = n_samples / float(n_folds)
197 |
198 | for i in range(n_folds):
199 | colors = ["w"] * n_folds
200 | colors[i] = "grey"
201 | bars = plt.barh(
202 | y=range(n_folds), width=[n_samples_per_fold - 0.1] * n_folds,
203 | left=i * n_samples_per_fold, height=.6, color=colors, hatch="//",
204 | edgecolor='k', align='edge')
205 | axes.invert_yaxis()
206 | axes.set_xlim(0, n_samples + 1)
207 | plt.ylabel("CV iterations")
208 | plt.xlabel("Data points")
209 | plt.xticks(np.arange(n_samples_per_fold / 2., n_samples,
210 | n_samples_per_fold),
211 | ["Fold %d" % x for x in range(1, n_folds + 1)])
212 | plt.yticks(np.arange(n_folds) + .3,
213 | ["Split %d" % x for x in range(1, n_folds + 1)])
214 | plt.legend([bars[0], bars[4]], ['Training data', 'Test data'],
215 | loc=(1.05, 0.4), frameon=False)
216 |
217 |
218 | def plot_threefold_split():
219 | plt.figure(figsize=(15, 1))
220 | axis = plt.gca()
221 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], color=[
222 | 'white', 'grey', 'grey'], hatch="//", edgecolor='k',
223 | align='edge')
224 | bars[2].set_hatch(r"")
225 | axis.set_yticks(())
226 | axis.set_frame_on(False)
227 | axis.set_ylim(-.1, .8)
228 | axis.set_xlim(-0.1, 20.1)
229 | axis.set_xticks([6, 13.3, 17.5])
230 | axis.set_xticklabels(["training set", "validation set",
231 | "test set"], fontdict={'fontsize': 20})
232 | axis.tick_params(length=0, labeltop=True, labelbottom=False)
233 | axis.text(6, -.3, "Model fitting",
234 | fontdict={'fontsize': 13}, horizontalalignment="center")
235 | axis.text(13.3, -.3, "Parameter selection",
236 | fontdict={'fontsize': 13}, horizontalalignment="center")
237 | axis.text(17.5, -.3, "Evaluation",
238 | fontdict={'fontsize': 13}, horizontalalignment="center")
239 |
--------------------------------------------------------------------------------
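The cross-validation figures above take no arguments and draw onto the current matplotlib figure; a direct call looks like:

```python
import matplotlib.pyplot as plt
from mglearn.plot_cross_validation import (plot_cross_validation,
                                           plot_threefold_split)

plot_cross_validation()  # 5-fold cross-validation diagram
plt.show()
plot_threefold_split()   # training/validation/test split diagram
plt.show()
```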
/mglearn/plot_dbscan.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.cluster import DBSCAN
4 | from sklearn.datasets import make_blobs
5 |
6 | from .plot_helpers import discrete_scatter, cm3
7 |
8 |
9 | def plot_dbscan():
10 | X, y = make_blobs(random_state=0, n_samples=12)
11 |
12 | dbscan = DBSCAN()
13 |     clusters = dbscan.fit_predict(X)
14 |
15 |
16 | fig, axes = plt.subplots(3, 4, figsize=(11, 8),
17 | subplot_kw={'xticks': (), 'yticks': ()})
18 | # Plot clusters as red, green and blue, and outliers (-1) as white
19 | colors = [cm3(1), cm3(0), cm3(2)]
20 | markers = ['o', '^', 'v']
21 |
22 | # iterate over settings of min_samples and eps
23 | for i, min_samples in enumerate([2, 3, 5]):
24 | for j, eps in enumerate([1, 1.5, 2, 3]):
25 | # instantiate DBSCAN with a particular setting
26 | dbscan = DBSCAN(min_samples=min_samples, eps=eps)
27 | # get cluster assignments
28 | clusters = dbscan.fit_predict(X)
29 | print("min_samples: %d eps: %f cluster: %s"
30 | % (min_samples, eps, clusters))
31 | if np.any(clusters == -1):
32 | c = ['w'] + colors
33 | m = ['o'] + markers
34 | else:
35 | c = colors
36 | m = markers
37 | discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c,
38 | s=8, markers=m)
39 | inds = dbscan.core_sample_indices_
40 |             # visualize core samples and clusters
41 | if len(inds):
42 | discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
43 | ax=axes[i, j], s=15, c=colors,
44 | markers=markers)
45 | axes[i, j].set_title("min_samples: %d eps: %.1f"
46 | % (min_samples, eps))
47 | fig.tight_layout()
48 |
--------------------------------------------------------------------------------
/mglearn/plot_decomposition.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from matplotlib.offsetbox import OffsetImage, AnnotationBbox
3 |
4 |
5 | def plot_decomposition(people, pca):
6 | image_shape = people.images[0].shape
7 | plt.figure(figsize=(20, 3))
8 | ax = plt.gca()
9 |
10 | imagebox = OffsetImage(people.images[0], zoom=1.5, cmap="gray")
11 | ab = AnnotationBbox(imagebox, (.05, 0.4), pad=0.0, xycoords='data')
12 | ax.add_artist(ab)
13 |
14 | for i in range(4):
15 | imagebox = OffsetImage(pca.components_[i].reshape(image_shape), zoom=1.5, cmap="viridis")
16 |
17 | ab = AnnotationBbox(imagebox, (.3 + .2 * i, 0.4),
18 | pad=0.0,
19 | xycoords='data'
20 | )
21 | ax.add_artist(ab)
22 | if i == 0:
23 | plt.text(.18, .25, 'x_%d *' % i, fontdict={'fontsize': 50})
24 | else:
25 | plt.text(.15 + .2 * i, .25, '+ x_%d *' % i, fontdict={'fontsize': 50})
26 |
27 | plt.text(.95, .25, '+ ...', fontdict={'fontsize': 50})
28 |
29 | plt.rc('text', usetex=True)
30 | plt.text(.13, .3, r'\approx', fontdict={'fontsize': 50})
31 | plt.axis("off")
32 |
--------------------------------------------------------------------------------
/mglearn/plot_grid_search.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.svm import SVC
4 | from sklearn.model_selection import GridSearchCV, train_test_split
5 | from sklearn.datasets import load_iris
6 | import pandas as pd
7 |
8 |
9 | def plot_cross_val_selection():
10 | iris = load_iris()
11 | X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data,
12 | iris.target,
13 | random_state=0)
14 |
15 | param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
16 | 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
17 | grid_search = GridSearchCV(SVC(), param_grid, cv=5,
18 | return_train_score=True)
19 | grid_search.fit(X_trainval, y_trainval)
20 | results = pd.DataFrame(grid_search.cv_results_)[15:]
21 |
22 | best = np.argmax(results.mean_test_score.values)
23 | plt.figure(figsize=(10, 3))
24 | plt.xlim(-1, len(results))
25 | plt.ylim(0, 1.1)
26 | for i, (_, row) in enumerate(results.iterrows()):
27 | scores = row[['split%d_test_score' % i for i in range(5)]]
28 | marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5,
29 | alpha=.5)
30 | marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1,
31 | markersize=10, markeredgecolor='k')
32 | if i == best:
33 | marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red',
34 | fillstyle="none", alpha=1, markersize=20,
35 | markeredgewidth=3)
36 | plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x
37 | in results['params']],
38 | rotation=90)
39 | plt.ylabel("Validation accuracy")
40 | plt.xlabel("Parameter settings")
41 | plt.legend([marker_cv, marker_mean, marker_best],
42 | ["cv accuracy", "mean accuracy", "best parameter setting"],
43 | loc=(1.05, .4))
44 |
45 |
46 | def plot_grid_search_overview():
47 | plt.figure(figsize=(10, 3), dpi=70)
48 | axes = plt.gca()
49 | axes.yaxis.set_visible(False)
50 | axes.xaxis.set_visible(False)
51 | axes.set_frame_on(False)
52 |
53 | def draw(ax, text, start, target=None):
54 | if target is not None:
55 | patchB = target.get_bbox_patch()
56 | end = target.get_position()
57 | else:
58 | end = start
59 | patchB = None
60 | annotation = ax.annotate(text, end, start, xycoords='axes pixels',
61 | textcoords='axes pixels', size=20,
62 | arrowprops=dict(
63 | arrowstyle="-|>", fc="w", ec="k",
64 | patchB=patchB,
65 | connectionstyle="arc3,rad=0.0"),
66 | bbox=dict(boxstyle="round", fc="w"),
67 | horizontalalignment="center",
68 | verticalalignment="center")
69 | plt.draw()
70 | return annotation
71 |
72 | step = 100
73 | grr = 400
74 |
75 | final_evaluation = draw(axes, "final evaluation", (5 * step, grr - 3 *
76 | step))
77 | retrained_model = draw(axes, "retrained model", (3 * step, grr - 3 * step),
78 | final_evaluation)
79 | best_parameters = draw(axes, "best parameters", (.5 * step, grr - 3 *
80 | step), retrained_model)
81 | cross_validation = draw(axes, "cross-validation", (.5 * step, grr - 2 *
82 | step), best_parameters)
83 | draw(axes, "parameter grid", (0.0, grr - 0), cross_validation)
84 | training_data = draw(axes, "training data", (2 * step, grr - step),
85 | cross_validation)
86 | draw(axes, "training data", (2 * step, grr - step), retrained_model)
87 | test_data = draw(axes, "test data", (5 * step, grr - step),
88 | final_evaluation)
89 | draw(axes, "data set", (3.5 * step, grr - 0.0), training_data)
90 | draw(axes, "data set", (3.5 * step, grr - 0.0), test_data)
91 | plt.ylim(0, 1)
92 | plt.xlim(0, 1.5)
93 |
--------------------------------------------------------------------------------
/mglearn/plot_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib.colors import ListedColormap, colorConverter, LinearSegmentedColormap
5 |
6 |
7 | cm_cycle = ListedColormap(['#0000aa', '#ff5050', '#50ff50', '#9040a0', '#fff000'])
8 | cm3 = ListedColormap(['#0000aa', '#ff2020', '#50ff50'])
9 | cm2 = ListedColormap(['#0000aa', '#ff2020'])
10 |
11 | # create a smooth transition from the first to the second color of cm3
12 | # similar to RdBu but with our red and blue, also not going through white,
13 | # which is really bad for greyscale
14 |
15 | cdict = {'red': [(0.0, 0.0, cm2(0)[0]),
16 | (1.0, cm2(1)[0], 1.0)],
17 |
18 | 'green': [(0.0, 0.0, cm2(0)[1]),
19 | (1.0, cm2(1)[1], 1.0)],
20 |
21 | 'blue': [(0.0, 0.0, cm2(0)[2]),
22 | (1.0, cm2(1)[2], 1.0)]}
23 |
24 | ReBl = LinearSegmentedColormap("ReBl", cdict)
25 |
26 |
27 | def discrete_scatter(x1, x2, y=None, markers=None, s=10, ax=None,
28 | labels=None, padding=.2, alpha=1, c=None, markeredgewidth=None):
29 |     """Adaptation of matplotlib.pyplot.scatter to plot classes or clusters.
30 |
31 | Parameters
32 | ----------
33 |
34 | x1 : nd-array
35 | input data, first axis
36 |
37 | x2 : nd-array
38 | input data, second axis
39 |
40 | y : nd-array
41 | input data, discrete labels
42 |
43 |     c : color or sequence of colors, optional
44 |         Colors to use for the classes, or None to use the default color cycle.
45 |
46 | markers : list of string
47 | List of markers to use, or None (which defaults to 'o').
48 |
49 | s : int or float
50 | Size of the marker
51 |
52 | padding : float
53 | Fraction of the dataset range to use for padding the axes.
54 |
55 | alpha : float
56 | Alpha value for all points.
57 | """
58 | if ax is None:
59 | ax = plt.gca()
60 |
61 | if y is None:
62 | y = np.zeros(len(x1))
63 |
64 | unique_y = np.unique(y)
65 |
66 | if markers is None:
67 | markers = ['o', '^', 'v', 'D', 's', '*', 'p', 'h', 'H', '8', '<', '>'] * 10
68 |
69 | if len(markers) == 1:
70 | markers = markers * len(unique_y)
71 |
72 | if labels is None:
73 | labels = unique_y
74 |
75 | # lines in the matplotlib sense, not actual lines
76 | lines = []
77 |
78 | current_cycler = mpl.rcParams['axes.prop_cycle']
79 |
80 | for i, (yy, cycle) in enumerate(zip(unique_y, current_cycler())):
81 | mask = y == yy
82 | # if c is none, use color cycle
83 | if c is None:
84 | color = cycle['color']
85 | elif len(c) > 1:
86 | color = c[i]
87 | else:
88 | color = c
89 | # use light edge for dark markers
90 | if np.mean(colorConverter.to_rgb(color)) < .4:
91 | markeredgecolor = "grey"
92 | else:
93 | markeredgecolor = "black"
94 |
95 | lines.append(ax.plot(x1[mask], x2[mask], markers[i], markersize=s,
96 | label=labels[i], alpha=alpha, c=color,
97 | markeredgewidth=markeredgewidth,
98 | markeredgecolor=markeredgecolor)[0])
99 |
100 | if padding != 0:
101 | pad1 = x1.std() * padding
102 | pad2 = x2.std() * padding
103 | xlim = ax.get_xlim()
104 | ylim = ax.get_ylim()
105 | ax.set_xlim(min(x1.min() - pad1, xlim[0]), max(x1.max() + pad1, xlim[1]))
106 | ax.set_ylim(min(x2.min() - pad2, ylim[0]), max(x2.max() + pad2, ylim[1]))
107 |
108 | return lines
109 |
--------------------------------------------------------------------------------
/mglearn/plot_improper_preprocessing.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 |
3 |
4 | def make_bracket(s, xy, textxy, width, ax):
5 | annotation = ax.annotate(
6 | s, xy, textxy, ha="center", va="center", size=20,
7 | arrowprops=dict(arrowstyle="-[", fc="w", ec="k",
8 | lw=2,), bbox=dict(boxstyle="square", fc="w"))
9 | annotation.arrow_patch.get_arrowstyle().widthB = width
10 |
11 |
12 | def plot_improper_processing():
13 | fig, axes = plt.subplots(2, 1, figsize=(15, 10))
14 |
15 | for axis in axes:
16 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15],
17 | color=['white', 'grey', 'grey'], hatch="//",
18 | align='edge', edgecolor='k')
19 | bars[2].set_hatch(r"")
20 | axis.set_yticks(())
21 | axis.set_frame_on(False)
22 | axis.set_ylim(-.1, 6)
23 | axis.set_xlim(-0.1, 20.1)
24 | axis.set_xticks(())
25 | axis.tick_params(length=0, labeltop=True, labelbottom=False)
26 | axis.text(6, -.3, "training folds",
27 | fontdict={'fontsize': 14}, horizontalalignment="center")
28 | axis.text(13.5, -.3, "validation fold",
29 | fontdict={'fontsize': 14}, horizontalalignment="center")
30 | axis.text(17.5, -.3, "test set",
31 | fontdict={'fontsize': 14}, horizontalalignment="center")
32 |
33 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[0])
34 | make_bracket("SVC fit", (6, 3), (6, 4), 12, axes[0])
35 | make_bracket("SVC predict", (13.4, 3), (13.4, 4), 2.5, axes[0])
36 |
37 | axes[0].set_title("Cross validation")
38 | axes[1].set_title("Test set prediction")
39 |
40 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[1])
41 | make_bracket("SVC fit", (7.5, 3), (7.5, 4), 15, axes[1])
42 | make_bracket("SVC predict", (17.5, 3), (17.5, 4), 4.8, axes[1])
43 |
44 |
45 | def plot_proper_processing():
46 | fig, axes = plt.subplots(2, 1, figsize=(15, 8))
47 |
48 | for axis in axes:
49 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9],
50 | left=[0, 12, 15], color=['white', 'grey', 'grey'],
51 | hatch="//", align='edge', edgecolor='k')
52 | bars[2].set_hatch(r"")
53 | axis.set_yticks(())
54 | axis.set_frame_on(False)
55 | axis.set_ylim(-.1, 4.5)
56 | axis.set_xlim(-0.1, 20.1)
57 | axis.set_xticks(())
58 | axis.tick_params(length=0, labeltop=True, labelbottom=False)
59 | axis.text(6, -.3, "training folds", fontdict={'fontsize': 14},
60 | horizontalalignment="center")
61 | axis.text(13.5, -.3, "validation fold", fontdict={'fontsize': 14},
62 | horizontalalignment="center")
63 | axis.text(17.5, -.3, "test set", fontdict={'fontsize': 14},
64 | horizontalalignment="center")
65 |
66 | make_bracket("scaler fit", (6, 1.3), (6, 2.), 12, axes[0])
67 | make_bracket("SVC fit", (6, 3), (6, 4), 12, axes[0])
68 | make_bracket("SVC predict", (13.4, 3), (13.4, 4), 2.5, axes[0])
69 |
70 | axes[0].set_title("Cross validation")
71 | axes[1].set_title("Test set prediction")
72 |
73 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[1])
74 | make_bracket("SVC fit", (7.5, 3), (7.5, 4), 15, axes[1])
75 | make_bracket("SVC predict", (17.5, 3), (17.5, 4), 4.8, axes[1])
76 | fig.subplots_adjust(hspace=.3)
77 |
--------------------------------------------------------------------------------
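plot_improper_processing and plot_proper_processing only draw schematics; the point they illustrate is that the scaler must be refit on the training folds of each split, never on data used for validation. A minimal sketch of the proper setup, assuming a generic blob dataset:

    from sklearn.datasets import make_blobs
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.svm import SVC

    X, y = make_blobs(centers=2, random_state=0)
    # the pipeline refits the scaler on the training folds of each split
    pipe = make_pipeline(MinMaxScaler(), SVC())
    print(cross_val_score(pipe, X, y, cv=5).mean())
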
/mglearn/plot_interactive_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.tree import DecisionTreeClassifier
5 |
6 | from io import StringIO
7 | from sklearn.tree import export_graphviz
8 | from imageio import imread
9 | from scipy import ndimage
10 | from sklearn.datasets import make_moons
11 |
12 | import re
13 |
14 | from .tools import discrete_scatter
15 | from .plot_helpers import cm2
16 |
17 |
18 | def tree_image(tree, fout=None):
19 | try:
20 | import graphviz
21 | except ImportError:
22 | # make a hacky white plot
23 | x = np.ones((10, 10))
24 | x[0, 0] = 0
25 | return x
26 | dot_data = StringIO()
27 | export_graphviz(tree, out_file=dot_data, max_depth=3, impurity=False)
28 | data = dot_data.getvalue()
29 | data = re.sub(r"samples = [0-9]+\\n", "", data)
30 | data = re.sub(r"\\nsamples = [0-9]+", "", data)
31 | data = re.sub(r"value", "counts", data)
32 |
33 | graph = graphviz.Source(data, format="png")
34 | if fout is None:
35 | fout = "tmp"
36 | graph.render(fout)
37 | return imread(fout + ".png")
38 |
39 |
40 | def plot_tree_progressive():
41 | X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
42 | plt.figure()
43 | ax = plt.gca()
44 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
45 | ax.set_xlabel("Feature 0")
46 | ax.set_ylabel("Feature 1")
47 | plt.legend(["Class 0", "Class 1"], loc='best')
48 |
49 | axes = []
50 | for i in range(3):
51 | fig, ax = plt.subplots(1, 2, figsize=(12, 4),
52 | subplot_kw={'xticks': (), 'yticks': ()})
53 | axes.append(ax)
54 | axes = np.array(axes)
55 |
56 | for i, max_depth in enumerate([1, 2, 9]):
57 | tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i, 0])
58 | axes[i, 1].imshow(tree_image(tree))
59 | axes[i, 1].set_axis_off()
60 |
61 |
62 | def plot_tree_partition(X, y, tree, ax=None):
63 | if ax is None:
64 | ax = plt.gca()
65 | eps = X.std() / 2.
66 |
67 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
68 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
69 | xx = np.linspace(x_min, x_max, 1000)
70 | yy = np.linspace(y_min, y_max, 1000)
71 |
72 | X1, X2 = np.meshgrid(xx, yy)
73 | X_grid = np.c_[X1.ravel(), X2.ravel()]
74 |
75 | Z = tree.predict(X_grid)
76 | Z = Z.reshape(X1.shape)
77 | faces = tree.apply(X_grid)
78 | faces = faces.reshape(X1.shape)
79 | border = ndimage.laplace(faces) != 0
80 | ax.contourf(X1, X2, Z, alpha=.4, cmap=cm2, levels=[0, .5, 1])
81 | ax.scatter(X1[border], X2[border], marker='.', s=1)
82 |
83 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
84 | ax.set_xlim(x_min, x_max)
85 | ax.set_ylim(y_min, y_max)
86 | ax.set_xticks(())
87 | ax.set_yticks(())
88 | return ax
89 |
90 |
91 | def plot_tree(X, y, max_depth=1, ax=None):
92 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0).fit(X, y)
93 | ax = plot_tree_partition(X, y, tree, ax=ax)
94 | ax.set_title("depth = %d" % max_depth)
95 | return tree
96 |
--------------------------------------------------------------------------------
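plot_tree_progressive draws partitions for depths 1, 2, and 9; the effect it illustrates can be reproduced without the plotting machinery. A sketch on the same two-moons data:

    from sklearn.datasets import make_moons
    from sklearn.tree import DecisionTreeClassifier

    X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
    for max_depth in [1, 2, 9]:
        tree = DecisionTreeClassifier(max_depth=max_depth,
                                      random_state=0).fit(X, y)
        # deeper trees fit the training set progressively more closely
        print(max_depth, tree.score(X, y))
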
/mglearn/plot_kmeans.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from sklearn.datasets import make_blobs
4 | from sklearn.cluster import KMeans
5 | from sklearn.metrics import pairwise_distances
6 | import matplotlib.pyplot as plt
7 | import matplotlib as mpl
8 | from cycler import cycler
9 |
10 | from .tools import discrete_scatter
11 | from .plot_2d_separator import plot_2d_classification
12 | from .plot_helpers import cm3
13 |
14 |
15 | def plot_kmeans_algorithm():
16 |
17 | X, y = make_blobs(random_state=1)
18 | # we don't want cyan in there
19 | with mpl.rc_context(rc={'axes.prop_cycle': cycler('color', ['#0000aa',
20 | '#ff2020',
21 | '#50ff50'])}):
22 | fig, axes = plt.subplots(3, 3, figsize=(10, 8), subplot_kw={'xticks': (), 'yticks': ()})
23 | axes = axes.ravel()
24 | axes[0].set_title("Input data")
25 | discrete_scatter(X[:, 0], X[:, 1], ax=axes[0], markers=['o'], c='w')
26 |
27 | axes[1].set_title("Initialization")
28 | init = X[:3, :]
29 | discrete_scatter(X[:, 0], X[:, 1], ax=axes[1], markers=['o'], c='w')
30 | discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2], ax=axes[1],
31 | markers=['^'], markeredgewidth=2)
32 |
33 | axes[2].set_title("Assign Points (1)")
34 | km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X)
35 | centers = km.cluster_centers_
36 |         # need to compute labels by hand; scikit-learn runs two E-steps when max_iter=1
37 | # (and it's totally my fault)
38 | labels = np.argmin(pairwise_distances(init, X), axis=0)
39 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
40 | ax=axes[2])
41 | discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2],
42 | ax=axes[2], markers=['^'], markeredgewidth=2)
43 |
44 | axes[3].set_title("Recompute Centers (1)")
45 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
46 | ax=axes[3])
47 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
48 | ax=axes[3], markers=['^'], markeredgewidth=2)
49 |
50 | axes[4].set_title("Reassign Points (2)")
51 | km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X)
52 | labels = km.labels_
53 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
54 | ax=axes[4])
55 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
56 | ax=axes[4], markers=['^'], markeredgewidth=2)
57 |
58 | km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X)
59 | axes[5].set_title("Recompute Centers (2)")
60 | centers = km.cluster_centers_
61 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
62 | ax=axes[5])
63 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
64 | ax=axes[5], markers=['^'], markeredgewidth=2)
65 |
66 | axes[6].set_title("Reassign Points (3)")
67 | labels = km.labels_
68 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
69 | ax=axes[6])
70 | markers = discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
71 | ax=axes[6], markers=['^'],
72 | markeredgewidth=2)
73 |
74 | axes[7].set_title("Recompute Centers (3)")
75 | km = KMeans(n_clusters=3, init=init, max_iter=3, n_init=1).fit(X)
76 | centers = km.cluster_centers_
77 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'],
78 | ax=axes[7])
79 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
80 | ax=axes[7], markers=['^'], markeredgewidth=2)
81 | axes[8].set_axis_off()
82 | axes[8].legend(markers, ["Cluster 0", "Cluster 1", "Cluster 2"], loc='best')
83 |
84 |
85 | def plot_kmeans_boundaries():
86 | X, y = make_blobs(random_state=1)
87 | init = X[:3, :]
88 | km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X)
89 | discrete_scatter(X[:, 0], X[:, 1], km.labels_, markers=['o'])
90 | discrete_scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
91 | [0, 1, 2], markers=['^'], markeredgewidth=2)
92 | plot_2d_classification(km, X, cm=cm3, alpha=.4)
93 |
94 |
95 | def plot_kmeans_faces(km, pca, X_pca, X_people, y_people, target_names):
96 | n_clusters = 10
97 | image_shape = (87, 65)
98 | fig, axes = plt.subplots(n_clusters, 11, subplot_kw={'xticks': (), 'yticks': ()},
99 | figsize=(10, 15), gridspec_kw={"hspace": .3})
100 |
101 | for cluster in range(n_clusters):
102 | center = km.cluster_centers_[cluster]
103 | mask = km.labels_ == cluster
104 | dists = np.sum((X_pca - center) ** 2, axis=1)
105 | dists[~mask] = np.inf
106 | inds = np.argsort(dists)[:5]
107 | dists[~mask] = -np.inf
108 | inds = np.r_[inds, np.argsort(dists)[-5:]]
109 | axes[cluster, 0].imshow(pca.inverse_transform(center).reshape(image_shape), vmin=0, vmax=1)
110 |         for image, label, _, ax in zip(X_people[inds], y_people[inds],
111 | km.labels_[inds], axes[cluster, 1:]):
112 | ax.imshow(image.reshape(image_shape), vmin=0, vmax=1)
113 | ax.set_title("%s" % (target_names[label].split()[-1]), fontdict={'fontsize': 9})
114 |
115 | # add some boxes to illustrate which are similar and which dissimilar
116 | rec = plt.Rectangle([-5, -30], 73, 1295, fill=False, lw=2)
117 | rec = axes[0, 0].add_patch(rec)
118 | rec.set_clip_on(False)
119 | axes[0, 0].text(0, -40, "Center")
120 |
121 | rec = plt.Rectangle([-5, -30], 385, 1295, fill=False, lw=2)
122 | rec = axes[0, 1].add_patch(rec)
123 | rec.set_clip_on(False)
124 | axes[0, 1].text(0, -40, "Close to center")
125 |
126 | rec = plt.Rectangle([-5, -30], 385, 1295, fill=False, lw=2)
127 | rec = axes[0, 6].add_patch(rec)
128 | rec.set_clip_on(False)
129 | axes[0, 6].text(0, -40, "Far from center")
130 |
--------------------------------------------------------------------------------
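The panels produced by plot_kmeans_algorithm alternate between assigning points and recomputing centers. A sketch of that loop written directly in NumPy, with the same initialization from the first three points (three iterations chosen to match the figure):

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.metrics import pairwise_distances

    X, _ = make_blobs(random_state=1)
    centers = X[:3].copy()
    for _ in range(3):
        # assign each point to its closest center
        labels = np.argmin(pairwise_distances(X, centers), axis=1)
        # recompute each center as the mean of its assigned points
        centers = np.array([X[labels == k].mean(axis=0) for k in range(3)])
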
/mglearn/plot_kneighbors_regularization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.neighbors import KNeighborsRegressor
5 |
6 |
7 | def plot_kneighbors_regularization():
8 | rnd = np.random.RandomState(42)
9 | x = np.linspace(-3, 3, 100)
10 | y_no_noise = np.sin(4 * x) + x
11 | y = y_no_noise + rnd.normal(size=len(x))
12 | X = x[:, np.newaxis]
13 | fig, axes = plt.subplots(1, 3, figsize=(15, 5))
14 |
15 | x_test = np.linspace(-3, 3, 1000)
16 |
17 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()):
18 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors)
19 | kneighbor_regression.fit(X, y)
20 | ax.plot(x, y_no_noise, label="true function")
21 | ax.plot(x, y, "o", label="data")
22 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]),
23 | label="prediction")
24 | ax.legend()
25 | ax.set_title("n_neighbors = %d" % n_neighbors)
--------------------------------------------------------------------------------
/mglearn/plot_knn_classification.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.metrics import euclidean_distances
5 | from sklearn.neighbors import KNeighborsClassifier
6 |
7 | from .datasets import make_forge
8 | from .plot_helpers import discrete_scatter
9 |
10 |
11 | def plot_knn_classification(n_neighbors=1):
12 | X, y = make_forge()
13 |
14 | X_test = np.array([[8.2, 3.66214339], [9.9, 3.2], [11.2, .5]])
15 | dist = euclidean_distances(X, X_test)
16 | closest = np.argsort(dist, axis=0)
17 |
18 | for x, neighbors in zip(X_test, closest.T):
19 | for neighbor in neighbors[:n_neighbors]:
20 | plt.arrow(x[0], x[1], X[neighbor, 0] - x[0],
21 | X[neighbor, 1] - x[1], head_width=0, fc='k', ec='k')
22 |
23 | clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
24 | test_points = discrete_scatter(X_test[:, 0], X_test[:, 1], clf.predict(X_test), markers="*")
25 | training_points = discrete_scatter(X[:, 0], X[:, 1], y)
26 | plt.legend(training_points + test_points, ["training class 0", "training class 1",
27 | "test pred 0", "test pred 1"])
28 |
--------------------------------------------------------------------------------
/mglearn/plot_knn_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.neighbors import KNeighborsRegressor
5 | from sklearn.metrics import euclidean_distances
6 |
7 | from .datasets import make_wave
8 | from .plot_helpers import cm3
9 |
10 |
11 | def plot_knn_regression(n_neighbors=1):
12 | X, y = make_wave(n_samples=40)
13 | X_test = np.array([[-1.5], [0.9], [1.5]])
14 |
15 | dist = euclidean_distances(X, X_test)
16 | closest = np.argsort(dist, axis=0)
17 |
18 | plt.figure(figsize=(10, 6))
19 |
20 | reg = KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y)
21 | y_pred = reg.predict(X_test)
22 |
23 | for x, y_, neighbors in zip(X_test, y_pred, closest.T):
24 | for neighbor in neighbors[:n_neighbors]:
25 | plt.arrow(x[0], y_, X[neighbor, 0] - x[0], y[neighbor] - y_,
26 | head_width=0, fc='k', ec='k')
27 |
28 | train, = plt.plot(X, y, 'o', c=cm3(0))
29 | test, = plt.plot(X_test, -3 * np.ones(len(X_test)), '*', c=cm3(2),
30 | markersize=20)
31 | pred, = plt.plot(X_test, y_pred, '*', c=cm3(0), markersize=20)
32 | plt.vlines(X_test, -3.1, 3.1, linestyle="--")
33 | plt.legend([train, test, pred],
34 | ["training data/target", "test data", "test prediction"],
35 | ncol=3, loc=(.1, 1.025))
36 | plt.ylim(-3.1, 3.1)
37 | plt.xlabel("Feature")
38 | plt.ylabel("Target")
39 |
--------------------------------------------------------------------------------
/mglearn/plot_linear_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.linear_model import LinearRegression
5 | from sklearn.model_selection import train_test_split
6 | from .datasets import make_wave
7 | from .plot_helpers import cm2
8 |
9 |
10 | def plot_linear_regression_wave():
11 | X, y = make_wave(n_samples=60)
12 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
13 |
14 | line = np.linspace(-3, 3, 100).reshape(-1, 1)
15 |
16 | lr = LinearRegression().fit(X_train, y_train)
17 | print("w[0]: %f b: %f" % (lr.coef_[0], lr.intercept_))
18 |
19 | plt.figure(figsize=(8, 8))
20 | plt.plot(line, lr.predict(line))
21 | plt.plot(X, y, 'o', c=cm2(0))
22 | ax = plt.gca()
23 | ax.spines['left'].set_position('center')
24 | ax.spines['right'].set_color('none')
25 | ax.spines['bottom'].set_position('center')
26 | ax.spines['top'].set_color('none')
27 | ax.set_ylim(-3, 3)
28 | #ax.set_xlabel("Feature")
29 | #ax.set_ylabel("Target")
30 | ax.legend(["model", "training data"], loc="best")
31 | ax.grid(True)
32 | ax.set_aspect('equal')
33 |
--------------------------------------------------------------------------------
/mglearn/plot_linear_svc_regularization.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.svm import LinearSVC
4 | from sklearn.datasets import make_blobs
5 |
6 | from .plot_helpers import discrete_scatter
7 |
8 |
9 | def plot_linear_svc_regularization():
10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
11 | fig, axes = plt.subplots(1, 3, figsize=(12, 4))
12 |
13 | # a carefully hand-designed dataset lol
14 | y[7] = 0
15 | y[27] = 0
16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
18 |
19 | for ax, C in zip(axes, [1e-2, 10, 1e3]):
20 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
21 |
22 | svm = LinearSVC(C=C, tol=0.00001, dual=False).fit(X, y)
23 | w = svm.coef_[0]
24 | a = -w[0] / w[1]
25 | xx = np.linspace(6, 13)
26 | yy = a * xx - (svm.intercept_[0]) / w[1]
27 | ax.plot(xx, yy, c='k')
28 | ax.set_xlim(x_min, x_max)
29 | ax.set_ylim(y_min, y_max)
30 | ax.set_xticks(())
31 | ax.set_yticks(())
32 | ax.set_title("C = %f" % C)
33 | axes[0].legend(loc="best")
34 |
--------------------------------------------------------------------------------
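The line drawn in each panel comes from solving w[0]*x0 + w[1]*x1 + b = 0 for x1, i.e. x1 = -(w[0]*x0 + b) / w[1]. A sketch recovering the boundary outside the plotting code (C=10 is an arbitrary choice):

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.svm import LinearSVC

    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    svm = LinearSVC(C=10, dual=False).fit(X, y)
    w, b = svm.coef_[0], svm.intercept_[0]
    xx = np.linspace(X[:, 0].min(), X[:, 0].max())
    yy = -(w[0] * xx + b) / w[1]  # points where the decision function is zero
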
/mglearn/plot_metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from .tools import plot_2d_separator, plot_2d_scores, cm, discrete_scatter
5 | from .plot_helpers import ReBl
6 |
7 |
8 | def plot_confusion_matrix_illustration():
9 | plt.figure(figsize=(8, 8))
10 | confusion = np.array([[401, 2], [8, 39]])
11 | plt.text(0.40, .7, confusion[0, 0], size=70, horizontalalignment='right')
12 | plt.text(0.40, .2, confusion[1, 0], size=70, horizontalalignment='right')
13 | plt.text(.90, .7, confusion[0, 1], size=70, horizontalalignment='right')
14 | plt.text(.90, 0.2, confusion[1, 1], size=70, horizontalalignment='right')
15 | plt.xticks([.25, .75], ["predicted 'not nine'", "predicted 'nine'"], size=20)
16 | plt.yticks([.25, .75], ["true 'nine'", "true 'not nine'"], size=20)
17 | plt.plot([.5, .5], [0, 1], '--', c='k')
18 | plt.plot([0, 1], [.5, .5], '--', c='k')
19 |
20 | plt.xlim(0, 1)
21 | plt.ylim(0, 1)
22 |
23 |
24 | def plot_binary_confusion_matrix():
25 | plt.text(0.45, .6, "TN", size=100, horizontalalignment='right')
26 | plt.text(0.45, .1, "FN", size=100, horizontalalignment='right')
27 | plt.text(.95, .6, "FP", size=100, horizontalalignment='right')
28 | plt.text(.95, 0.1, "TP", size=100, horizontalalignment='right')
29 | plt.xticks([.25, .75], ["predicted negative", "predicted positive"], size=15)
30 | plt.yticks([.25, .75], ["positive class", "negative class"], size=15)
31 | plt.plot([.5, .5], [0, 1], '--', c='k')
32 | plt.plot([0, 1], [.5, .5], '--', c='k')
33 |
34 | plt.xlim(0, 1)
35 | plt.ylim(0, 1)
36 |
37 |
38 | def plot_decision_threshold():
39 | from sklearn.datasets import make_blobs
40 | from sklearn.svm import SVC
41 | from sklearn.model_selection import train_test_split
42 |
43 | X, y = make_blobs(n_samples=(400, 50), cluster_std=[7.0, 2],
44 | random_state=22)
45 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
46 |
47 | fig, axes = plt.subplots(2, 3, figsize=(15, 8), subplot_kw={'xticks': (), 'yticks': ()})
48 | plt.suptitle("decision_threshold")
49 | axes[0, 0].set_title("training data")
50 | discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 0])
51 |
52 | svc = SVC(gamma=.05).fit(X_train, y_train)
53 | axes[0, 1].set_title("decision with threshold 0")
54 | discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 1])
55 | plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
56 | ax=axes[0, 1], cm=ReBl)
57 | plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1])
58 | axes[0, 2].set_title("decision with threshold -0.8")
59 | discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 2])
60 | plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2], threshold=-.8)
61 | plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
62 | ax=axes[0, 2], cm=ReBl)
63 |
64 | axes[1, 0].set_axis_off()
65 |
66 | mask = np.abs(X_train[:, 1] - 7) < 5
67 |     n_masked = np.sum(mask)
68 |
69 | line = np.linspace(X_train.min(), X_train.max(), 100)
70 | axes[1, 1].set_title("Cross-section with threshold 0")
71 | axes[1, 1].plot(line, svc.decision_function(np.c_[line, 10 * np.ones(100)]), c='k')
72 | dec = svc.decision_function(np.c_[line, 10 * np.ones(100)])
73 | contour = (dec > 0).reshape(1, -1).repeat(10, axis=0)
74 | axes[1, 1].contourf(line, np.linspace(-1.5, 1.5, 10), contour, alpha=0.4, cmap=cm)
75 |     discrete_scatter(X_train[mask, 0], np.zeros(n_masked), y_train[mask], ax=axes[1, 1])
76 | axes[1, 1].set_xlim(X_train.min(), X_train.max())
77 | axes[1, 1].set_ylim(-1.5, 1.5)
78 | axes[1, 1].set_xticks(())
79 | axes[1, 1].set_ylabel("Decision value")
80 |
81 | contour2 = (dec > -.8).reshape(1, -1).repeat(10, axis=0)
82 | axes[1, 2].set_title("Cross-section with threshold -0.8")
83 | axes[1, 2].contourf(line, np.linspace(-1.5, 1.5, 10), contour2, alpha=0.4, cmap=cm)
84 |     discrete_scatter(X_train[mask, 0], np.zeros(n_masked), y_train[mask], alpha=.1, ax=axes[1, 2])
85 | axes[1, 2].plot(line, svc.decision_function(np.c_[line, 10 * np.ones(100)]), c='k')
86 | axes[1, 2].set_xlim(X_train.min(), X_train.max())
87 | axes[1, 2].set_ylim(-1.5, 1.5)
88 | axes[1, 2].set_xticks(())
89 | axes[1, 2].set_ylabel("Decision value")
90 | axes[1, 0].legend(['negative class', 'positive class'])
91 |
--------------------------------------------------------------------------------
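The lower panels illustrate that predict is just decision_function compared against zero, and that this threshold can be moved. A sketch on the same data:

    import numpy as np
    from sklearn.datasets import make_blobs
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    X, y = make_blobs(n_samples=(400, 50), cluster_std=[7.0, 2],
                      random_state=22)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    svc = SVC(gamma=.05).fit(X_train, y_train)
    scores = svc.decision_function(X_test)
    print(np.all((scores > 0) == svc.predict(X_test)))  # default threshold is 0
    y_pred_lower = scores > -.8  # lower threshold: more points called positive
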
/mglearn/plot_nmf.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import NMF
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | from joblib import Memory
6 |
7 | try:
8 | memory = Memory(cachedir="cache")
9 | except TypeError:
10 | # joblib.Memory changed its API in 0.12
11 | memory = Memory(location="cache", verbose=0)
12 |
13 |
14 | def plot_nmf_illustration():
15 | rnd = np.random.RandomState(5)
16 | X_ = rnd.normal(size=(300, 2))
17 | # Add 8 to make sure every point lies in the positive part of the space
18 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8
19 |
20 | nmf = NMF(random_state=0)
21 | nmf.fit(X_blob)
22 | X_nmf = nmf.transform(X_blob)
23 |
24 | fig, axes = plt.subplots(1, 2, figsize=(15, 5))
25 |
26 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
27 | s=60, cmap='viridis')
28 | axes[0].set_xlabel("feature 1")
29 | axes[0].set_ylabel("feature 2")
30 | axes[0].set_xlim(0, 12)
31 | axes[0].set_ylim(0, 12)
32 | axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
33 | head_width=.3, color='k')
34 | axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1], width=.1,
35 | head_width=.3, color='k')
36 | axes[0].set_aspect('equal')
37 | axes[0].set_title("NMF with two components")
38 |
39 | # second plot
40 | nmf = NMF(random_state=0, n_components=1)
41 | nmf.fit(X_blob)
42 |
43 | axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0,
44 | s=60, cmap='viridis')
45 | axes[1].set_xlabel("feature 1")
46 | axes[1].set_ylabel("feature 2")
47 | axes[1].set_xlim(0, 12)
48 | axes[1].set_ylim(0, 12)
49 | axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1,
50 | head_width=.3, color='k')
51 |
52 | axes[1].set_aspect('equal')
53 | axes[1].set_title("NMF with one component")
54 |
55 |
56 | @memory.cache
57 | def nmf_faces(X_train, X_test):
58 | # Build NMF models with 10, 50, 100 and 500 components
59 |     # this list will hold the back-transformed test data
60 | reduced_images = []
61 | for n_components in [10, 50, 100, 500]:
62 | # build the NMF model
63 | nmf = NMF(n_components=n_components, random_state=0)
64 | nmf.fit(X_train)
65 |         # transform the test data (it then has n_components dimensions)
66 | X_test_nmf = nmf.transform(X_test)
67 |         # back-transform the transformed test data
68 | # (afterwards it's in the original space again)
69 | X_test_back = np.dot(X_test_nmf, nmf.components_)
70 | reduced_images.append(X_test_back)
71 | return reduced_images
72 |
73 |
74 | def plot_nmf_faces(X_train, X_test, image_shape):
75 | reduced_images = nmf_faces(X_train, X_test)
76 |
77 | # plot the first three images in the test set:
78 | fix, axes = plt.subplots(3, 5, figsize=(15, 12),
79 | subplot_kw={'xticks': (), 'yticks': ()})
80 | for i, ax in enumerate(axes):
81 | # plot original image
82 | ax[0].imshow(X_test[i].reshape(image_shape),
83 | vmin=0, vmax=1)
84 | # plot the four back-transformed images
85 | for a, X_test_back in zip(ax[1:], reduced_images):
86 | a.imshow(X_test_back[i].reshape(image_shape), vmin=0, vmax=1)
87 |
88 | # label the top row
89 | axes[0, 0].set_title("original image")
90 | for ax, n_components in zip(axes[0, 1:], [10, 50, 100, 500]):
91 | ax.set_title("%d components" % n_components)
92 |
--------------------------------------------------------------------------------
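nmf_faces reconstructs via np.dot(X_test_nmf, nmf.components_): NMF factors X ≈ W H, so the back-projection is just the matrix product. A self-contained sketch on non-negative random data (the shapes here are arbitrary assumptions):

    import numpy as np
    from sklearn.decomposition import NMF

    X = np.abs(np.random.RandomState(0).normal(size=(100, 20)))
    nmf = NMF(n_components=5, random_state=0, max_iter=1000).fit(X)
    W = nmf.transform(X)                 # shape (100, 5)
    X_back = np.dot(W, nmf.components_)  # back in the original 20-d space
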
/mglearn/plot_nn_graphs.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | def plot_logistic_regression_graph():
4 | import graphviz
5 | lr_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'},
6 | graph_attr={'rankdir': 'LR', 'splines': 'line'})
7 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0")
8 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2")
9 |
10 | for i in range(4):
11 | inputs.node("x[%d]" % i, labelloc="c")
12 | inputs.body.append('label = "inputs"')
13 | inputs.body.append('color = "white"')
14 |
15 | lr_graph.subgraph(inputs)
16 |
17 | output.body.append('label = "output"')
18 | output.body.append('color = "white"')
19 | output.node("y")
20 |
21 | lr_graph.subgraph(output)
22 |
23 | for i in range(4):
24 | lr_graph.edge("x[%d]" % i, "y", label="w[%d]" % i)
25 | return lr_graph
26 |
27 |
28 | def plot_single_hidden_layer_graph():
29 | import graphviz
30 | nn_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'},
31 | graph_attr={'rankdir': 'LR', 'splines': 'line'})
32 |
33 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0")
34 | hidden = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_1")
35 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2")
36 |
37 | for i in range(4):
38 | inputs.node("x[%d]" % i)
39 |
40 | inputs.body.append('label = "inputs"')
41 | inputs.body.append('color = "white"')
42 |
43 | hidden.body.append('label = "hidden layer"')
44 | hidden.body.append('color = "white"')
45 |
46 | for i in range(3):
47 | hidden.node("h%d" % i, label="h[%d]" % i)
48 |
49 | output.node("y")
50 | output.body.append('label = "output"')
51 | output.body.append('color = "white"')
52 |
53 | nn_graph.subgraph(inputs)
54 | nn_graph.subgraph(hidden)
55 | nn_graph.subgraph(output)
56 |
57 | for i in range(4):
58 | for j in range(3):
59 | nn_graph.edge("x[%d]" % i, "h%d" % j)
60 |
61 | for i in range(3):
62 | nn_graph.edge("h%d" % i, "y")
63 | return nn_graph
64 |
65 |
66 | def plot_two_hidden_layer_graph():
67 | import graphviz
68 | nn_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'},
69 | graph_attr={'rankdir': 'LR', 'splines': 'line'})
70 |
71 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0")
72 | hidden = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_1")
73 | hidden2 = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2")
74 |
75 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_3")
76 |
77 | for i in range(4):
78 | inputs.node("x[%d]" % i)
79 |
80 | inputs.body.append('label = "inputs"')
81 | inputs.body.append('color = "white"')
82 |
83 | for i in range(3):
84 | hidden.node("h1[%d]" % i)
85 |
86 | for i in range(3):
87 | hidden2.node("h2[%d]" % i)
88 |
89 | hidden.body.append('label = "hidden layer 1"')
90 | hidden.body.append('color = "white"')
91 |
92 | hidden2.body.append('label = "hidden layer 2"')
93 | hidden2.body.append('color = "white"')
94 |
95 | output.node("y")
96 | output.body.append('label = "output"')
97 | output.body.append('color = "white"')
98 |
99 | nn_graph.subgraph(inputs)
100 | nn_graph.subgraph(hidden)
101 | nn_graph.subgraph(hidden2)
102 |
103 | nn_graph.subgraph(output)
104 |
105 | for i in range(4):
106 | for j in range(3):
107 | nn_graph.edge("x[%d]" % i, "h1[%d]" % j, label="")
108 |
109 | for i in range(3):
110 | for j in range(3):
111 | nn_graph.edge("h1[%d]" % i, "h2[%d]" % j, label="")
112 |
113 | for i in range(3):
114 | nn_graph.edge("h2[%d]" % i, "y", label="")
115 |
116 | return nn_graph
117 |
--------------------------------------------------------------------------------
/mglearn/plot_pca.py:
--------------------------------------------------------------------------------
1 | from sklearn.decomposition import PCA
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 |
5 | from joblib import Memory
6 |
7 | try:
8 | memory = Memory(cachedir="cache")
9 | except TypeError:
10 | # joblib.Memory changed its API in 0.12
11 | memory = Memory(location="cache", verbose=0)
12 |
13 | def plot_pca_illustration():
14 | rnd = np.random.RandomState(5)
15 | X_ = rnd.normal(size=(300, 2))
16 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)
17 |
18 | pca = PCA()
19 | pca.fit(X_blob)
20 | X_pca = pca.transform(X_blob)
21 |
22 | S = X_pca.std(axis=0)
23 |
24 | fig, axes = plt.subplots(2, 2, figsize=(10, 10))
25 | axes = axes.ravel()
26 |
27 | axes[0].set_title("Original data")
28 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0,
29 | s=60, cmap='viridis')
30 | axes[0].set_xlabel("feature 1")
31 | axes[0].set_ylabel("feature 2")
32 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[0] * pca.components_[0, 0],
33 | S[0] * pca.components_[0, 1], width=.1, head_width=.3,
34 | color='k')
35 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[1] * pca.components_[1, 0],
36 | S[1] * pca.components_[1, 1], width=.1, head_width=.3,
37 | color='k')
38 | axes[0].text(-1.5, -.5, "Component 2", size=14)
39 | axes[0].text(-4, -4, "Component 1", size=14)
40 | axes[0].set_aspect('equal')
41 |
42 | axes[1].set_title("Transformed data")
43 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0,
44 | s=60, cmap='viridis')
45 | axes[1].set_xlabel("First principal component")
46 | axes[1].set_ylabel("Second principal component")
47 | axes[1].set_aspect('equal')
48 | axes[1].set_ylim(-8, 8)
49 |
50 | pca = PCA(n_components=1)
51 | pca.fit(X_blob)
52 | X_inverse = pca.inverse_transform(pca.transform(X_blob))
53 |
54 | axes[2].set_title("Transformed data w/ second component dropped")
55 | axes[2].scatter(X_pca[:, 0], np.zeros(X_pca.shape[0]), c=X_pca[:, 0],
56 | linewidths=0, s=60, cmap='viridis')
57 | axes[2].set_xlabel("First principal component")
58 | axes[2].set_aspect('equal')
59 | axes[2].set_ylim(-8, 8)
60 |
61 | axes[3].set_title("Back-rotation using only first component")
62 | axes[3].scatter(X_inverse[:, 0], X_inverse[:, 1], c=X_pca[:, 0],
63 | linewidths=0, s=60, cmap='viridis')
64 | axes[3].set_xlabel("feature 1")
65 | axes[3].set_ylabel("feature 2")
66 | axes[3].set_aspect('equal')
67 | axes[3].set_xlim(-8, 4)
68 | axes[3].set_ylim(-8, 4)
69 |
70 |
71 | def plot_pca_whitening():
72 | rnd = np.random.RandomState(5)
73 | X_ = rnd.normal(size=(300, 2))
74 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2)
75 |
76 | pca = PCA(whiten=True)
77 | pca.fit(X_blob)
78 | X_pca = pca.transform(X_blob)
79 |
80 | fig, axes = plt.subplots(1, 2, figsize=(10, 10))
81 | axes = axes.ravel()
82 |
83 | axes[0].set_title("Original data")
84 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis')
85 | axes[0].set_xlabel("feature 1")
86 | axes[0].set_ylabel("feature 2")
87 | axes[0].set_aspect('equal')
88 |
89 | axes[1].set_title("Whitened data")
90 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis')
91 | axes[1].set_xlabel("First principal component")
92 | axes[1].set_ylabel("Second principal component")
93 | axes[1].set_aspect('equal')
94 | axes[1].set_xlim(-3, 4)
95 |
96 |
97 | @memory.cache
98 | def pca_faces(X_train, X_test):
99 |     # adapted from nmf_faces; could be refactored to share code
100 |     # Build PCA models with 10, 50, 100, 500 components
101 |     # this list will hold the back-transformed test data
102 | reduced_images = []
103 | for n_components in [10, 50, 100, 500]:
104 |         # build the PCA model
105 | pca = PCA(n_components=n_components)
106 | pca.fit(X_train)
107 |         # transform the test data (it then has n_components dimensions)
108 | X_test_pca = pca.transform(X_test)
109 |         # back-transform the transformed test data
110 | # (afterwards it's in the original space again)
111 | X_test_back = pca.inverse_transform(X_test_pca)
112 | reduced_images.append(X_test_back)
113 | return reduced_images
114 |
115 |
116 | def plot_pca_faces(X_train, X_test, image_shape):
117 | reduced_images = pca_faces(X_train, X_test)
118 |
119 | # plot the first three images in the test set:
120 | fix, axes = plt.subplots(3, 5, figsize=(15, 12),
121 | subplot_kw={'xticks': (), 'yticks': ()})
122 | for i, ax in enumerate(axes):
123 | # plot original image
124 | ax[0].imshow(X_test[i].reshape(image_shape),
125 | vmin=0, vmax=1)
126 | # plot the four back-transformed images
127 | for a, X_test_back in zip(ax[1:], reduced_images):
128 | a.imshow(X_test_back[i].reshape(image_shape), vmin=0, vmax=1)
129 |
130 | # label the top row
131 | axes[0, 0].set_title("original image")
132 | for ax, n_components in zip(axes[0, 1:], [10, 50, 100, 500]):
133 | ax.set_title("%d components" % n_components)
134 |
--------------------------------------------------------------------------------
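pca_faces uses pca.inverse_transform for the same back-projection; for an unwhitened PCA it is equivalent to rotating back with the components and re-adding the mean. A sketch verifying that on random stand-in data:

    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.RandomState(0).normal(size=(100, 20))
    pca = PCA(n_components=5).fit(X)
    X_back = pca.inverse_transform(pca.transform(X))
    manual = np.dot(pca.transform(X), pca.components_) + pca.mean_
    print(np.allclose(X_back, manual))  # True for whiten=False
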
/mglearn/plot_rbf_svm_parameters.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from sklearn.svm import SVC
3 | from .plot_2d_separator import plot_2d_separator
4 | from .tools import make_handcrafted_dataset
5 | from .plot_helpers import discrete_scatter
6 |
7 |
8 | def plot_svm(log_C, log_gamma, ax=None):
9 | X, y = make_handcrafted_dataset()
10 | C = 10. ** log_C
11 | gamma = 10. ** log_gamma
12 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y)
13 | if ax is None:
14 | ax = plt.gca()
15 | plot_2d_separator(svm, X, ax=ax, eps=.5)
16 | # plot data
17 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
18 | # plot support vectors
19 | sv = svm.support_vectors_
20 | # class labels of support vectors are given by the sign of the dual coefficients
21 | sv_labels = svm.dual_coef_.ravel() > 0
22 | discrete_scatter(sv[:, 0], sv[:, 1], sv_labels, s=15, markeredgewidth=3, ax=ax)
23 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma))
24 |
25 |
26 | def plot_svm_interactive():
27 |     from ipywidgets import interactive, FloatSlider  # IPython.html.widgets moved to ipywidgets
28 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False)
29 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False)
30 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider)
31 |
--------------------------------------------------------------------------------
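plot_svm_interactive is meant for notebook use; its sliders vary C and gamma on a log scale and redraw plot_svm. A usage sketch (the argument values are arbitrary):

    import mglearn

    # static single panel: C = 10**0, gamma = 10**-1
    mglearn.plots.plot_svm(log_C=0, log_gamma=-1)
    # in a Jupyter notebook, the interactive version with sliders:
    # mglearn.plot_rbf_svm_parameters.plot_svm_interactive()
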
/mglearn/plot_ridge.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 | from sklearn.linear_model import Ridge, LinearRegression
5 | from sklearn.model_selection import learning_curve, KFold
6 |
7 | from .datasets import load_extended_boston
8 |
9 |
10 | def plot_learning_curve(est, X, y):
11 | training_set_size, train_scores, test_scores = learning_curve(
12 | est, X, y, train_sizes=np.linspace(.1, 1, 20), cv=KFold(20, shuffle=True, random_state=1))
13 | estimator_name = est.__class__.__name__
14 | line = plt.plot(training_set_size, train_scores.mean(axis=1), '--',
15 | label="training " + estimator_name)
16 | plt.plot(training_set_size, test_scores.mean(axis=1), '-',
17 | label="test " + estimator_name, c=line[0].get_color())
18 | plt.xlabel('Training set size')
19 | plt.ylabel('Score (R^2)')
20 | plt.ylim(0, 1.1)
21 |
22 |
23 | def plot_ridge_n_samples():
24 | X, y = load_extended_boston()
25 |
26 | plot_learning_curve(Ridge(alpha=1), X, y)
27 | plot_learning_curve(LinearRegression(), X, y)
28 | plt.legend(loc=(0, 1.05), ncol=2, fontsize=11)
29 |
--------------------------------------------------------------------------------
/mglearn/plot_scaling.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.datasets import make_blobs
4 | from sklearn.preprocessing import (StandardScaler, MinMaxScaler, Normalizer,
5 | RobustScaler)
6 | from .plot_helpers import cm2
7 |
8 |
9 | def plot_scaling():
10 | X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1)
11 | X += 3
12 |
13 | plt.figure(figsize=(15, 8))
14 | main_ax = plt.subplot2grid((2, 4), (0, 0), rowspan=2, colspan=2)
15 |
16 | main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm2, s=60)
17 | maxx = np.abs(X[:, 0]).max()
18 | maxy = np.abs(X[:, 1]).max()
19 |
20 | main_ax.set_xlim(-maxx + 1, maxx + 1)
21 | main_ax.set_ylim(-maxy + 1, maxy + 1)
22 | main_ax.set_title("Original Data")
23 | other_axes = [plt.subplot2grid((2, 4), (i, j))
24 | for j in range(2, 4) for i in range(2)]
25 |
26 | for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(),
27 | MinMaxScaler(), Normalizer(norm='l2')]):
28 | X_ = scaler.fit_transform(X)
29 | ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=cm2, s=60)
30 | ax.set_xlim(-2, 2)
31 | ax.set_ylim(-2, 2)
32 | ax.set_title(type(scaler).__name__)
33 |
34 | other_axes.append(main_ax)
35 |
36 | for ax in other_axes:
37 | ax.spines['left'].set_position('center')
38 | ax.spines['right'].set_color('none')
39 | ax.spines['bottom'].set_position('center')
40 | ax.spines['top'].set_color('none')
41 | ax.xaxis.set_ticks_position('bottom')
42 | ax.yaxis.set_ticks_position('left')
43 |
--------------------------------------------------------------------------------
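plot_scaling compares four scalers on the same blobs; the transformations themselves are simple per-feature formulas. A sketch for two of them on a tiny assumed array:

    import numpy as np
    from sklearn.preprocessing import StandardScaler, MinMaxScaler

    X = np.array([[1., 10.], [2., 20.], [3., 60.]])
    print(StandardScaler().fit_transform(X))  # (X - mean) / std, per column
    # MinMaxScaler maps each column to [0, 1]:
    print(np.allclose(MinMaxScaler().fit_transform(X),
                      (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))))
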
/mglearn/plot_tree_nonmonotonous.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from sklearn.datasets import make_blobs
3 | from sklearn.tree import DecisionTreeClassifier, export_graphviz
4 | from .tools import discrete_scatter
5 | from .plot_2d_separator import plot_2d_separator
6 |
7 |
8 | def plot_tree_not_monotone():
9 | import graphviz
10 | # make a simple 2d dataset
11 | X, y = make_blobs(centers=4, random_state=8)
12 | y = y % 2
13 | plt.figure()
14 | discrete_scatter(X[:, 0], X[:, 1], y)
15 | plt.legend(["Class 0", "Class 1"], loc="best")
16 |
17 | # learn a decision tree model
18 | tree = DecisionTreeClassifier(random_state=0).fit(X, y)
19 | plot_2d_separator(tree, X, linestyle="dashed")
20 |
21 | # visualize the tree
22 | export_graphviz(tree, out_file="mytree.dot", impurity=False, filled=True)
23 | with open("mytree.dot") as f:
24 | dot_graph = f.read()
25 | print("Feature importances: %s" % tree.feature_importances_)
26 | return graphviz.Source(dot_graph)
27 |
--------------------------------------------------------------------------------
/mglearn/plots.py:
--------------------------------------------------------------------------------
1 | from .plot_linear_svc_regularization import plot_linear_svc_regularization
2 | from .plot_interactive_tree import plot_tree_progressive, plot_tree_partition
3 | from .plot_animal_tree import plot_animal_tree
4 | from .plot_rbf_svm_parameters import plot_svm
5 | from .plot_knn_regression import plot_knn_regression
6 | from .plot_knn_classification import plot_knn_classification
7 | from .plot_2d_separator import plot_2d_classification, plot_2d_separator
8 | from .plot_nn_graphs import (plot_logistic_regression_graph,
9 | plot_single_hidden_layer_graph,
10 | plot_two_hidden_layer_graph)
11 | from .plot_linear_regression import plot_linear_regression_wave
12 | from .plot_tree_nonmonotonous import plot_tree_not_monotone
13 | from .plot_scaling import plot_scaling
14 | from .plot_pca import plot_pca_illustration, plot_pca_whitening, plot_pca_faces
15 | from .plot_decomposition import plot_decomposition
16 | from .plot_nmf import plot_nmf_illustration, plot_nmf_faces
17 | from .plot_helpers import cm2, cm3
18 | from .plot_agglomerative import plot_agglomerative, plot_agglomerative_algorithm
19 | from .plot_kmeans import plot_kmeans_algorithm, plot_kmeans_boundaries, plot_kmeans_faces
20 | from .plot_improper_preprocessing import plot_improper_processing, plot_proper_processing
21 | from .plot_cross_validation import (plot_threefold_split, plot_group_kfold,
22 | plot_shuffle_split, plot_cross_validation,
23 | plot_stratified_cross_validation)
24 |
25 | from .plot_grid_search import plot_grid_search_overview, plot_cross_val_selection
26 | from .plot_metrics import (plot_confusion_matrix_illustration,
27 | plot_binary_confusion_matrix,
28 | plot_decision_threshold)
29 | from .plot_dbscan import plot_dbscan
30 | from .plot_ridge import plot_ridge_n_samples
31 | from .plot_kneighbors_regularization import plot_kneighbors_regularization
32 |
33 | __all__ = ['plot_linear_svc_regularization',
34 | "plot_animal_tree", "plot_tree_progressive",
35 | 'plot_tree_partition', 'plot_svm',
36 | 'plot_knn_regression',
37 | 'plot_logistic_regression_graph',
38 | 'plot_single_hidden_layer_graph',
39 | 'plot_two_hidden_layer_graph',
40 | 'plot_2d_classification',
41 | 'plot_2d_separator',
42 | 'plot_knn_classification',
43 | 'plot_linear_regression_wave',
44 | 'plot_tree_not_monotone',
45 | 'plot_scaling',
46 | 'plot_pca_illustration',
47 | 'plot_pca_faces',
48 | 'plot_pca_whitening',
49 | 'plot_decomposition',
50 | 'plot_nmf_illustration',
51 | 'plot_nmf_faces',
52 | 'plot_agglomerative',
53 | 'plot_agglomerative_algorithm',
54 | 'plot_kmeans_boundaries',
55 | 'plot_kmeans_algorithm',
56 | 'plot_kmeans_faces',
57 | 'cm3', 'cm2', 'plot_improper_processing', 'plot_proper_processing',
58 | 'plot_group_kfold',
59 | 'plot_shuffle_split',
60 | 'plot_stratified_cross_validation',
61 | 'plot_threefold_split',
62 | 'plot_cross_validation',
63 | 'plot_grid_search_overview',
64 | 'plot_cross_val_selection',
65 | 'plot_confusion_matrix_illustration',
66 | 'plot_binary_confusion_matrix',
67 | 'plot_decision_threshold',
68 | 'plot_dbscan',
69 | 'plot_ridge_n_samples',
70 | 'plot_kneighbors_regularization'
71 | ]
72 |
--------------------------------------------------------------------------------
/mglearn/tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.datasets import make_blobs
3 | from sklearn.tree import export_graphviz
4 | import matplotlib.pyplot as plt
5 | from .plot_2d_separator import (plot_2d_separator, plot_2d_classification,
6 | plot_2d_scores)
7 | from .plot_helpers import cm2 as cm, discrete_scatter
8 |
9 |
10 | def visualize_coefficients(coefficients, feature_names, n_top_features=25):
11 | """Visualize coefficients of a linear model.
12 |
13 | Parameters
14 | ----------
15 | coefficients : nd-array, shape (n_features,)
16 | Model coefficients.
17 |
18 | feature_names : list or nd-array of strings, shape (n_features,)
19 | Feature names for labeling the coefficients.
20 |
21 | n_top_features : int, default=25
22 | How many features to show. The function will show the largest (most
23 | positive) and smallest (most negative) n_top_features coefficients,
24 | for a total of 2 * n_top_features coefficients.
25 | """
26 | coefficients = coefficients.squeeze()
27 | if coefficients.ndim > 1:
28 | # this is not a row or column vector
29 | raise ValueError("coeffients must be 1d array or column vector, got"
30 | " shape {}".format(coefficients.shape))
31 | coefficients = coefficients.ravel()
32 |
33 | if len(coefficients) != len(feature_names):
34 | raise ValueError("Number of coefficients {} doesn't match number of"
35 | "feature names {}.".format(len(coefficients),
36 | len(feature_names)))
37 | # get coefficients with large absolute values
38 | coef = coefficients.ravel()
39 | positive_coefficients = np.argsort(coef)[-n_top_features:]
40 | negative_coefficients = np.argsort(coef)[:n_top_features]
41 | interesting_coefficients = np.hstack([negative_coefficients,
42 | positive_coefficients])
43 | # plot them
44 | plt.figure(figsize=(15, 5))
45 | colors = [cm(1) if c < 0 else cm(0)
46 | for c in coef[interesting_coefficients]]
47 | plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients],
48 | color=colors)
49 | feature_names = np.array(feature_names)
50 | plt.subplots_adjust(bottom=0.3)
51 |     plt.xticks(np.arange(2 * n_top_features),
52 | feature_names[interesting_coefficients], rotation=60,
53 | ha="right")
54 | plt.ylabel("Coefficient magnitude")
55 | plt.xlabel("Feature")
56 |
57 |
58 | def heatmap(values, xlabel, ylabel, xticklabels, yticklabels, cmap=None,
59 | vmin=None, vmax=None, ax=None, fmt="%0.2f"):
60 | if ax is None:
61 | ax = plt.gca()
62 | # plot the mean cross-validation scores
63 | img = ax.pcolor(values, cmap=cmap, vmin=vmin, vmax=vmax)
64 | img.update_scalarmappable()
65 | ax.set_xlabel(xlabel)
66 | ax.set_ylabel(ylabel)
67 | ax.set_xticks(np.arange(len(xticklabels)) + .5)
68 | ax.set_yticks(np.arange(len(yticklabels)) + .5)
69 | ax.set_xticklabels(xticklabels)
70 | ax.set_yticklabels(yticklabels)
71 | ax.set_aspect(1)
72 |
73 | for p, color, value in zip(img.get_paths(), img.get_facecolors(),
74 | img.get_array()):
75 | x, y = p.vertices[:-2, :].mean(0)
76 | if np.mean(color[:3]) > 0.5:
77 | c = 'k'
78 | else:
79 | c = 'w'
80 | ax.text(x, y, fmt % value, color=c, ha="center", va="center")
81 | return img
82 |
83 |
84 | def make_handcrafted_dataset():
85 | # a carefully hand-designed dataset lol
86 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
87 | y[np.array([7, 27])] = 0
88 | mask = np.ones(len(X), dtype=bool)
89 | mask[np.array([0, 1, 5, 26])] = 0
90 | X, y = X[mask], y[mask]
91 | return X, y
92 |
93 |
94 | def print_topics(topics, feature_names, sorting, topics_per_chunk=6,
95 | n_words=20):
96 | for i in range(0, len(topics), topics_per_chunk):
97 | # for each chunk:
98 | these_topics = topics[i: i + topics_per_chunk]
99 |         # maybe we have fewer than topics_per_chunk topics left
100 | len_this_chunk = len(these_topics)
101 | # print topic headers
102 | print(("topic {:<8}" * len_this_chunk).format(*these_topics))
103 | print(("-------- {0:<5}" * len_this_chunk).format(""))
104 | # print top n_words frequent words
105 |         for j in range(n_words):
106 | try:
107 | print(("{:<14}" * len_this_chunk).format(
108 |                     *feature_names[sorting[these_topics, j]]))
109 |             except IndexError:  # some topics have fewer than n_words words
110 | pass
111 | print("\n")
112 |
113 |
114 | def get_tree(tree, **kwargs):
115 | try:
116 | # python3
117 | from io import StringIO
118 | except ImportError:
119 | # python2
120 | from StringIO import StringIO
121 | f = StringIO()
122 | export_graphviz(tree, f, **kwargs)
123 | import graphviz
124 | return graphviz.Source(f.getvalue())
125 |
126 | __all__ = ['plot_2d_separator', 'plot_2d_classification', 'plot_2d_scores',
127 | 'cm', 'visualize_coefficients', 'print_topics', 'heatmap',
128 | 'discrete_scatter']
129 |
--------------------------------------------------------------------------------
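heatmap is typically fed a grid of mean cross-validation scores, as in the grid-search chapter. A minimal usage sketch with made-up scores (the parameter values are hypothetical):

    import numpy as np
    import matplotlib.pyplot as plt
    from mglearn.tools import heatmap

    scores = np.array([[0.80, 0.90, 0.85],   # hypothetical grid-search results
                       [0.70, 0.95, 0.90]])
    heatmap(scores, xlabel="gamma", ylabel="C",
            xticklabels=[0.01, 0.1, 1], yticklabels=[1, 10])
    plt.show()
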
/preamble.py:
--------------------------------------------------------------------------------
1 | from IPython.display import set_matplotlib_formats, display
2 | import pandas as pd
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | import mglearn
6 | from cycler import cycler
7 |
8 | set_matplotlib_formats('pdf', 'png')
9 | plt.rcParams['savefig.dpi'] = 300
10 | plt.rcParams['image.cmap'] = "viridis"
11 | plt.rcParams['image.interpolation'] = "none"
12 | plt.rcParams['savefig.bbox'] = "tight"
13 | plt.rcParams['lines.linewidth'] = 2
14 | plt.rcParams['legend.numpoints'] = 1
15 | plt.rc('axes', prop_cycle=(
16 | cycler('color', mglearn.plot_helpers.cm_cycle.colors) +
17 | cycler('linestyle', ['-', '-', "--", (0, (3, 3)), (0, (1.5, 1.5))])))
18 |
19 | np.set_printoptions(precision=3, suppress=True)
20 |
21 | pd.set_option("display.max_columns", 8)
22 | pd.set_option('display.precision', 2)
23 |
24 | __all__ = ['np', 'mglearn', 'display', 'plt', 'pd']
25 |
--------------------------------------------------------------------------------