├── .gitignore
├── 01-introduction.ipynb
├── 02-supervised-learning.ipynb
├── 03-unsupervised-learning.ipynb
├── 04-representing-data-feature-engineering.ipynb
├── 05-model-evaluation-and-improvement.ipynb
├── 06-algorithm-chains-and-pipelines.ipynb
├── 07-working-with-text-data.ipynb
├── 08-conclusion.ipynb
├── README.md
├── cover.jpg
├── data
│   ├── adult.data
│   ├── citibike.csv
│   └── ram_price.csv
├── environment.yml
├── images
│   ├── 05_gridsearch_overview.png
│   ├── api_table.png
│   ├── bag_of_words.png
│   ├── bag_of_words.svg
│   ├── classifier_comparison.png
│   ├── dendrogram.png
│   ├── iris_petal_sepal.png
│   ├── iris_petal_sepal.svg
│   ├── overfitting_underfitting_cartoon.png
│   ├── overfitting_underfitting_cartoon.svg
│   ├── pipeline.png
│   └── pipeline.svg
├── mglearn
│   ├── __init__.py
│   ├── datasets.py
│   ├── make_blobs.py
│   ├── plot_2d_separator.py
│   ├── plot_agglomerative.py
│   ├── plot_animal_tree.py
│   ├── plot_cross_validation.py
│   ├── plot_dbscan.py
│   ├── plot_decomposition.py
│   ├── plot_grid_search.py
│   ├── plot_helpers.py
│   ├── plot_improper_preprocessing.py
│   ├── plot_interactive_tree.py
│   ├── plot_kmeans.py
│   ├── plot_kneighbors_regularization.py
│   ├── plot_knn_classification.py
│   ├── plot_knn_regression.py
│   ├── plot_linear_regression.py
│   ├── plot_linear_svc_regularization.py
│   ├── plot_metrics.py
│   ├── plot_nmf.py
│   ├── plot_nn_graphs.py
│   ├── plot_pca.py
│   ├── plot_rbf_svm_parameters.py
│   ├── plot_ridge.py
│   ├── plot_scaling.py
│   ├── plot_tree_nonmonotonous.py
│   ├── plots.py
│   └── tools.py
└── preamble.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | .ipynb_checkpoints/
--------------------------------------------------------------------------------
/08-conclusion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {
6 |     "hide_input": false
7 |    },
8 |    "source": [
9 |     "## Outlook\n",
10 |     "### Approaching a machine learning problem\n",
11 |     "### Humans in the loop"
12 |    ]
13 |   },
14 |   {
15 |    "cell_type": "markdown",
16 |    "metadata": {},
17 |    "source": [
18 |     "### From prototype to production"
19 |    ]
20 |   },
21 |   {
22 |    "cell_type": "markdown",
23 |    "metadata": {},
24 |    "source": [
25 |     "### Testing production systems"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "markdown",
30 |    "metadata": {},
31 |    "source": [
32 |     "### Building your own estimator"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 1,
38 |    "metadata": {},
39 |    "outputs": [],
40 |    "source": [
41 |     "from sklearn.base import BaseEstimator, TransformerMixin\n",
42 |     "\n",
43 |     "class MyTransformer(BaseEstimator, TransformerMixin):\n",
44 |     "    def __init__(self, first_parameter=1, second_parameter=2):\n",
45 |     "        # all parameters must be specified in the __init__ function\n",
46 |     "        self.first_parameter = first_parameter\n",
47 |     "        self.second_parameter = second_parameter\n",
48 |     "    \n",
49 |     "    def fit(self, X, y=None):\n",
50 |     "        # fit should only take X and y as parameters\n",
51 |     "        # even if your model is unsupervised, you need to accept a y argument!\n",
52 |     "        \n",
53 |     "        # Model fitting code goes here\n",
54 |     "        print(\"fitting the model right here\")\n",
55 |     "        # fit returns self\n",
56 |     "        return self\n",
57 |     "    \n",
58 |     "    def transform(self, X):\n",
59 |     "        # transform takes as parameter only X\n",
60 |     "        \n",
61 |     "        # apply some transformation to X:\n",
62 |     "        X_transformed = X + 1\n",
63 |     "        return X_transformed"
64 |    ]
65 |   },
66 |   {
67 |    "cell_type": "markdown",
68 |    "metadata": {},
69 |    "source": [
70 |     "### Where to go from here\n",
71 |     "#### Theory\n",
72 |     "#### Other machine learning frameworks and packages\n",
73 |     "#### Ranking, recommender systems, time series, and other kinds of learning\n",
74 |     "#### Probabilistic modeling, inference and probabilistic programming"
75 |    ]
76 |   },
77 |   {
78 |    "cell_type": "markdown",
79 |    "metadata": {},
80 |    "source": [
81 |     "#### Neural Networks"
82 |    ]
83 |   },
84 |   {
85 |    "cell_type": "markdown",
86 |    "metadata": {},
87 |    "source": [
88 |     "#### Scaling to larger datasets"
89 |    ]
90 |   },
91 |   {
92 |    "cell_type": "markdown",
93 |    "metadata": {},
94 |    "source": [
95 |     "#### Honing your skills"
96 |    ]
97 |   },
98 |   {
99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "#### Conclusion"
103 |    ]
104 |   }
105 |  ],
106 |  "metadata": {
107 |   "anaconda-cloud": {},
108 |   "kernelspec": {
109 |    "display_name": "Python [conda env:root] *",
110 |    "language": "python",
111 |    "name": "conda-root-py"
112 |   },
113 |   "language_info": {
114 |    "codemirror_mode": {
115 |     "name": "ipython",
116 |     "version": 3
117 |    },
118 |    "file_extension": ".py",
119 |    "mimetype": "text/x-python",
120 |    "name": "python",
121 |    "nbconvert_exporter": "python",
122 |    "pygments_lexer": "ipython3",
123 |    "version": "3.7.6"
124 |   },
125 |   "toc-autonumbering": false
126 |  },
127 |  "nbformat": 4,
128 |  "nbformat_minor": 4
129 | }
130 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/amueller/introduction_to_ml_with_python/master)
2 | 
3 | # Introduction to Machine Learning with Python
4 | 
5 | This repository holds the code for the book "Introduction to Machine
6 | Learning with Python" by [Andreas Mueller](http://amueller.io) and [Sarah Guido](https://twitter.com/sarah_guido).
7 | You can find details about the book on the [O'Reilly website](http://shop.oreilly.com/product/0636920030515.do).
8 | 
9 | The book requires the current stable version of scikit-learn, that is,
10 | 0.20.0. Most of the book can also be used with previous versions of
11 | scikit-learn, though you need to adjust the imports for everything from the
12 | ``model_selection`` module, mostly ``cross_val_score``, ``train_test_split``
13 | and ``GridSearchCV``.
14 | 
15 | 
16 | This repository provides the notebooks from which the book is created, together
17 | with the ``mglearn`` library of helper functions to create figures and
18 | datasets.
19 | 
20 | For the curious ones, the cover depicts a [hellbender](https://en.wikipedia.org/wiki/Hellbender).
21 | 
22 | All datasets are included in the repository, with the exception of the aclImdb dataset, which you can download from
23 | the page of [Andrew Maas](http://ai.stanford.edu/~amaas/data/sentiment/). See the book for details.
24 | 
25 | If you get ``ImportError: No module named mglearn``, you can try to install mglearn into your Python environment using
26 | the command ``pip install mglearn`` in your terminal or ``!pip install mglearn`` in Jupyter Notebook.
27 | 
28 | 
29 | ## Errata
30 | Please note that the first print of the book is missing the following line when listing the assumed imports:
31 | 
32 | ```python
33 | from IPython.display import display
34 | ```
35 | Please add this line if you see an error involving ``display``.
36 | 
37 | 
38 | The first print of the book used a function called ``plot_group_kfold``.
39 | This has been renamed to ``plot_label_kfold`` because of a rename in
40 | scikit-learn.
41 | 
42 | ## Setup
43 | 
44 | To run the code, you need the packages ``numpy``, ``scipy``, ``scikit-learn``, ``matplotlib``, ``pandas`` and ``pillow``.
45 | Some of the visualizations of decision trees and neural network structures also require ``graphviz``. The chapter
46 | on text processing also requires ``nltk`` and ``spacy``.
47 | 
48 | The easiest way to set up an environment is by installing [Anaconda](https://www.continuum.io/downloads).
49 | 
50 | ### Installing packages with conda
51 | If you already have a Python environment set up, and you are using the ``conda`` package manager, you can get all packages by running
52 | 
53 |     conda install numpy scipy scikit-learn matplotlib pandas pillow graphviz python-graphviz
54 | 
55 | For the chapter on text processing you also need to install ``nltk`` and ``spacy``:
56 | 
57 |     conda install nltk spacy
58 | 
59 | 
60 | ### Installing packages with pip
61 | If you already have a Python environment and are using pip to install packages, you need to run
62 | 
63 |     pip install numpy scipy scikit-learn matplotlib pandas pillow graphviz
64 | 
65 | You also need to install the graphviz C library, which is easiest using a package manager.
66 | If you are using OS X and Homebrew, you can ``brew install graphviz``. If you are on Ubuntu or Debian, you can ``apt-get install graphviz``.
67 | Installing graphviz on Windows can be tricky, so using conda / anaconda is recommended.
68 | For the chapter on text processing you also need to install ``nltk`` and ``spacy``:
69 | 
70 |     pip install nltk spacy
71 | 
72 | ### Downloading the English language model
73 | For the text processing chapter, you need to download the English language model for spacy using
74 | 
75 |     python -m spacy download en
76 | 
77 | ## Submitting Errata
78 | 
79 | If you have errata for the (e-)book, please submit them via the [O'Reilly Website](http://www.oreilly.com/catalog/errata.csp?isbn=0636920030515).
80 | You can submit fixes to the code as pull requests here, but I'd appreciate it if you would also submit them there, as this repository doesn't hold the
81 | "master notebooks".
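If you want to sanity-check your setup before submitting a code fix, the short script below is a minimal smoke test of the bundled helpers. It is only a sketch: it assumes the packages listed under Setup are installed and that ``mglearn`` is importable (either installed via pip or with this repository on your path). ``make_forge`` and ``discrete_scatter`` are the helpers defined in ``mglearn/datasets.py`` and ``mglearn/plot_helpers.py`` in this repository.

```python
import matplotlib.pyplot as plt

from mglearn.datasets import make_forge
from mglearn import discrete_scatter

# generate the small synthetic "forge" classification dataset used in the book
X, y = make_forge()
# plot the two classes with mglearn's scatter helper
discrete_scatter(X[:, 0], X[:, 1], y)
plt.show()
```

If this opens a scatter plot with two marker styles, the core dependencies are working.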
82 | 83 | ![cover](cover.jpg) 84 | -------------------------------------------------------------------------------- /cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/cover.jpg -------------------------------------------------------------------------------- /data/ram_price.csv: -------------------------------------------------------------------------------- 1 | ,date,price 2 | 0,1957.0,411041792.0 3 | 1,1959.0,67947725.0 4 | 2,1960.0,5242880.0 5 | 3,1965.0,2642412.0 6 | 4,1970.0,734003.0 7 | 5,1973.0,399360.0 8 | 6,1974.0,314573.0 9 | 7,1975.0,421888.0 10 | 8,1975.08,180224.0 11 | 9,1975.25,67584.0 12 | 10,1975.75,49920.0 13 | 11,1976.0,40704.0 14 | 12,1976.17,48960.0 15 | 13,1976.42,23040.0 16 | 14,1976.58,32000.0 17 | 15,1977.08,36800.0 18 | 16,1978.17,28000.0 19 | 17,1978.25,29440.0 20 | 18,1978.33,19200.0 21 | 19,1978.5,24000.0 22 | 20,1978.58,16000.0 23 | 21,1978.75,15200.0 24 | 22,1979.0,10528.0 25 | 23,1979.75,6704.0 26 | 24,1980.0,6480.0 27 | 25,1981.0,8800.0 28 | 26,1981.58,4479.0 29 | 27,1982.0,3520.0 30 | 28,1982.17,4464.0 31 | 29,1982.67,1980.0 32 | 30,1983.0,2396.0 33 | 31,1983.67,1980.0 34 | 32,1984.0,1379.0 35 | 33,1984.58,1331.0 36 | 34,1985.0,880.0 37 | 35,1985.33,720.0 38 | 36,1985.42,550.0 39 | 37,1985.5,420.0 40 | 38,1985.58,350.0 41 | 39,1985.67,300.0 42 | 40,1985.83,300.0 43 | 41,1985.92,300.0 44 | 42,1986.0,300.0 45 | 43,1986.08,300.0 46 | 44,1986.17,300.0 47 | 45,1986.25,300.0 48 | 46,1986.33,190.0 49 | 47,1986.42,190.0 50 | 48,1986.5,190.0 51 | 49,1986.58,190.0 52 | 50,1986.67,190.0 53 | 51,1986.75,190.0 54 | 52,1986.92,190.0 55 | 53,1987.0,176.0 56 | 54,1987.08,176.0 57 | 55,1987.17,157.0 58 | 56,1987.25,154.0 59 | 57,1987.33,154.0 60 | 58,1987.42,154.0 61 | 59,1987.5,154.0 62 | 60,1987.58,154.0 63 | 61,1987.67,163.0 64 | 62,1987.75,133.0 65 | 63,1987.83,163.0 66 | 64,1987.92,163.0 67 | 65,1988.0,163.0 68 | 66,1988.08,182.0 69 | 67,1988.17,199.0 70 | 68,1988.33,199.0 71 | 69,1988.42,199.0 72 | 70,1988.5,505.0 73 | 71,1988.58,505.0 74 | 72,1988.67,505.0 75 | 73,1988.75,505.0 76 | 74,1988.83,505.0 77 | 75,1988.92,505.0 78 | 76,1989.0,505.0 79 | 77,1989.08,505.0 80 | 78,1989.17,505.0 81 | 79,1989.25,505.0 82 | 80,1989.42,344.0 83 | 81,1989.5,197.0 84 | 82,1989.58,188.0 85 | 83,1989.67,188.0 86 | 84,1989.75,128.0 87 | 85,1989.83,117.0 88 | 86,1989.92,113.0 89 | 87,1990.0,106.0 90 | 88,1990.17,98.3 91 | 89,1990.33,98.3 92 | 90,1990.42,89.5 93 | 91,1990.5,82.8 94 | 92,1990.58,81.1 95 | 93,1990.67,71.5 96 | 94,1990.75,59.0 97 | 95,1990.83,51.0 98 | 96,1990.92,45.5 99 | 97,1991.0,44.5 100 | 98,1991.08,44.5 101 | 99,1991.17,45.0 102 | 100,1991.25,45.0 103 | 101,1991.33,45.0 104 | 102,1991.42,43.8 105 | 103,1991.5,43.8 106 | 104,1991.58,41.3 107 | 105,1991.67,46.3 108 | 106,1991.75,45.0 109 | 107,1991.83,39.8 110 | 108,1991.92,39.8 111 | 109,1992.0,36.3 112 | 110,1992.08,36.3 113 | 111,1992.17,36.3 114 | 112,1992.25,34.8 115 | 113,1992.33,30.0 116 | 114,1992.42,32.5 117 | 115,1992.5,33.5 118 | 116,1992.58,31.0 119 | 117,1992.67,27.5 120 | 118,1992.75,26.3 121 | 119,1992.83,26.3 122 | 120,1992.92,26.3 123 | 121,1993.0,33.1 124 | 122,1993.08,27.5 125 | 123,1993.17,27.5 126 | 124,1993.25,27.5 127 | 125,1993.33,27.5 128 | 126,1993.42,30.0 129 | 127,1993.5,30.0 130 | 128,1993.58,30.0 131 | 129,1993.67,30.0 132 | 130,1993.75,36.0 133 | 131,1993.83,39.8 134 | 132,1993.92,35.8 135 | 133,1994.0,35.8 136 | 134,1994.08,35.8 137 | 
135,1994.17,36.0 138 | 136,1994.25,37.3 139 | 137,1994.33,37.3 140 | 138,1994.42,37.3 141 | 139,1994.5,38.5 142 | 140,1994.58,37.0 143 | 141,1994.67,34.0 144 | 142,1994.75,33.5 145 | 143,1994.83,32.3 146 | 144,1994.92,32.3 147 | 145,1995.0,32.3 148 | 146,1995.08,32.0 149 | 147,1995.17,32.0 150 | 148,1995.25,31.2 151 | 149,1995.33,31.2 152 | 150,1995.42,31.1 153 | 151,1995.5,31.2 154 | 152,1995.58,30.6 155 | 153,1995.67,33.1 156 | 154,1995.75,33.1 157 | 155,1995.83,30.9 158 | 156,1995.92,30.9 159 | 157,1996.0,29.9 160 | 158,1996.08,28.8 161 | 159,1996.17,26.1 162 | 160,1996.25,24.7 163 | 161,1996.33,17.2 164 | 162,1996.42,14.9 165 | 163,1996.5,11.3 166 | 164,1996.58,9.06 167 | 165,1996.67,8.44 168 | 166,1996.75,8.0 169 | 167,1996.83,5.25 170 | 168,1996.92,5.25 171 | 169,1997.0,4.63 172 | 170,1997.08,3.63 173 | 171,1997.17,3.0 174 | 172,1997.25,3.0 175 | 173,1997.33,3.0 176 | 174,1997.42,3.69 177 | 175,1997.5,4.0 178 | 176,1997.58,4.13 179 | 177,1997.67,3.63 180 | 178,1997.75,3.41 181 | 179,1997.83,3.25 182 | 180,1997.92,2.16 183 | 181,1998.0,2.16 184 | 182,1998.08,0.91 185 | 183,1998.17,0.97 186 | 184,1998.25,1.22 187 | 185,1998.33,1.19 188 | 186,1998.42,0.97 189 | 187,1998.58,1.03 190 | 188,1998.67,0.97 191 | 189,1998.75,1.16 192 | 190,1998.83,0.84 193 | 191,1998.92,0.84 194 | 192,1999.08,1.44 195 | 193,1999.13,0.84 196 | 194,1999.17,1.25 197 | 195,1999.25,1.25 198 | 196,1999.33,0.86 199 | 197,1999.5,0.78 200 | 198,1999.67,0.87 201 | 199,1999.75,1.04 202 | 200,1999.83,1.34 203 | 201,1999.92,2.35 204 | 202,2000.0,1.56 205 | 203,2000.08,1.48 206 | 204,2000.17,1.08 207 | 205,2000.25,0.84 208 | 206,2000.33,0.7 209 | 207,2000.42,0.9 210 | 208,2000.5,0.77 211 | 209,2000.58,0.84 212 | 210,2000.67,1.07 213 | 211,2000.75,1.12 214 | 212,2000.83,1.12 215 | 213,2000.92,0.9 216 | 214,2001.0,0.75 217 | 215,2001.08,0.464 218 | 216,2001.17,0.464 219 | 217,2001.25,0.383 220 | 218,2001.33,0.387 221 | 219,2001.42,0.305 222 | 220,2001.5,0.352 223 | 221,2001.5,0.27 224 | 222,2001.58,0.191 225 | 223,2001.67,0.191 226 | 224,2001.75,0.169 227 | 225,2001.77,0.148 228 | 226,2002.08,0.134 229 | 227,2002.08,0.207 230 | 228,2002.25,0.193 231 | 229,2002.33,0.193 232 | 230,2002.42,0.33 233 | 231,2002.58,0.193 234 | 232,2002.75,0.193 235 | 233,2003.17,0.176 236 | 234,2003.25,0.076 237 | 235,2003.33,0.126 238 | 236,2003.42,0.115 239 | 237,2003.5,0.133 240 | 238,2003.58,0.129 241 | 239,2003.67,0.143 242 | 240,2003.75,0.148 243 | 241,2003.83,0.16 244 | 242,2003.99,0.166 245 | 243,2004.0,0.174 246 | 244,2004.08,0.148 247 | 245,2004.17,0.146 248 | 246,2004.33,0.156 249 | 247,2004.42,0.203 250 | 248,2004.5,0.176 251 | 249,2005.25,0.185 252 | 250,2005.42,0.149 253 | 251,2005.83,0.116 254 | 252,2005.92,0.185 255 | 253,2006.17,0.112 256 | 254,2006.33,0.073 257 | 255,2006.5,0.082 258 | 256,2006.67,0.073 259 | 257,2006.75,0.088 260 | 258,2006.83,0.098 261 | 259,2006.99,0.092 262 | 260,2007.0,0.082 263 | 261,2007.08,0.078 264 | 262,2007.17,0.066 265 | 263,2007.33,0.0464 266 | 264,2007.5,0.0386 267 | 265,2007.67,0.0351 268 | 266,2007.75,0.0322 269 | 267,2007.83,0.0244 270 | 268,2007.92,0.0244 271 | 269,2008.0,0.0232 272 | 270,2008.08,0.022 273 | 271,2008.33,0.022 274 | 272,2008.5,0.0207 275 | 273,2008.58,0.0176 276 | 274,2008.67,0.0146 277 | 275,2008.83,0.011 278 | 276,2008.92,0.0098 279 | 277,2009.0,0.0098 280 | 278,2009.08,0.0107 281 | 279,2009.25,0.0105 282 | 280,2009.42,0.0115 283 | 281,2009.5,0.011 284 | 282,2009.58,0.0127 285 | 283,2009.75,0.0183 286 | 284,2009.92,0.0205 287 | 285,2010.0,0.019 288 | 286,2010.08,0.0202 289 | 
287,2010.17,0.0195 290 | 288,2010.33,0.0242 291 | 289,2010.5,0.021 292 | 290,2010.58,0.022 293 | 291,2010.75,0.0171 294 | 292,2010.83,0.0146 295 | 293,2010.92,0.0122 296 | 294,2011.0,0.01 297 | 295,2011.08,0.0103 298 | 296,2011.33,0.01 299 | 297,2011.42,0.0085 300 | 298,2011.67,0.0054 301 | 299,2011.75,0.0051 302 | 300,2012.0,0.0049 303 | 301,2012.08,0.0049 304 | 302,2012.25,0.005 305 | 303,2012.33,0.0049 306 | 304,2012.58,0.0048 307 | 305,2012.67,0.004 308 | 306,2012.83,0.0037 309 | 307,2013.0,0.0043 310 | 308,2013.08,0.0054 311 | 309,2013.33,0.0067 312 | 310,2013.42,0.0061 313 | 311,2013.58,0.0073 314 | 312,2013.67,0.0065 315 | 313,2013.75,0.0082 316 | 314,2013.83,0.0085 317 | 315,2013.92,0.0079 318 | 316,2014.08,0.0095 319 | 317,2014.17,0.0079 320 | 318,2014.25,0.0073 321 | 319,2014.42,0.0079 322 | 320,2014.58,0.0085 323 | 321,2014.67,0.0085 324 | 322,2014.83,0.0085 325 | 323,2015.0,0.0078 326 | 324,2015.08,0.0073 327 | 325,2015.25,0.0061 328 | 326,2015.33,0.0056 329 | 327,2015.5,0.0049 330 | 328,2015.58,0.0045 331 | 329,2015.67,0.0043 332 | 330,2015.75,0.0042 333 | 331,2015.83,0.0038 334 | 332,2015.92,0.0037 335 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: python-ml 2 | dependencies: 3 | - numpy 4 | - scipy 5 | - scikit-learn 6 | - matplotlib 7 | - pandas 8 | - pillow 9 | - graphviz 10 | - python-graphviz 11 | - imageio 12 | - joblib 13 | -------------------------------------------------------------------------------- /images/05_gridsearch_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/05_gridsearch_overview.png -------------------------------------------------------------------------------- /images/api_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/api_table.png -------------------------------------------------------------------------------- /images/bag_of_words.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/bag_of_words.png -------------------------------------------------------------------------------- /images/classifier_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/classifier_comparison.png -------------------------------------------------------------------------------- /images/dendrogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/dendrogram.png -------------------------------------------------------------------------------- /images/iris_petal_sepal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/iris_petal_sepal.png 
--------------------------------------------------------------------------------
/images/overfitting_underfitting_cartoon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/overfitting_underfitting_cartoon.png
--------------------------------------------------------------------------------
/images/overfitting_underfitting_cartoon.svg:
--------------------------------------------------------------------------------
[SVG source omitted. Recoverable text content: x-axis "Model complexity", y-axis "Accuracy"; curve labels "Training" and "Generalization"; annotations "Underfitting", "Overfitting", "Sweet spot".]
--------------------------------------------------------------------------------
/images/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/introduction_to_ml_with_python/ea60cf6cf791553b6cca7cf31802c68cb3798ebb/images/pipeline.png
--------------------------------------------------------------------------------
/images/pipeline.svg:
--------------------------------------------------------------------------------
[SVG source omitted. Recoverable text content: diagram of "pipe = make_pipeline(T1(), T2(), Classifier())". pipe.fit(X, y) chains T1.fit(X, y) and T1.transform(X) -> X1, then T2.fit(X1, y) and T2.transform(X1) -> X2, then Classifier.fit(X2, y). pipe.predict(X') chains T1.transform(X') -> X'1, T2.transform(X'1) -> X'2, Classifier.predict(X'2) -> y'.]
--------------------------------------------------------------------------------
/mglearn/__init__.py:
--------------------------------------------------------------------------------
1 | from . import plots
2 | from . import tools
3 | from .plots import cm3, cm2
4 | from .tools import discrete_scatter
5 | from .plot_helpers import ReBl
6 | 
7 | __version__ = "0.2.0"
8 | 
9 | __all__ = ['tools', 'plots', 'cm3', 'cm2', 'discrete_scatter', 'ReBl']
--------------------------------------------------------------------------------
/mglearn/datasets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import os
4 | from scipy import signal
5 | from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
6 | from sklearn.datasets import make_blobs
7 | from sklearn.utils import Bunch
8 | 
9 | DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
10 | 
11 | 
12 | def make_forge():
13 |     # a carefully hand-designed dataset lol
14 |     X, y = make_blobs(centers=2, random_state=4, n_samples=30)
15 |     y[np.array([7, 27])] = 0
16 |     mask = np.ones(len(X), dtype=bool)
17 |     mask[np.array([0, 1, 5, 26])] = 0
18 |     X, y = X[mask], y[mask]
19 |     return X, y
20 | 
21 | 
22 | def make_wave(n_samples=100):
23 |     rnd = np.random.RandomState(42)
24 |     x = rnd.uniform(-3, 3, size=n_samples)
25 |     y_no_noise = (np.sin(4 * x) + x)
26 |     y = (y_no_noise + rnd.normal(size=len(x))) / 2
27 |     return x.reshape(-1, 1), y
28 | 
29 | 
30 | def load_boston():
31 |     try:
32 |         from sklearn.datasets import load_boston
33 |         return load_boston()
34 |     except ImportError:
35 |         pass
36 |     data_url = "http://lib.stat.cmu.edu/datasets/boston"
37 |     raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
38 |     data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
39 |     target = raw_df.values[1::2, 2]
40 |     return Bunch(data=data, target=target)
41 | 
42 | 
43 | def load_extended_boston():
44 |     boston = load_boston()
45 |     X = boston.data
46 | 
47 |     X = MinMaxScaler().fit_transform(boston.data)
48 |     X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
49 |     return X, boston.target
50 | 
51 | 
52 | def load_citibike():
53 |     data_mine = pd.read_csv(os.path.join(DATA_PATH, "citibike.csv"))
54 |     data_mine['one'] = 1
55 |     data_mine['starttime'] = pd.to_datetime(data_mine.starttime)
56 |     data_starttime = data_mine.set_index("starttime")
57 |     data_resampled = data_starttime.resample("3h").sum().fillna(0)
58 |     return data_resampled.one
59 | 
60 | 
61 | def make_signals():
62 |     # fix a random state seed
63 |     rng = np.random.RandomState(42)
64 |     n_samples = 2000
65 |     time = np.linspace(0, 8, n_samples)
66 |     # create three signals
67 |     s1 = np.sin(2 * time)  # Signal 1 : sinusoidal signal
68 |     s2 = np.sign(np.sin(3 * time))  # Signal 2 : square signal
69 |     s3 = signal.sawtooth(2 * np.pi * time)  # Signal 3: saw tooth signal
70 | 
71 |     # concatenate the signals, add noise
72 |     S = np.c_[s1, s2, s3]
73 |     S += 0.2 * rng.normal(size=S.shape)
74 | 
75 |     S /= S.std(axis=0)  # Standardize data
76 |     S -= S.min()
77 |     return S
--------------------------------------------------------------------------------
/mglearn/make_blobs.py:
--------------------------------------------------------------------------------
1 | import numbers
2 | import numpy as np
3 | 
4 | from sklearn.utils import check_array, check_random_state
5 | from sklearn.utils import shuffle as shuffle_
6 | from sklearn.utils.deprecation import deprecated
7 | 
8 | 
9 | @deprecated("Please import make_blobs directly from scikit-learn")
10 | def make_blobs(n_samples=100, n_features=2, centers=2, cluster_std=1.0,
11 |                center_box=(-10.0, 10.0), shuffle=True, random_state=None):
12 |     """Generate isotropic Gaussian
blobs for clustering. 13 | 14 | Read more in the :ref:`User Guide `. 15 | 16 | Parameters 17 | ---------- 18 | n_samples : int, or tuple, optional (default=100) 19 | The total number of points equally divided among clusters. 20 | 21 | n_features : int, optional (default=2) 22 | The number of features for each sample. 23 | 24 | centers : int or array of shape [n_centers, n_features], optional 25 | (default=3) 26 | The number of centers to generate, or the fixed center locations. 27 | 28 | cluster_std: float or sequence of floats, optional (default=1.0) 29 | The standard deviation of the clusters. 30 | 31 | center_box: pair of floats (min, max), optional (default=(-10.0, 10.0)) 32 | The bounding box for each cluster center when centers are 33 | generated at random. 34 | 35 | shuffle : boolean, optional (default=True) 36 | Shuffle the samples. 37 | 38 | random_state : int, RandomState instance or None, optional (default=None) 39 | If int, random_state is the seed used by the random number generator; 40 | If RandomState instance, random_state is the random number generator; 41 | If None, the random number generator is the RandomState instance used 42 | by `np.random`. 43 | 44 | Returns 45 | ------- 46 | X : array of shape [n_samples, n_features] 47 | The generated samples. 48 | 49 | y : array of shape [n_samples] 50 | The integer labels for cluster membership of each sample. 51 | 52 | Examples 53 | -------- 54 | >>> from sklearn.datasets.samples_generator import make_blobs 55 | >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2, 56 | ... random_state=0) 57 | >>> print(X.shape) 58 | (10, 2) 59 | >>> y 60 | array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0]) 61 | 62 | See also 63 | -------- 64 | make_classification: a more intricate variant 65 | """ 66 | generator = check_random_state(random_state) 67 | 68 | if isinstance(centers, numbers.Integral): 69 | centers = generator.uniform(center_box[0], center_box[1], 70 | size=(centers, n_features)) 71 | else: 72 | centers = check_array(centers) 73 | n_features = centers.shape[1] 74 | 75 | if isinstance(cluster_std, numbers.Real): 76 | cluster_std = np.ones(len(centers)) * cluster_std 77 | 78 | X = [] 79 | y = [] 80 | 81 | n_centers = centers.shape[0] 82 | if isinstance(n_samples, numbers.Integral): 83 | n_samples_per_center = [int(n_samples // n_centers)] * n_centers 84 | for i in range(n_samples % n_centers): 85 | n_samples_per_center[i] += 1 86 | else: 87 | n_samples_per_center = n_samples 88 | 89 | for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)): 90 | X.append(centers[i] + generator.normal(scale=std, 91 | size=(n, n_features))) 92 | y += [i] * n 93 | 94 | X = np.concatenate(X) 95 | y = np.array(y) 96 | 97 | if shuffle: 98 | X, y = shuffle_(X, y, random_state=generator) 99 | 100 | return X, y 101 | -------------------------------------------------------------------------------- /mglearn/plot_2d_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from .plot_helpers import cm2, cm3, discrete_scatter 4 | 5 | 6 | def plot_2d_classification(classifier, X, fill=False, ax=None, eps=None, 7 | alpha=1, cm=cm3): 8 | # multiclass 9 | if eps is None: 10 | eps = X.std() / 2. 
11 | 12 | if ax is None: 13 | ax = plt.gca() 14 | 15 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 16 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 17 | xx = np.linspace(x_min, x_max, 1000) 18 | yy = np.linspace(y_min, y_max, 1000) 19 | 20 | X1, X2 = np.meshgrid(xx, yy) 21 | X_grid = np.c_[X1.ravel(), X2.ravel()] 22 | decision_values = classifier.predict(X_grid) 23 | ax.imshow(decision_values.reshape(X1.shape), extent=(x_min, x_max, 24 | y_min, y_max), 25 | aspect='auto', origin='lower', alpha=alpha, cmap=cm) 26 | ax.set_xlim(x_min, x_max) 27 | ax.set_ylim(y_min, y_max) 28 | ax.set_xticks(()) 29 | ax.set_yticks(()) 30 | 31 | 32 | def plot_2d_scores(classifier, X, ax=None, eps=None, alpha=1, cm="viridis", 33 | function=None): 34 | # binary with fill 35 | if eps is None: 36 | eps = X.std() / 2. 37 | 38 | if ax is None: 39 | ax = plt.gca() 40 | 41 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 42 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 43 | xx = np.linspace(x_min, x_max, 100) 44 | yy = np.linspace(y_min, y_max, 100) 45 | 46 | X1, X2 = np.meshgrid(xx, yy) 47 | X_grid = np.c_[X1.ravel(), X2.ravel()] 48 | if function is None: 49 | function = getattr(classifier, "decision_function", 50 | getattr(classifier, "predict_proba")) 51 | else: 52 | function = getattr(classifier, function) 53 | decision_values = function(X_grid) 54 | if decision_values.ndim > 1 and decision_values.shape[1] > 1: 55 | # predict_proba 56 | decision_values = decision_values[:, 1] 57 | grr = ax.imshow(decision_values.reshape(X1.shape), 58 | extent=(x_min, x_max, y_min, y_max), aspect='auto', 59 | origin='lower', alpha=alpha, cmap=cm) 60 | 61 | ax.set_xlim(x_min, x_max) 62 | ax.set_ylim(y_min, y_max) 63 | ax.set_xticks(()) 64 | ax.set_yticks(()) 65 | return grr 66 | 67 | 68 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None, alpha=1, 69 | cm=cm2, linewidth=None, threshold=None, 70 | linestyle="solid"): 71 | # binary? 72 | if eps is None: 73 | eps = X.std() / 2. 
74 | 75 | if ax is None: 76 | ax = plt.gca() 77 | 78 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 79 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 80 | xx = np.linspace(x_min, x_max, 1000) 81 | yy = np.linspace(y_min, y_max, 1000) 82 | 83 | X1, X2 = np.meshgrid(xx, yy) 84 | X_grid = np.c_[X1.ravel(), X2.ravel()] 85 | try: 86 | decision_values = classifier.decision_function(X_grid) 87 | levels = [0] if threshold is None else [threshold] 88 | fill_levels = [decision_values.min()] + levels + [ 89 | decision_values.max()] 90 | except AttributeError: 91 | # no decision_function 92 | decision_values = classifier.predict_proba(X_grid)[:, 1] 93 | levels = [.5] if threshold is None else [threshold] 94 | fill_levels = [0] + levels + [1] 95 | if fill: 96 | ax.contourf(X1, X2, decision_values.reshape(X1.shape), 97 | levels=fill_levels, alpha=alpha, cmap=cm) 98 | else: 99 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels, 100 | colors="black", alpha=alpha, linewidths=linewidth, 101 | linestyles=linestyle, zorder=5) 102 | 103 | ax.set_xlim(x_min, x_max) 104 | ax.set_ylim(y_min, y_max) 105 | ax.set_xticks(()) 106 | ax.set_yticks(()) -------------------------------------------------------------------------------- /mglearn/plot_agglomerative.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.datasets import make_blobs 4 | from sklearn.cluster import AgglomerativeClustering 5 | from sklearn.neighbors import KernelDensity 6 | 7 | 8 | def plot_agglomerative_algorithm(): 9 | # generate synthetic two-dimensional data 10 | X, y = make_blobs(random_state=0, n_samples=12) 11 | 12 | agg = AgglomerativeClustering(n_clusters=X.shape[0], compute_full_tree=True).fit(X) 13 | 14 | fig, axes = plt.subplots(X.shape[0] // 5, 5, subplot_kw={'xticks': (), 15 | 'yticks': ()}, 16 | figsize=(20, 8)) 17 | 18 | eps = X.std() / 2 19 | 20 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 21 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 22 | 23 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100)) 24 | gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)] 25 | 26 | for i, ax in enumerate(axes.ravel()): 27 | ax.set_xlim(x_min, x_max) 28 | ax.set_ylim(y_min, y_max) 29 | agg.n_clusters = X.shape[0] - i 30 | agg.fit(X) 31 | ax.set_title("Step %d" % i) 32 | ax.scatter(X[:, 0], X[:, 1], s=60, c='grey') 33 | bins = np.bincount(agg.labels_) 34 | for cluster in range(agg.n_clusters): 35 | if bins[cluster] > 1: 36 | points = X[agg.labels_ == cluster] 37 | other_points = X[agg.labels_ != cluster] 38 | 39 | kde = KernelDensity(bandwidth=.5).fit(points) 40 | scores = kde.score_samples(gridpoints) 41 | score_inside = np.min(kde.score_samples(points)) 42 | score_outside = np.max(kde.score_samples(other_points)) 43 | levels = .8 * score_inside + .2 * score_outside 44 | ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels], 45 | colors='k', linestyles='solid', linewidths=2) 46 | 47 | axes[0, 0].set_title("Initialization") 48 | 49 | 50 | def plot_agglomerative(): 51 | X, y = make_blobs(random_state=0, n_samples=12) 52 | agg = AgglomerativeClustering(n_clusters=3) 53 | 54 | eps = X.std() / 2. 
55 | 56 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 57 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 58 | 59 | xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100)) 60 | gridpoints = np.c_[xx.ravel().reshape(-1, 1), yy.ravel().reshape(-1, 1)] 61 | 62 | ax = plt.gca() 63 | for i, x in enumerate(X): 64 | ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left', verticalalignment='center') 65 | 66 | ax.scatter(X[:, 0], X[:, 1], s=60, c='grey') 67 | ax.set_xticks(()) 68 | ax.set_yticks(()) 69 | 70 | for i in range(11): 71 | agg.n_clusters = X.shape[0] - i 72 | agg.fit(X) 73 | 74 | bins = np.bincount(agg.labels_) 75 | for cluster in range(agg.n_clusters): 76 | if bins[cluster] > 1: 77 | points = X[agg.labels_ == cluster] 78 | other_points = X[agg.labels_ != cluster] 79 | 80 | kde = KernelDensity(bandwidth=.5).fit(points) 81 | scores = kde.score_samples(gridpoints) 82 | score_inside = np.min(kde.score_samples(points)) 83 | score_outside = np.max(kde.score_samples(other_points)) 84 | levels = .8 * score_inside + .2 * score_outside 85 | ax.contour(xx, yy, scores.reshape(100, 100), levels=[levels], 86 | colors='k', linestyles='solid', linewidths=1) 87 | 88 | ax.set_xlim(x_min, x_max) 89 | ax.set_ylim(y_min, y_max) 90 | -------------------------------------------------------------------------------- /mglearn/plot_animal_tree.py: -------------------------------------------------------------------------------- 1 | from imageio import imread 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_animal_tree(ax=None): 6 | import graphviz 7 | if ax is None: 8 | ax = plt.gca() 9 | mygraph = graphviz.Digraph(node_attr={'shape': 'box'}, 10 | edge_attr={'labeldistance': "10.5"}, 11 | format="png") 12 | mygraph.node("0", "Has feathers?") 13 | mygraph.node("1", "Can fly?") 14 | mygraph.node("2", "Has fins?") 15 | mygraph.node("3", "Hawk") 16 | mygraph.node("4", "Penguin") 17 | mygraph.node("5", "Dolphin") 18 | mygraph.node("6", "Bear") 19 | mygraph.edge("0", "1", label="True") 20 | mygraph.edge("0", "2", label="False") 21 | mygraph.edge("1", "3", label="True") 22 | mygraph.edge("1", "4", label="False") 23 | mygraph.edge("2", "5", label="True") 24 | mygraph.edge("2", "6", label="False") 25 | mygraph.render("tmp") 26 | ax.imshow(imread("tmp.png")) 27 | ax.set_axis_off() 28 | -------------------------------------------------------------------------------- /mglearn/plot_cross_validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_group_kfold(): 6 | from sklearn.model_selection import GroupKFold 7 | groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3] 8 | 9 | plt.figure(figsize=(10, 2)) 10 | plt.title("GroupKFold") 11 | 12 | axes = plt.gca() 13 | axes.set_frame_on(False) 14 | 15 | n_folds = 12 16 | n_samples = 12 17 | n_iter = 3 18 | n_samples_per_fold = 1 19 | 20 | cv = GroupKFold(n_splits=3) 21 | mask = np.zeros((n_iter, n_samples)) 22 | for i, (train, test) in enumerate(cv.split(range(12), groups=groups)): 23 | mask[i, train] = 1 24 | mask[i, test] = 2 25 | 26 | for i in range(n_folds): 27 | # test is grey 28 | colors = ["grey" if x == 2 else "white" for x in mask[:, i]] 29 | # not selected has no hatch 30 | 31 | boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter, 32 | left=i * n_samples_per_fold, height=.6, color=colors, 33 | hatch="//", edgecolor="k", align='edge') 34 | for j in np.where(mask[:, i] == 0)[0]: 35 | 
boxes[j].set_hatch("") 36 | 37 | axes.barh(y=[n_iter] * n_folds, width=[1 - 0.1] * n_folds, 38 | left=np.arange(n_folds) * n_samples_per_fold, height=.6, 39 | color="w", edgecolor='k', align="edge") 40 | 41 | for i in range(12): 42 | axes.text((i + .5) * n_samples_per_fold, 3.5, "%d" % 43 | groups[i], horizontalalignment="center") 44 | 45 | axes.invert_yaxis() 46 | axes.set_xlim(0, n_samples + 1) 47 | axes.set_ylabel("CV iterations") 48 | axes.set_xlabel("Data points") 49 | axes.set_xticks(np.arange(n_samples) + .5) 50 | axes.set_xticklabels(np.arange(1, n_samples + 1)) 51 | axes.set_yticks(np.arange(n_iter + 1) + .3) 52 | axes.set_yticklabels( 53 | ["Split %d" % x for x in range(1, n_iter + 1)] + ["Group"]) 54 | plt.legend([boxes[0], boxes[1]], ["Training set", "Test set"], loc=(1, .3)) 55 | plt.tight_layout() 56 | 57 | 58 | def plot_shuffle_split(): 59 | from sklearn.model_selection import ShuffleSplit 60 | plt.figure(figsize=(10, 2)) 61 | plt.title("ShuffleSplit with 10 points" 62 | ", train_size=5, test_size=2, n_splits=4") 63 | 64 | axes = plt.gca() 65 | axes.set_frame_on(False) 66 | 67 | n_folds = 10 68 | n_samples = 10 69 | n_iter = 4 70 | n_samples_per_fold = 1 71 | 72 | ss = ShuffleSplit(n_splits=4, train_size=5, test_size=2, random_state=43) 73 | mask = np.zeros((n_iter, n_samples)) 74 | for i, (train, test) in enumerate(ss.split(range(10))): 75 | mask[i, train] = 1 76 | mask[i, test] = 2 77 | 78 | for i in range(n_folds): 79 | # test is grey 80 | colors = ["grey" if x == 2 else "white" for x in mask[:, i]] 81 | # not selected has no hatch 82 | 83 | boxes = axes.barh(y=range(n_iter), width=[1 - 0.1] * n_iter, 84 | left=i * n_samples_per_fold, height=.6, color=colors, 85 | hatch="//", edgecolor='k', align='edge') 86 | for j in np.where(mask[:, i] == 0)[0]: 87 | boxes[j].set_hatch("") 88 | 89 | axes.invert_yaxis() 90 | axes.set_xlim(0, n_samples + 1) 91 | axes.set_ylabel("CV iterations") 92 | axes.set_xlabel("Data points") 93 | axes.set_xticks(np.arange(n_samples) + .5) 94 | axes.set_xticklabels(np.arange(1, n_samples + 1)) 95 | axes.set_yticks(np.arange(n_iter) + .3) 96 | axes.set_yticklabels(["Split %d" % x for x in range(1, n_iter + 1)]) 97 | # legend hacked for this random state 98 | plt.legend([boxes[1], boxes[0], boxes[2]], [ 99 | "Training set", "Test set", "Not selected"], loc=(1, .3)) 100 | plt.tight_layout() 101 | 102 | 103 | def plot_stratified_cross_validation(): 104 | fig, both_axes = plt.subplots(2, 1, figsize=(12, 5)) 105 | # plt.title("cross_validation_not_stratified") 106 | axes = both_axes[0] 107 | axes.set_title("Standard cross-validation with sorted class labels") 108 | 109 | axes.set_frame_on(False) 110 | 111 | n_folds = 3 112 | n_samples = 150 113 | 114 | n_samples_per_fold = n_samples / float(n_folds) 115 | 116 | for i in range(n_folds): 117 | colors = ["w"] * n_folds 118 | colors[i] = "grey" 119 | axes.barh(y=range(n_folds), width=[n_samples_per_fold - 1] * 120 | n_folds, left=i * n_samples_per_fold, height=.6, 121 | color=colors, hatch="//", edgecolor='k', align='edge') 122 | 123 | axes.barh(y=[n_folds] * n_folds, width=[n_samples_per_fold - 1] * 124 | n_folds, left=np.arange(3) * n_samples_per_fold, height=.6, 125 | color="w", edgecolor='k', align='edge') 126 | 127 | axes.invert_yaxis() 128 | axes.set_xlim(0, n_samples + 1) 129 | axes.set_ylabel("CV iterations") 130 | axes.set_xlabel("Data points") 131 | axes.set_xticks(np.arange(n_samples_per_fold / 2., 132 | n_samples, n_samples_per_fold)) 133 | axes.set_xticklabels(["Fold %d" % x for x in range(1, 
n_folds + 1)]) 134 | axes.set_yticks(np.arange(n_folds + 1) + .3) 135 | axes.set_yticklabels( 136 | ["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"]) 137 | for i in range(3): 138 | axes.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" % 139 | i, horizontalalignment="center") 140 | 141 | ax = both_axes[1] 142 | ax.set_title("Stratified Cross-validation") 143 | ax.set_frame_on(False) 144 | ax.invert_yaxis() 145 | ax.set_xlim(0, n_samples + 1) 146 | ax.set_ylabel("CV iterations") 147 | ax.set_xlabel("Data points") 148 | 149 | ax.set_yticks(np.arange(n_folds + 1) + .3) 150 | ax.set_yticklabels( 151 | ["Split %d" % x for x in range(1, n_folds + 1)] + ["Class label"]) 152 | 153 | n_subsplit = n_samples_per_fold / 3. 154 | for i in range(n_folds): 155 | test_bars = ax.barh( 156 | y=[i] * n_folds, width=[n_subsplit - 1] * n_folds, 157 | left=np.arange(n_folds) * n_samples_per_fold + i * n_subsplit, 158 | height=.6, color="grey", hatch="//", edgecolor='k', align='edge') 159 | 160 | w = 2 * n_subsplit - 1 161 | ax.barh(y=[0] * n_folds, width=[w] * n_folds, left=np.arange(n_folds) 162 | * n_samples_per_fold + (0 + 1) * n_subsplit, height=.6, color="w", 163 | hatch="//", edgecolor='k', align='edge') 164 | ax.barh(y=[1] * (n_folds + 1), width=[w / 2., w, w, w / 2.], 165 | left=np.maximum(0, np.arange(n_folds + 1) * n_samples_per_fold - 166 | n_subsplit), height=.6, color="w", hatch="//", 167 | edgecolor='k', align='edge') 168 | training_bars = ax.barh(y=[2] * n_folds, width=[w] * n_folds, 169 | left=np.arange(n_folds) * n_samples_per_fold, 170 | height=.6, color="w", hatch="//", edgecolor='k', 171 | align='edge') 172 | 173 | ax.barh(y=[n_folds] * n_folds, width=[n_samples_per_fold - 1] * 174 | n_folds, left=np.arange(n_folds) * n_samples_per_fold, height=.6, 175 | color="w", edgecolor='k', align='edge') 176 | 177 | for i in range(3): 178 | ax.text((i + .5) * n_samples_per_fold, 3.5, "Class %d" % 179 | i, horizontalalignment="center") 180 | ax.set_ylim(4, -0.1) 181 | plt.legend([training_bars[0], test_bars[0]], [ 182 | 'Training data', 'Test data'], loc=(1.05, 1), frameon=False) 183 | 184 | fig.tight_layout() 185 | 186 | 187 | def plot_cross_validation(): 188 | plt.figure(figsize=(12, 2)) 189 | plt.title("cross_validation") 190 | axes = plt.gca() 191 | axes.set_frame_on(False) 192 | 193 | n_folds = 5 194 | n_samples = 25 195 | 196 | n_samples_per_fold = n_samples / float(n_folds) 197 | 198 | for i in range(n_folds): 199 | colors = ["w"] * n_folds 200 | colors[i] = "grey" 201 | bars = plt.barh( 202 | y=range(n_folds), width=[n_samples_per_fold - 0.1] * n_folds, 203 | left=i * n_samples_per_fold, height=.6, color=colors, hatch="//", 204 | edgecolor='k', align='edge') 205 | axes.invert_yaxis() 206 | axes.set_xlim(0, n_samples + 1) 207 | plt.ylabel("CV iterations") 208 | plt.xlabel("Data points") 209 | plt.xticks(np.arange(n_samples_per_fold / 2., n_samples, 210 | n_samples_per_fold), 211 | ["Fold %d" % x for x in range(1, n_folds + 1)]) 212 | plt.yticks(np.arange(n_folds) + .3, 213 | ["Split %d" % x for x in range(1, n_folds + 1)]) 214 | plt.legend([bars[0], bars[4]], ['Training data', 'Test data'], 215 | loc=(1.05, 0.4), frameon=False) 216 | 217 | 218 | def plot_threefold_split(): 219 | plt.figure(figsize=(15, 1)) 220 | axis = plt.gca() 221 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], color=[ 222 | 'white', 'grey', 'grey'], hatch="//", edgecolor='k', 223 | align='edge') 224 | bars[2].set_hatch(r"") 225 | axis.set_yticks(()) 226 | axis.set_frame_on(False) 227 | 
axis.set_ylim(-.1, .8)
228 |     axis.set_xlim(-0.1, 20.1)
229 |     axis.set_xticks([6, 13.3, 17.5])
230 |     axis.set_xticklabels(["training set", "validation set",
231 |                           "test set"], fontdict={'fontsize': 20})
232 |     axis.tick_params(length=0, labeltop=True, labelbottom=False)
233 |     axis.text(6, -.3, "Model fitting",
234 |               fontdict={'fontsize': 13}, horizontalalignment="center")
235 |     axis.text(13.3, -.3, "Parameter selection",
236 |               fontdict={'fontsize': 13}, horizontalalignment="center")
237 |     axis.text(17.5, -.3, "Evaluation",
238 |               fontdict={'fontsize': 13}, horizontalalignment="center")
239 | 
--------------------------------------------------------------------------------
/mglearn/plot_dbscan.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.cluster import DBSCAN
4 | from sklearn.datasets import make_blobs
5 | 
6 | from .plot_helpers import discrete_scatter, cm3
7 | 
8 | 
9 | def plot_dbscan():
10 |     X, y = make_blobs(random_state=0, n_samples=12)
11 | 
12 |     dbscan = DBSCAN()
13 |     clusters = dbscan.fit_predict(X)
14 |     clusters
15 | 
16 |     fig, axes = plt.subplots(3, 4, figsize=(11, 8),
17 |                              subplot_kw={'xticks': (), 'yticks': ()})
18 |     # Plot clusters as red, green and blue, and outliers (-1) as white
19 |     colors = [cm3(1), cm3(0), cm3(2)]
20 |     markers = ['o', '^', 'v']
21 | 
22 |     # iterate over settings of min_samples and eps
23 |     for i, min_samples in enumerate([2, 3, 5]):
24 |         for j, eps in enumerate([1, 1.5, 2, 3]):
25 |             # instantiate DBSCAN with a particular setting
26 |             dbscan = DBSCAN(min_samples=min_samples, eps=eps)
27 |             # get cluster assignments
28 |             clusters = dbscan.fit_predict(X)
29 |             print("min_samples: %d eps: %f cluster: %s"
30 |                   % (min_samples, eps, clusters))
31 |             if np.any(clusters == -1):
32 |                 c = ['w'] + colors
33 |                 m = ['o'] + markers
34 |             else:
35 |                 c = colors
36 |                 m = markers
37 |             discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c,
38 |                              s=8, markers=m)
39 |             inds = dbscan.core_sample_indices_
40 |             # visualize core samples and clusters.
41 | if len(inds): 42 | discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds], 43 | ax=axes[i, j], s=15, c=colors, 44 | markers=markers) 45 | axes[i, j].set_title("min_samples: %d eps: %.1f" 46 | % (min_samples, eps)) 47 | fig.tight_layout() 48 | -------------------------------------------------------------------------------- /mglearn/plot_decomposition.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.offsetbox import OffsetImage, AnnotationBbox 3 | 4 | 5 | def plot_decomposition(people, pca): 6 | image_shape = people.images[0].shape 7 | plt.figure(figsize=(20, 3)) 8 | ax = plt.gca() 9 | 10 | imagebox = OffsetImage(people.images[0], zoom=1.5, cmap="gray") 11 | ab = AnnotationBbox(imagebox, (.05, 0.4), pad=0.0, xycoords='data') 12 | ax.add_artist(ab) 13 | 14 | for i in range(4): 15 | imagebox = OffsetImage(pca.components_[i].reshape(image_shape), zoom=1.5, cmap="viridis") 16 | 17 | ab = AnnotationBbox(imagebox, (.3 + .2 * i, 0.4), 18 | pad=0.0, 19 | xycoords='data' 20 | ) 21 | ax.add_artist(ab) 22 | if i == 0: 23 | plt.text(.18, .25, 'x_%d *' % i, fontdict={'fontsize': 50}) 24 | else: 25 | plt.text(.15 + .2 * i, .25, '+ x_%d *' % i, fontdict={'fontsize': 50}) 26 | 27 | plt.text(.95, .25, '+ ...', fontdict={'fontsize': 50}) 28 | 29 | plt.rc('text', usetex=True) 30 | plt.text(.13, .3, r'\approx', fontdict={'fontsize': 50}) 31 | plt.axis("off") 32 | -------------------------------------------------------------------------------- /mglearn/plot_grid_search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.svm import SVC 4 | from sklearn.model_selection import GridSearchCV, train_test_split 5 | from sklearn.datasets import load_iris 6 | import pandas as pd 7 | 8 | 9 | def plot_cross_val_selection(): 10 | iris = load_iris() 11 | X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, 12 | iris.target, 13 | random_state=0) 14 | 15 | param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 16 | 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]} 17 | grid_search = GridSearchCV(SVC(), param_grid, cv=5, 18 | return_train_score=True) 19 | grid_search.fit(X_trainval, y_trainval) 20 | results = pd.DataFrame(grid_search.cv_results_)[15:] 21 | 22 | best = np.argmax(results.mean_test_score.values) 23 | plt.figure(figsize=(10, 3)) 24 | plt.xlim(-1, len(results)) 25 | plt.ylim(0, 1.1) 26 | for i, (_, row) in enumerate(results.iterrows()): 27 | scores = row[['split%d_test_score' % i for i in range(5)]] 28 | marker_cv, = plt.plot([i] * 5, scores, '^', c='gray', markersize=5, 29 | alpha=.5) 30 | marker_mean, = plt.plot(i, row.mean_test_score, 'v', c='none', alpha=1, 31 | markersize=10, markeredgecolor='k') 32 | if i == best: 33 | marker_best, = plt.plot(i, row.mean_test_score, 'o', c='red', 34 | fillstyle="none", alpha=1, markersize=20, 35 | markeredgewidth=3) 36 | plt.xticks(range(len(results)), [str(x).strip("{}").replace("'", "") for x 37 | in results['params']], 38 | rotation=90) 39 | plt.ylabel("Validation accuracy") 40 | plt.xlabel("Parameter settings") 41 | plt.legend([marker_cv, marker_mean, marker_best], 42 | ["cv accuracy", "mean accuracy", "best parameter setting"], 43 | loc=(1.05, .4)) 44 | 45 | 46 | def plot_grid_search_overview(): 47 | plt.figure(figsize=(10, 3), dpi=70) 48 | axes = plt.gca() 49 | axes.yaxis.set_visible(False) 50 | axes.xaxis.set_visible(False) 51 | axes.set_frame_on(False) 52 | 53 | 
    # helper: draw one labeled, rounded box and, if a target box is given,
    # an arrow from this box to the target
    def draw(ax, text, start, target=None):
54 |         if target is not None:
55 |             patchB = target.get_bbox_patch()
56 |             end = target.get_position()
57 |         else:
58 |             end = start
59 |             patchB = None
60 |         annotation = ax.annotate(text, end, start, xycoords='axes pixels',
61 |                                  textcoords='axes pixels', size=20,
62 |                                  arrowprops=dict(
63 |                                      arrowstyle="-|>", fc="w", ec="k",
64 |                                      patchB=patchB,
65 |                                      connectionstyle="arc3,rad=0.0"),
66 |                                  bbox=dict(boxstyle="round", fc="w"),
67 |                                  horizontalalignment="center",
68 |                                  verticalalignment="center")
69 |         plt.draw()
70 |         return annotation
71 | 
72 |     step = 100
73 |     grr = 400
74 | 
75 |     final_evaluation = draw(axes, "final evaluation", (5 * step, grr - 3 *
76 |                                                        step))
77 |     retrained_model = draw(axes, "retrained model", (3 * step, grr - 3 * step),
78 |                            final_evaluation)
79 |     best_parameters = draw(axes, "best parameters", (.5 * step, grr - 3 *
80 |                                                      step), retrained_model)
81 |     cross_validation = draw(axes, "cross-validation", (.5 * step, grr - 2 *
82 |                                                        step), best_parameters)
83 |     draw(axes, "parameter grid", (0.0, grr - 0), cross_validation)
84 |     training_data = draw(axes, "training data", (2 * step, grr - step),
85 |                          cross_validation)
86 |     draw(axes, "training data", (2 * step, grr - step), retrained_model)
87 |     test_data = draw(axes, "test data", (5 * step, grr - step),
88 |                      final_evaluation)
89 |     draw(axes, "data set", (3.5 * step, grr - 0.0), training_data)
90 |     draw(axes, "data set", (3.5 * step, grr - 0.0), test_data)
91 |     plt.ylim(0, 1)
92 |     plt.xlim(0, 1.5)
--------------------------------------------------------------------------------
/mglearn/plot_helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib as mpl
3 | import matplotlib.pyplot as plt
4 | from matplotlib.colors import ListedColormap, colorConverter, LinearSegmentedColormap
5 | 
6 | 
7 | cm_cycle = ListedColormap(['#0000aa', '#ff5050', '#50ff50', '#9040a0', '#fff000'])
8 | cm3 = ListedColormap(['#0000aa', '#ff2020', '#50ff50'])
9 | cm2 = ListedColormap(['#0000aa', '#ff2020'])
10 | 
11 | # create a smooth transition from the first to the second color of cm3
12 | # similar to RdBu but with our red and blue, also not going through white,
13 | # which is really bad for greyscale
14 | 
15 | cdict = {'red': [(0.0, 0.0, cm2(0)[0]),
16 |                  (1.0, cm2(1)[0], 1.0)],
17 | 
18 |          'green': [(0.0, 0.0, cm2(0)[1]),
19 |                    (1.0, cm2(1)[1], 1.0)],
20 | 
21 |          'blue': [(0.0, 0.0, cm2(0)[2]),
22 |                   (1.0, cm2(1)[2], 1.0)]}
23 | 
24 | ReBl = LinearSegmentedColormap("ReBl", cdict)
25 | 
26 | 
27 | def discrete_scatter(x1, x2, y=None, markers=None, s=10, ax=None,
28 |                      labels=None, padding=.2, alpha=1, c=None, markeredgewidth=None):
29 |     """Adaptation of matplotlib.pyplot.scatter to plot classes or clusters.
30 | 
31 |     Parameters
32 |     ----------
33 | 
34 |     x1 : nd-array
35 |         input data, first axis
36 | 
37 |     x2 : nd-array
38 |         input data, second axis
39 | 
40 |     y : nd-array
41 |         input data, discrete labels
42 | 
43 |     ax : matplotlib axes, optional
44 |         Axes to plot into; defaults to the current axes.
45 | 
46 |     markers : list of string
47 |         List of markers to use, or None (which cycles through a default list).
48 | 
49 |     s : int or float
50 |         Size of the marker
51 | 
52 |     padding : float
53 |         Fraction of the dataset range to use for padding the axes.
54 | 
55 |     alpha : float
56 |         Alpha value for all points.
57 | """ 58 | if ax is None: 59 | ax = plt.gca() 60 | 61 | if y is None: 62 | y = np.zeros(len(x1)) 63 | 64 | unique_y = np.unique(y) 65 | 66 | if markers is None: 67 | markers = ['o', '^', 'v', 'D', 's', '*', 'p', 'h', 'H', '8', '<', '>'] * 10 68 | 69 | if len(markers) == 1: 70 | markers = markers * len(unique_y) 71 | 72 | if labels is None: 73 | labels = unique_y 74 | 75 | # lines in the matplotlib sense, not actual lines 76 | lines = [] 77 | 78 | current_cycler = mpl.rcParams['axes.prop_cycle'] 79 | 80 | for i, (yy, cycle) in enumerate(zip(unique_y, current_cycler())): 81 | mask = y == yy 82 | # if c is none, use color cycle 83 | if c is None: 84 | color = cycle['color'] 85 | elif len(c) > 1: 86 | color = c[i] 87 | else: 88 | color = c 89 | # use light edge for dark markers 90 | if np.mean(colorConverter.to_rgb(color)) < .4: 91 | markeredgecolor = "grey" 92 | else: 93 | markeredgecolor = "black" 94 | 95 | lines.append(ax.plot(x1[mask], x2[mask], markers[i], markersize=s, 96 | label=labels[i], alpha=alpha, c=color, 97 | markeredgewidth=markeredgewidth, 98 | markeredgecolor=markeredgecolor)[0]) 99 | 100 | if padding != 0: 101 | pad1 = x1.std() * padding 102 | pad2 = x2.std() * padding 103 | xlim = ax.get_xlim() 104 | ylim = ax.get_ylim() 105 | ax.set_xlim(min(x1.min() - pad1, xlim[0]), max(x1.max() + pad1, xlim[1])) 106 | ax.set_ylim(min(x2.min() - pad2, ylim[0]), max(x2.max() + pad2, ylim[1])) 107 | 108 | return lines 109 | -------------------------------------------------------------------------------- /mglearn/plot_improper_preprocessing.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def make_bracket(s, xy, textxy, width, ax): 5 | annotation = ax.annotate( 6 | s, xy, textxy, ha="center", va="center", size=20, 7 | arrowprops=dict(arrowstyle="-[", fc="w", ec="k", 8 | lw=2,), bbox=dict(boxstyle="square", fc="w")) 9 | annotation.arrow_patch.get_arrowstyle().widthB = width 10 | 11 | 12 | def plot_improper_processing(): 13 | fig, axes = plt.subplots(2, 1, figsize=(15, 10)) 14 | 15 | for axis in axes: 16 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], left=[0, 12, 15], 17 | color=['white', 'grey', 'grey'], hatch="//", 18 | align='edge', edgecolor='k') 19 | bars[2].set_hatch(r"") 20 | axis.set_yticks(()) 21 | axis.set_frame_on(False) 22 | axis.set_ylim(-.1, 6) 23 | axis.set_xlim(-0.1, 20.1) 24 | axis.set_xticks(()) 25 | axis.tick_params(length=0, labeltop=True, labelbottom=False) 26 | axis.text(6, -.3, "training folds", 27 | fontdict={'fontsize': 14}, horizontalalignment="center") 28 | axis.text(13.5, -.3, "validation fold", 29 | fontdict={'fontsize': 14}, horizontalalignment="center") 30 | axis.text(17.5, -.3, "test set", 31 | fontdict={'fontsize': 14}, horizontalalignment="center") 32 | 33 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[0]) 34 | make_bracket("SVC fit", (6, 3), (6, 4), 12, axes[0]) 35 | make_bracket("SVC predict", (13.4, 3), (13.4, 4), 2.5, axes[0]) 36 | 37 | axes[0].set_title("Cross validation") 38 | axes[1].set_title("Test set prediction") 39 | 40 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[1]) 41 | make_bracket("SVC fit", (7.5, 3), (7.5, 4), 15, axes[1]) 42 | make_bracket("SVC predict", (17.5, 3), (17.5, 4), 4.8, axes[1]) 43 | 44 | 45 | def plot_proper_processing(): 46 | fig, axes = plt.subplots(2, 1, figsize=(15, 8)) 47 | 48 | for axis in axes: 49 | bars = axis.barh([0, 0, 0], [11.9, 2.9, 4.9], 50 | left=[0, 12, 15], color=['white', 'grey', 'grey'], 51 | 
hatch="//", align='edge', edgecolor='k') 52 | bars[2].set_hatch(r"") 53 | axis.set_yticks(()) 54 | axis.set_frame_on(False) 55 | axis.set_ylim(-.1, 4.5) 56 | axis.set_xlim(-0.1, 20.1) 57 | axis.set_xticks(()) 58 | axis.tick_params(length=0, labeltop=True, labelbottom=False) 59 | axis.text(6, -.3, "training folds", fontdict={'fontsize': 14}, 60 | horizontalalignment="center") 61 | axis.text(13.5, -.3, "validation fold", fontdict={'fontsize': 14}, 62 | horizontalalignment="center") 63 | axis.text(17.5, -.3, "test set", fontdict={'fontsize': 14}, 64 | horizontalalignment="center") 65 | 66 | make_bracket("scaler fit", (6, 1.3), (6, 2.), 12, axes[0]) 67 | make_bracket("SVC fit", (6, 3), (6, 4), 12, axes[0]) 68 | make_bracket("SVC predict", (13.4, 3), (13.4, 4), 2.5, axes[0]) 69 | 70 | axes[0].set_title("Cross validation") 71 | axes[1].set_title("Test set prediction") 72 | 73 | make_bracket("scaler fit", (7.5, 1.3), (7.5, 2.), 15, axes[1]) 74 | make_bracket("SVC fit", (7.5, 3), (7.5, 4), 15, axes[1]) 75 | make_bracket("SVC predict", (17.5, 3), (17.5, 4), 4.8, axes[1]) 76 | fig.subplots_adjust(hspace=.3) 77 | -------------------------------------------------------------------------------- /mglearn/plot_interactive_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.tree import DecisionTreeClassifier 5 | 6 | from io import StringIO 7 | from sklearn.tree import export_graphviz 8 | from imageio import imread 9 | from scipy import ndimage 10 | from sklearn.datasets import make_moons 11 | 12 | import re 13 | 14 | from .tools import discrete_scatter 15 | from .plot_helpers import cm2 16 | 17 | 18 | def tree_image(tree, fout=None): 19 | try: 20 | import graphviz 21 | except ImportError: 22 | # make a hacky white plot 23 | x = np.ones((10, 10)) 24 | x[0, 0] = 0 25 | return x 26 | dot_data = StringIO() 27 | export_graphviz(tree, out_file=dot_data, max_depth=3, impurity=False) 28 | data = dot_data.getvalue() 29 | data = re.sub(r"samples = [0-9]+\\n", "", data) 30 | data = re.sub(r"\\nsamples = [0-9]+", "", data) 31 | data = re.sub(r"value", "counts", data) 32 | 33 | graph = graphviz.Source(data, format="png") 34 | if fout is None: 35 | fout = "tmp" 36 | graph.render(fout) 37 | return imread(fout + ".png") 38 | 39 | 40 | def plot_tree_progressive(): 41 | X, y = make_moons(n_samples=100, noise=0.25, random_state=3) 42 | plt.figure() 43 | ax = plt.gca() 44 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax) 45 | ax.set_xlabel("Feature 0") 46 | ax.set_ylabel("Feature 1") 47 | plt.legend(["Class 0", "Class 1"], loc='best') 48 | 49 | axes = [] 50 | for i in range(3): 51 | fig, ax = plt.subplots(1, 2, figsize=(12, 4), 52 | subplot_kw={'xticks': (), 'yticks': ()}) 53 | axes.append(ax) 54 | axes = np.array(axes) 55 | 56 | for i, max_depth in enumerate([1, 2, 9]): 57 | tree = plot_tree(X, y, max_depth=max_depth, ax=axes[i, 0]) 58 | axes[i, 1].imshow(tree_image(tree)) 59 | axes[i, 1].set_axis_off() 60 | 61 | 62 | def plot_tree_partition(X, y, tree, ax=None): 63 | if ax is None: 64 | ax = plt.gca() 65 | eps = X.std() / 2. 
66 | 67 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 68 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 69 | xx = np.linspace(x_min, x_max, 1000) 70 | yy = np.linspace(y_min, y_max, 1000) 71 | 72 | X1, X2 = np.meshgrid(xx, yy) 73 | X_grid = np.c_[X1.ravel(), X2.ravel()] 74 | 75 | Z = tree.predict(X_grid) 76 | Z = Z.reshape(X1.shape) 77 | faces = tree.apply(X_grid) 78 | faces = faces.reshape(X1.shape) 79 | border = ndimage.laplace(faces) != 0 80 | ax.contourf(X1, X2, Z, alpha=.4, cmap=cm2, levels=[0, .5, 1]) 81 | ax.scatter(X1[border], X2[border], marker='.', s=1) 82 | 83 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax) 84 | ax.set_xlim(x_min, x_max) 85 | ax.set_ylim(y_min, y_max) 86 | ax.set_xticks(()) 87 | ax.set_yticks(()) 88 | return ax 89 | 90 | 91 | def plot_tree(X, y, max_depth=1, ax=None): 92 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0).fit(X, y) 93 | ax = plot_tree_partition(X, y, tree, ax=ax) 94 | ax.set_title("depth = %d" % max_depth) 95 | return tree 96 | -------------------------------------------------------------------------------- /mglearn/plot_kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.datasets import make_blobs 4 | from sklearn.cluster import KMeans 5 | from sklearn.metrics import pairwise_distances 6 | import matplotlib.pyplot as plt 7 | import matplotlib as mpl 8 | from cycler import cycler 9 | 10 | from .tools import discrete_scatter 11 | from .plot_2d_separator import plot_2d_classification 12 | from .plot_helpers import cm3 13 | 14 | 15 | def plot_kmeans_algorithm(): 16 | 17 | X, y = make_blobs(random_state=1) 18 | # we don't want cyan in there 19 | with mpl.rc_context(rc={'axes.prop_cycle': cycler('color', ['#0000aa', 20 | '#ff2020', 21 | '#50ff50'])}): 22 | fig, axes = plt.subplots(3, 3, figsize=(10, 8), subplot_kw={'xticks': (), 'yticks': ()}) 23 | axes = axes.ravel() 24 | axes[0].set_title("Input data") 25 | discrete_scatter(X[:, 0], X[:, 1], ax=axes[0], markers=['o'], c='w') 26 | 27 | axes[1].set_title("Initialization") 28 | init = X[:3, :] 29 | discrete_scatter(X[:, 0], X[:, 1], ax=axes[1], markers=['o'], c='w') 30 | discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2], ax=axes[1], 31 | markers=['^'], markeredgewidth=2) 32 | 33 | axes[2].set_title("Assign Points (1)") 34 | km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X) 35 | centers = km.cluster_centers_ 36 | # need to compute labels by hand.
scikit-learn does two E-steps (point assignments) for max_iter=1 37 | # (and it's totally my fault) 38 | labels = np.argmin(pairwise_distances(init, X), axis=0) 39 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], 40 | ax=axes[2]) 41 | discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2], 42 | ax=axes[2], markers=['^'], markeredgewidth=2) 43 | 44 | axes[3].set_title("Recompute Centers (1)") 45 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], 46 | ax=axes[3]) 47 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2], 48 | ax=axes[3], markers=['^'], markeredgewidth=2) 49 | 50 | axes[4].set_title("Reassign Points (2)") 51 | km = KMeans(n_clusters=3, init=init, max_iter=1, n_init=1).fit(X) 52 | labels = km.labels_ 53 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], 54 | ax=axes[4]) 55 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2], 56 | ax=axes[4], markers=['^'], markeredgewidth=2) 57 | 58 | km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X) 59 | axes[5].set_title("Recompute Centers (2)") 60 | centers = km.cluster_centers_ 61 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], 62 | ax=axes[5]) 63 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2], 64 | ax=axes[5], markers=['^'], markeredgewidth=2) 65 | 66 | axes[6].set_title("Reassign Points (3)") 67 | labels = km.labels_ 68 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], 69 | ax=axes[6]) 70 | markers = discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2], 71 | ax=axes[6], markers=['^'], 72 | markeredgewidth=2) 73 | 74 | axes[7].set_title("Recompute Centers (3)") 75 | km = KMeans(n_clusters=3, init=init, max_iter=3, n_init=1).fit(X) 76 | centers = km.cluster_centers_ 77 | discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], 78 | ax=axes[7]) 79 | discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2], 80 | ax=axes[7], markers=['^'], markeredgewidth=2) 81 | axes[8].set_axis_off() 82 | axes[8].legend(markers, ["Cluster 0", "Cluster 1", "Cluster 2"], loc='best') 83 | 84 | 85 | def plot_kmeans_boundaries(): 86 | X, y = make_blobs(random_state=1) 87 | init = X[:3, :] 88 | km = KMeans(n_clusters=3, init=init, max_iter=2, n_init=1).fit(X) 89 | discrete_scatter(X[:, 0], X[:, 1], km.labels_, markers=['o']) 90 | discrete_scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], 91 | [0, 1, 2], markers=['^'], markeredgewidth=2) 92 | plot_2d_classification(km, X, cm=cm3, alpha=.4) 93 | 94 | 95 | def plot_kmeans_faces(km, pca, X_pca, X_people, y_people, target_names): 96 | n_clusters = 10 97 | image_shape = (87, 65) 98 | fig, axes = plt.subplots(n_clusters, 11, subplot_kw={'xticks': (), 'yticks': ()}, 99 | figsize=(10, 15), gridspec_kw={"hspace": .3}) 100 | 101 | for cluster in range(n_clusters): 102 | center = km.cluster_centers_[cluster] 103 | mask = km.labels_ == cluster 104 | dists = np.sum((X_pca - center) ** 2, axis=1) 105 | dists[~mask] = np.inf 106 | inds = np.argsort(dists)[:5] 107 | dists[~mask] = -np.inf 108 | inds = np.r_[inds, np.argsort(dists)[-5:]] 109 | axes[cluster, 0].imshow(pca.inverse_transform(center).reshape(image_shape), vmin=0, vmax=1) 110 | for image, label, _, ax in zip(X_people[inds], y_people[inds], 111 | km.labels_[inds], axes[cluster, 1:]): 112 | ax.imshow(image.reshape(image_shape), vmin=0, vmax=1) 113 | ax.set_title("%s" % (target_names[label].split()[-1]), fontdict={'fontsize': 9}) 114 | 115 | # add some boxes to illustrate which are similar and which dissimilar 116 | rec = plt.Rectangle([-5, -30], 73, 1295, fill=False, lw=2) 117 | rec =
axes[0, 0].add_patch(rec) 118 | rec.set_clip_on(False) 119 | axes[0, 0].text(0, -40, "Center") 120 | 121 | rec = plt.Rectangle([-5, -30], 385, 1295, fill=False, lw=2) 122 | rec = axes[0, 1].add_patch(rec) 123 | rec.set_clip_on(False) 124 | axes[0, 1].text(0, -40, "Close to center") 125 | 126 | rec = plt.Rectangle([-5, -30], 385, 1295, fill=False, lw=2) 127 | rec = axes[0, 6].add_patch(rec) 128 | rec.set_clip_on(False) 129 | axes[0, 6].text(0, -40, "Far from center") 130 | -------------------------------------------------------------------------------- /mglearn/plot_kneighbors_regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | 6 | 7 | def plot_kneighbors_regularization(): 8 | rnd = np.random.RandomState(42) 9 | x = np.linspace(-3, 3, 100) 10 | y_no_noise = np.sin(4 * x) + x 11 | y = y_no_noise + rnd.normal(size=len(x)) 12 | X = x[:, np.newaxis] 13 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 14 | 15 | x_test = np.linspace(-3, 3, 1000) 16 | 17 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()): 18 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors) 19 | kneighbor_regression.fit(X, y) 20 | ax.plot(x, y_no_noise, label="true function") 21 | ax.plot(x, y, "o", label="data") 22 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]), 23 | label="prediction") 24 | ax.legend() 25 | ax.set_title("n_neighbors = %d" % n_neighbors) -------------------------------------------------------------------------------- /mglearn/plot_knn_classification.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.metrics import euclidean_distances 5 | from sklearn.neighbors import KNeighborsClassifier 6 | 7 | from .datasets import make_forge 8 | from .plot_helpers import discrete_scatter 9 | 10 | 11 | def plot_knn_classification(n_neighbors=1): 12 | X, y = make_forge() 13 | 14 | X_test = np.array([[8.2, 3.66214339], [9.9, 3.2], [11.2, .5]]) 15 | dist = euclidean_distances(X, X_test) 16 | closest = np.argsort(dist, axis=0) 17 | 18 | for x, neighbors in zip(X_test, closest.T): 19 | for neighbor in neighbors[:n_neighbors]: 20 | plt.arrow(x[0], x[1], X[neighbor, 0] - x[0], 21 | X[neighbor, 1] - x[1], head_width=0, fc='k', ec='k') 22 | 23 | clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y) 24 | test_points = discrete_scatter(X_test[:, 0], X_test[:, 1], clf.predict(X_test), markers="*") 25 | training_points = discrete_scatter(X[:, 0], X[:, 1], y) 26 | plt.legend(training_points + test_points, ["training class 0", "training class 1", 27 | "test pred 0", "test pred 1"]) 28 | -------------------------------------------------------------------------------- /mglearn/plot_knn_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | from sklearn.metrics import euclidean_distances 6 | 7 | from .datasets import make_wave 8 | from .plot_helpers import cm3 9 | 10 | 11 | def plot_knn_regression(n_neighbors=1): 12 | X, y = make_wave(n_samples=40) 13 | X_test = np.array([[-1.5], [0.9], [1.5]]) 14 | 15 | dist = euclidean_distances(X, X_test) 16 | closest = np.argsort(dist, axis=0) 17 | 18 | plt.figure(figsize=(10, 6)) 19 | 20 | reg =
KNeighborsRegressor(n_neighbors=n_neighbors).fit(X, y) 21 | y_pred = reg.predict(X_test) 22 | 23 | for x, y_, neighbors in zip(X_test, y_pred, closest.T): 24 | for neighbor in neighbors[:n_neighbors]: 25 | plt.arrow(x[0], y_, X[neighbor, 0] - x[0], y[neighbor] - y_, 26 | head_width=0, fc='k', ec='k') 27 | 28 | train, = plt.plot(X, y, 'o', c=cm3(0)) 29 | test, = plt.plot(X_test, -3 * np.ones(len(X_test)), '*', c=cm3(2), 30 | markersize=20) 31 | pred, = plt.plot(X_test, y_pred, '*', c=cm3(0), markersize=20) 32 | plt.vlines(X_test, -3.1, 3.1, linestyle="--") 33 | plt.legend([train, test, pred], 34 | ["training data/target", "test data", "test prediction"], 35 | ncol=3, loc=(.1, 1.025)) 36 | plt.ylim(-3.1, 3.1) 37 | plt.xlabel("Feature") 38 | plt.ylabel("Target") 39 | -------------------------------------------------------------------------------- /mglearn/plot_linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.model_selection import train_test_split 6 | from .datasets import make_wave 7 | from .plot_helpers import cm2 8 | 9 | 10 | def plot_linear_regression_wave(): 11 | X, y = make_wave(n_samples=60) 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 13 | 14 | line = np.linspace(-3, 3, 100).reshape(-1, 1) 15 | 16 | lr = LinearRegression().fit(X_train, y_train) 17 | print("w[0]: %f b: %f" % (lr.coef_[0], lr.intercept_)) 18 | 19 | plt.figure(figsize=(8, 8)) 20 | plt.plot(line, lr.predict(line)) 21 | plt.plot(X, y, 'o', c=cm2(0)) 22 | ax = plt.gca() 23 | ax.spines['left'].set_position('center') 24 | ax.spines['right'].set_color('none') 25 | ax.spines['bottom'].set_position('center') 26 | ax.spines['top'].set_color('none') 27 | ax.set_ylim(-3, 3) 28 | #ax.set_xlabel("Feature") 29 | #ax.set_ylabel("Target") 30 | ax.legend(["model", "training data"], loc="best") 31 | ax.grid(True) 32 | ax.set_aspect('equal') 33 | -------------------------------------------------------------------------------- /mglearn/plot_linear_svc_regularization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import LinearSVC 4 | from sklearn.datasets import make_blobs 5 | 6 | from .plot_helpers import discrete_scatter 7 | 8 | 9 | def plot_linear_svc_regularization(): 10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 11 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 12 | 13 | # a carefully hand-designed dataset 14 | y[7] = 0 15 | y[27] = 0 16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 18 | 19 | for ax, C in zip(axes, [1e-2, 10, 1e3]): 20 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax) 21 | 22 | svm = LinearSVC(C=C, tol=0.00001, dual=False).fit(X, y) 23 | w = svm.coef_[0] 24 | a = -w[0] / w[1] 25 | xx = np.linspace(6, 13) 26 | yy = a * xx - (svm.intercept_[0]) / w[1] 27 | ax.plot(xx, yy, c='k') 28 | ax.set_xlim(x_min, x_max) 29 | ax.set_ylim(y_min, y_max) 30 | ax.set_xticks(()) 31 | ax.set_yticks(()) 32 | ax.set_title("C = %f" % C) 33 | axes[0].legend(loc="best") 34 | -------------------------------------------------------------------------------- /mglearn/plot_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 |
4 | from .tools import plot_2d_separator, plot_2d_scores, cm, discrete_scatter 5 | from .plot_helpers import ReBl 6 | 7 | 8 | def plot_confusion_matrix_illustration(): 9 | plt.figure(figsize=(8, 8)) 10 | confusion = np.array([[401, 2], [8, 39]]) 11 | plt.text(0.40, .7, confusion[0, 0], size=70, horizontalalignment='right') 12 | plt.text(0.40, .2, confusion[1, 0], size=70, horizontalalignment='right') 13 | plt.text(.90, .7, confusion[0, 1], size=70, horizontalalignment='right') 14 | plt.text(.90, 0.2, confusion[1, 1], size=70, horizontalalignment='right') 15 | plt.xticks([.25, .75], ["predicted 'not nine'", "predicted 'nine'"], size=20) 16 | plt.yticks([.25, .75], ["true 'nine'", "true 'not nine'"], size=20) 17 | plt.plot([.5, .5], [0, 1], '--', c='k') 18 | plt.plot([0, 1], [.5, .5], '--', c='k') 19 | 20 | plt.xlim(0, 1) 21 | plt.ylim(0, 1) 22 | 23 | 24 | def plot_binary_confusion_matrix(): 25 | plt.text(0.45, .6, "TN", size=100, horizontalalignment='right') 26 | plt.text(0.45, .1, "FN", size=100, horizontalalignment='right') 27 | plt.text(.95, .6, "FP", size=100, horizontalalignment='right') 28 | plt.text(.95, 0.1, "TP", size=100, horizontalalignment='right') 29 | plt.xticks([.25, .75], ["predicted negative", "predicted positive"], size=15) 30 | plt.yticks([.25, .75], ["positive class", "negative class"], size=15) 31 | plt.plot([.5, .5], [0, 1], '--', c='k') 32 | plt.plot([0, 1], [.5, .5], '--', c='k') 33 | 34 | plt.xlim(0, 1) 35 | plt.ylim(0, 1) 36 | 37 | 38 | def plot_decision_threshold(): 39 | from sklearn.datasets import make_blobs 40 | from sklearn.svm import SVC 41 | from sklearn.model_selection import train_test_split 42 | 43 | X, y = make_blobs(n_samples=(400, 50), cluster_std=[7.0, 2], 44 | random_state=22) 45 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 46 | 47 | fig, axes = plt.subplots(2, 3, figsize=(15, 8), subplot_kw={'xticks': (), 'yticks': ()}) 48 | plt.suptitle("decision_threshold") 49 | axes[0, 0].set_title("training data") 50 | discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 0]) 51 | 52 | svc = SVC(gamma=.05).fit(X_train, y_train) 53 | axes[0, 1].set_title("decision with threshold 0") 54 | discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 1]) 55 | plot_2d_scores(svc, X_train, function="decision_function", alpha=.7, 56 | ax=axes[0, 1], cm=ReBl) 57 | plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1]) 58 | axes[0, 2].set_title("decision with threshold -0.8") 59 | discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 2]) 60 | plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2], threshold=-.8) 61 | plot_2d_scores(svc, X_train, function="decision_function", alpha=.7, 62 | ax=axes[0, 2], cm=ReBl) 63 | 64 | axes[1, 0].set_axis_off() 65 | 66 | mask = np.abs(X_train[:, 1] - 7) < 5 67 | n_points = np.sum(mask) 68 | 69 | line = np.linspace(X_train.min(), X_train.max(), 100) 70 | axes[1, 1].set_title("Cross-section with threshold 0") 71 | axes[1, 1].plot(line, svc.decision_function(np.c_[line, 10 * np.ones(100)]), c='k') 72 | dec = svc.decision_function(np.c_[line, 10 * np.ones(100)]) 73 | contour = (dec > 0).reshape(1, -1).repeat(10, axis=0) 74 | axes[1, 1].contourf(line, np.linspace(-1.5, 1.5, 10), contour, alpha=0.4, cmap=cm) 75 | discrete_scatter(X_train[mask, 0], np.zeros(n_points), y_train[mask], ax=axes[1, 1]) 76 | axes[1, 1].set_xlim(X_train.min(), X_train.max()) 77 | axes[1, 1].set_ylim(-1.5, 1.5) 78 | axes[1, 1].set_xticks(()) 79 | axes[1, 1].set_ylabel("Decision value") 80 | 81 | contour2
= (dec > -.8).reshape(1, -1).repeat(10, axis=0) 82 | axes[1, 2].set_title("Cross-section with threshold -0.8") 83 | axes[1, 2].contourf(line, np.linspace(-1.5, 1.5, 10), contour2, alpha=0.4, cmap=cm) 84 | discrete_scatter(X_train[mask, 0], np.zeros(n_points), y_train[mask], alpha=.1, ax=axes[1, 2]) 85 | axes[1, 2].plot(line, svc.decision_function(np.c_[line, 10 * np.ones(100)]), c='k') 86 | axes[1, 2].set_xlim(X_train.min(), X_train.max()) 87 | axes[1, 2].set_ylim(-1.5, 1.5) 88 | axes[1, 2].set_xticks(()) 89 | axes[1, 2].set_ylabel("Decision value") 90 | axes[1, 0].legend(['negative class', 'positive class']) 91 | -------------------------------------------------------------------------------- /mglearn/plot_nmf.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import NMF 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from joblib import Memory 6 | 7 | try: 8 | memory = Memory(cachedir="cache") 9 | except TypeError: 10 | # joblib.Memory changed its API in 0.12 11 | memory = Memory(location="cache", verbose=0) 12 | 13 | 14 | def plot_nmf_illustration(): 15 | rnd = np.random.RandomState(5) 16 | X_ = rnd.normal(size=(300, 2)) 17 | # Add 8 to make sure every point lies in the positive part of the space 18 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) + 8 19 | 20 | nmf = NMF(random_state=0) 21 | nmf.fit(X_blob) 22 | X_nmf = nmf.transform(X_blob) 23 | 24 | fig, axes = plt.subplots(1, 2, figsize=(15, 5)) 25 | 26 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, 27 | s=60, cmap='viridis') 28 | axes[0].set_xlabel("feature 1") 29 | axes[0].set_ylabel("feature 2") 30 | axes[0].set_xlim(0, 12) 31 | axes[0].set_ylim(0, 12) 32 | axes[0].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1, 33 | head_width=.3, color='k') 34 | axes[0].arrow(0, 0, nmf.components_[1, 0], nmf.components_[1, 1], width=.1, 35 | head_width=.3, color='k') 36 | axes[0].set_aspect('equal') 37 | axes[0].set_title("NMF with two components") 38 | 39 | # second plot 40 | nmf = NMF(random_state=0, n_components=1) 41 | nmf.fit(X_blob) 42 | 43 | axes[1].scatter(X_blob[:, 0], X_blob[:, 1], c=X_nmf[:, 0], linewidths=0, 44 | s=60, cmap='viridis') 45 | axes[1].set_xlabel("feature 1") 46 | axes[1].set_ylabel("feature 2") 47 | axes[1].set_xlim(0, 12) 48 | axes[1].set_ylim(0, 12) 49 | axes[1].arrow(0, 0, nmf.components_[0, 0], nmf.components_[0, 1], width=.1, 50 | head_width=.3, color='k') 51 | 52 | axes[1].set_aspect('equal') 53 | axes[1].set_title("NMF with one component") 54 | 55 | 56 | @memory.cache 57 | def nmf_faces(X_train, X_test): 58 | # Build NMF models with 10, 50, 100 and 500 components 59 | # this list will hold the back-transformed test-data 60 | reduced_images = [] 61 | for n_components in [10, 50, 100, 500]: 62 | # build the NMF model 63 | nmf = NMF(n_components=n_components, random_state=0) 64 | nmf.fit(X_train) 65 | # transform the test data (afterwards has n_components many dimensions) 66 | X_test_nmf = nmf.transform(X_test) 67 | # back-transform the transformed test-data 68 | # (afterwards it's in the original space again) 69 | X_test_back = np.dot(X_test_nmf, nmf.components_) 70 | reduced_images.append(X_test_back) 71 | return reduced_images 72 | 73 | 74 | def plot_nmf_faces(X_train, X_test, image_shape): 75 | reduced_images = nmf_faces(X_train, X_test) 76 | 77 | # plot the first three images in the test set: 78 | fig, axes = plt.subplots(3, 5, figsize=(15, 12), 79 |
subplot_kw={'xticks': (), 'yticks': ()}) 80 | for i, ax in enumerate(axes): 81 | # plot original image 82 | ax[0].imshow(X_test[i].reshape(image_shape), 83 | vmin=0, vmax=1) 84 | # plot the four back-transformed images 85 | for a, X_test_back in zip(ax[1:], reduced_images): 86 | a.imshow(X_test_back[i].reshape(image_shape), vmin=0, vmax=1) 87 | 88 | # label the top row 89 | axes[0, 0].set_title("original image") 90 | for ax, n_components in zip(axes[0, 1:], [10, 50, 100, 500]): 91 | ax.set_title("%d components" % n_components) 92 | -------------------------------------------------------------------------------- /mglearn/plot_nn_graphs.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def plot_logistic_regression_graph(): 4 | import graphviz 5 | lr_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'}, 6 | graph_attr={'rankdir': 'LR', 'splines': 'line'}) 7 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0") 8 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2") 9 | 10 | for i in range(4): 11 | inputs.node("x[%d]" % i, labelloc="c") 12 | inputs.body.append('label = "inputs"') 13 | inputs.body.append('color = "white"') 14 | 15 | lr_graph.subgraph(inputs) 16 | 17 | output.body.append('label = "output"') 18 | output.body.append('color = "white"') 19 | output.node("y") 20 | 21 | lr_graph.subgraph(output) 22 | 23 | for i in range(4): 24 | lr_graph.edge("x[%d]" % i, "y", label="w[%d]" % i) 25 | return lr_graph 26 | 27 | 28 | def plot_single_hidden_layer_graph(): 29 | import graphviz 30 | nn_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'}, 31 | graph_attr={'rankdir': 'LR', 'splines': 'line'}) 32 | 33 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0") 34 | hidden = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_1") 35 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2") 36 | 37 | for i in range(4): 38 | inputs.node("x[%d]" % i) 39 | 40 | inputs.body.append('label = "inputs"') 41 | inputs.body.append('color = "white"') 42 | 43 | hidden.body.append('label = "hidden layer"') 44 | hidden.body.append('color = "white"') 45 | 46 | for i in range(3): 47 | hidden.node("h%d" % i, label="h[%d]" % i) 48 | 49 | output.node("y") 50 | output.body.append('label = "output"') 51 | output.body.append('color = "white"') 52 | 53 | nn_graph.subgraph(inputs) 54 | nn_graph.subgraph(hidden) 55 | nn_graph.subgraph(output) 56 | 57 | for i in range(4): 58 | for j in range(3): 59 | nn_graph.edge("x[%d]" % i, "h%d" % j) 60 | 61 | for i in range(3): 62 | nn_graph.edge("h%d" % i, "y") 63 | return nn_graph 64 | 65 | 66 | def plot_two_hidden_layer_graph(): 67 | import graphviz 68 | nn_graph = graphviz.Digraph(node_attr={'shape': 'circle', 'fixedsize': 'True'}, 69 | graph_attr={'rankdir': 'LR', 'splines': 'line'}) 70 | 71 | inputs = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_0") 72 | hidden = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_1") 73 | hidden2 = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_2") 74 | 75 | output = graphviz.Digraph(node_attr={'shape': 'circle'}, name="cluster_3") 76 | 77 | for i in range(4): 78 | inputs.node("x[%d]" % i) 79 | 80 | inputs.body.append('label = "inputs"') 81 | inputs.body.append('color = "white"') 82 | 83 | for i in range(3): 84 | hidden.node("h1[%d]" % i) 85 | 86 | for i in range(3): 87 | hidden2.node("h2[%d]" % i) 88 |
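# graphviz subgraph bodies are lists of raw DOT statements, hence the label/color strings appended below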
hidden.body.append('label = "hidden layer 1"') 90 | hidden.body.append('color = "white"') 91 | 92 | hidden2.body.append('label = "hidden layer 2"') 93 | hidden2.body.append('color = "white"') 94 | 95 | output.node("y") 96 | output.body.append('label = "output"') 97 | output.body.append('color = "white"') 98 | 99 | nn_graph.subgraph(inputs) 100 | nn_graph.subgraph(hidden) 101 | nn_graph.subgraph(hidden2) 102 | 103 | nn_graph.subgraph(output) 104 | 105 | for i in range(4): 106 | for j in range(3): 107 | nn_graph.edge("x[%d]" % i, "h1[%d]" % j, label="") 108 | 109 | for i in range(3): 110 | for j in range(3): 111 | nn_graph.edge("h1[%d]" % i, "h2[%d]" % j, label="") 112 | 113 | for i in range(3): 114 | nn_graph.edge("h2[%d]" % i, "y", label="") 115 | 116 | return nn_graph 117 | -------------------------------------------------------------------------------- /mglearn/plot_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import PCA 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from joblib import Memory 6 | 7 | try: 8 | memory = Memory(cachedir="cache") 9 | except TypeError: 10 | # joblib.Memory changed its API in 0.12 11 | memory = Memory(location="cache", verbose=0) 12 | 13 | def plot_pca_illustration(): 14 | rnd = np.random.RandomState(5) 15 | X_ = rnd.normal(size=(300, 2)) 16 | X_blob = np.dot(X_, rnd.normal(size=(2, 2))) + rnd.normal(size=2) 17 | 18 | pca = PCA() 19 | pca.fit(X_blob) 20 | X_pca = pca.transform(X_blob) 21 | 22 | S = X_pca.std(axis=0) 23 | 24 | fig, axes = plt.subplots(2, 2, figsize=(10, 10)) 25 | axes = axes.ravel() 26 | 27 | axes[0].set_title("Original data") 28 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, 29 | s=60, cmap='viridis') 30 | axes[0].set_xlabel("feature 1") 31 | axes[0].set_ylabel("feature 2") 32 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[0] * pca.components_[0, 0], 33 | S[0] * pca.components_[0, 1], width=.1, head_width=.3, 34 | color='k') 35 | axes[0].arrow(pca.mean_[0], pca.mean_[1], S[1] * pca.components_[1, 0], 36 | S[1] * pca.components_[1, 1], width=.1, head_width=.3, 37 | color='k') 38 | axes[0].text(-1.5, -.5, "Component 2", size=14) 39 | axes[0].text(-4, -4, "Component 1", size=14) 40 | axes[0].set_aspect('equal') 41 | 42 | axes[1].set_title("Transformed data") 43 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, 44 | s=60, cmap='viridis') 45 | axes[1].set_xlabel("First principal component") 46 | axes[1].set_ylabel("Second principal component") 47 | axes[1].set_aspect('equal') 48 | axes[1].set_ylim(-8, 8) 49 | 50 | pca = PCA(n_components=1) 51 | pca.fit(X_blob) 52 | X_inverse = pca.inverse_transform(pca.transform(X_blob)) 53 | 54 | axes[2].set_title("Transformed data w/ second component dropped") 55 | axes[2].scatter(X_pca[:, 0], np.zeros(X_pca.shape[0]), c=X_pca[:, 0], 56 | linewidths=0, s=60, cmap='viridis') 57 | axes[2].set_xlabel("First principal component") 58 | axes[2].set_aspect('equal') 59 | axes[2].set_ylim(-8, 8) 60 | 61 | axes[3].set_title("Back-rotation using only first component") 62 | axes[3].scatter(X_inverse[:, 0], X_inverse[:, 1], c=X_pca[:, 0], 63 | linewidths=0, s=60, cmap='viridis') 64 | axes[3].set_xlabel("feature 1") 65 | axes[3].set_ylabel("feature 2") 66 | axes[3].set_aspect('equal') 67 | axes[3].set_xlim(-8, 4) 68 | axes[3].set_ylim(-8, 4) 69 | 70 | 71 | def plot_pca_whitening(): 72 | rnd = np.random.RandomState(5) 73 | X_ = rnd.normal(size=(300, 2)) 74 | X_blob = np.dot(X_, 
rnd.normal(size=(2, 2))) + rnd.normal(size=2) 75 | 76 | pca = PCA(whiten=True) 77 | pca.fit(X_blob) 78 | X_pca = pca.transform(X_blob) 79 | 80 | fig, axes = plt.subplots(1, 2, figsize=(10, 10)) 81 | axes = axes.ravel() 82 | 83 | axes[0].set_title("Original data") 84 | axes[0].scatter(X_blob[:, 0], X_blob[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis') 85 | axes[0].set_xlabel("feature 1") 86 | axes[0].set_ylabel("feature 2") 87 | axes[0].set_aspect('equal') 88 | 89 | axes[1].set_title("Whitened data") 90 | axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=X_pca[:, 0], linewidths=0, s=60, cmap='viridis') 91 | axes[1].set_xlabel("First principal component") 92 | axes[1].set_ylabel("Second principal component") 93 | axes[1].set_aspect('equal') 94 | axes[1].set_xlim(-3, 4) 95 | 96 | 97 | @memory.cache 98 | def pca_faces(X_train, X_test): 99 | # copied and pasted from nmf_faces; could be refactored 100 | # Build PCA models with 10, 50, 100, 500 components 101 | # this list will hold the back-transformed test-data 102 | reduced_images = [] 103 | for n_components in [10, 50, 100, 500]: 104 | # build the PCA model 105 | pca = PCA(n_components=n_components) 106 | pca.fit(X_train) 107 | # transform the test data (afterwards has n_components many dimensions) 108 | X_test_pca = pca.transform(X_test) 109 | # back-transform the transformed test-data 110 | # (afterwards it's in the original space again) 111 | X_test_back = pca.inverse_transform(X_test_pca) 112 | reduced_images.append(X_test_back) 113 | return reduced_images 114 | 115 | 116 | def plot_pca_faces(X_train, X_test, image_shape): 117 | reduced_images = pca_faces(X_train, X_test) 118 | 119 | # plot the first three images in the test set: 120 | fig, axes = plt.subplots(3, 5, figsize=(15, 12), 121 | subplot_kw={'xticks': (), 'yticks': ()}) 122 | for i, ax in enumerate(axes): 123 | # plot original image 124 | ax[0].imshow(X_test[i].reshape(image_shape), 125 | vmin=0, vmax=1) 126 | # plot the four back-transformed images 127 | for a, X_test_back in zip(ax[1:], reduced_images): 128 | a.imshow(X_test_back[i].reshape(image_shape), vmin=0, vmax=1) 129 | 130 | # label the top row 131 | axes[0, 0].set_title("original image") 132 | for ax, n_components in zip(axes[0, 1:], [10, 50, 100, 500]): 133 | ax.set_title("%d components" % n_components) 134 | -------------------------------------------------------------------------------- /mglearn/plot_rbf_svm_parameters.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.svm import SVC 3 | from .plot_2d_separator import plot_2d_separator 4 | from .tools import make_handcrafted_dataset 5 | from .plot_helpers import discrete_scatter 6 | 7 | 8 | def plot_svm(log_C, log_gamma, ax=None): 9 | X, y = make_handcrafted_dataset() 10 | C = 10. ** log_C 11 | gamma = 10.
** log_gamma 12 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y) 13 | if ax is None: 14 | ax = plt.gca() 15 | plot_2d_separator(svm, X, ax=ax, eps=.5) 16 | # plot data 17 | discrete_scatter(X[:, 0], X[:, 1], y, ax=ax) 18 | # plot support vectors 19 | sv = svm.support_vectors_ 20 | # class labels of support vectors are given by the sign of the dual coefficients 21 | sv_labels = svm.dual_coef_.ravel() > 0 22 | discrete_scatter(sv[:, 0], sv[:, 1], sv_labels, s=15, markeredgewidth=3, ax=ax) 23 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma)) 24 | 25 | 26 | def plot_svm_interactive(): 27 | from ipywidgets import interactive, FloatSlider 28 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False) 29 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False) 30 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider) 31 | -------------------------------------------------------------------------------- /mglearn/plot_ridge.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from sklearn.linear_model import Ridge, LinearRegression 5 | from sklearn.model_selection import learning_curve, KFold 6 | 7 | from .datasets import load_extended_boston 8 | 9 | 10 | def plot_learning_curve(est, X, y): 11 | training_set_size, train_scores, test_scores = learning_curve( 12 | est, X, y, train_sizes=np.linspace(.1, 1, 20), cv=KFold(20, shuffle=True, random_state=1)) 13 | estimator_name = est.__class__.__name__ 14 | line = plt.plot(training_set_size, train_scores.mean(axis=1), '--', 15 | label="training " + estimator_name) 16 | plt.plot(training_set_size, test_scores.mean(axis=1), '-', 17 | label="test " + estimator_name, c=line[0].get_color()) 18 | plt.xlabel('Training set size') 19 | plt.ylabel('Score (R^2)') 20 | plt.ylim(0, 1.1) 21 | 22 | 23 | def plot_ridge_n_samples(): 24 | X, y = load_extended_boston() 25 | 26 | plot_learning_curve(Ridge(alpha=1), X, y) 27 | plot_learning_curve(LinearRegression(), X, y) 28 | plt.legend(loc=(0, 1.05), ncol=2, fontsize=11) 29 | -------------------------------------------------------------------------------- /mglearn/plot_scaling.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.datasets import make_blobs 4 | from sklearn.preprocessing import (StandardScaler, MinMaxScaler, Normalizer, 5 | RobustScaler) 6 | from .plot_helpers import cm2 7 | 8 | 9 | def plot_scaling(): 10 | X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1) 11 | X += 3 12 | 13 | plt.figure(figsize=(15, 8)) 14 | main_ax = plt.subplot2grid((2, 4), (0, 0), rowspan=2, colspan=2) 15 | 16 | main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm2, s=60) 17 | maxx = np.abs(X[:, 0]).max() 18 | maxy = np.abs(X[:, 1]).max() 19 | 20 | main_ax.set_xlim(-maxx + 1, maxx + 1) 21 | main_ax.set_ylim(-maxy + 1, maxy + 1) 22 | main_ax.set_title("Original Data") 23 | other_axes = [plt.subplot2grid((2, 4), (i, j)) 24 | for j in range(2, 4) for i in range(2)] 25 | 26 | for ax, scaler in zip(other_axes, [StandardScaler(), RobustScaler(), 27 | MinMaxScaler(), Normalizer(norm='l2')]): 28 | X_ = scaler.fit_transform(X) 29 | ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap=cm2, s=60) 30 | ax.set_xlim(-2, 2) 31 | ax.set_ylim(-2, 2) 32 | ax.set_title(type(scaler).__name__) 33 | 34 | other_axes.append(main_ax) 35 | 36 | for ax in other_axes: 37 |
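# center the spines so every subplot shows a coordinate cross through the origin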
ax.spines['left'].set_position('center') 38 | ax.spines['right'].set_color('none') 39 | ax.spines['bottom'].set_position('center') 40 | ax.spines['top'].set_color('none') 41 | ax.xaxis.set_ticks_position('bottom') 42 | ax.yaxis.set_ticks_position('left') 43 | -------------------------------------------------------------------------------- /mglearn/plot_tree_nonmonotonous.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.datasets import make_blobs 3 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 4 | from .tools import discrete_scatter 5 | from .plot_2d_separator import plot_2d_separator 6 | 7 | 8 | def plot_tree_not_monotone(): 9 | import graphviz 10 | # make a simple 2d dataset 11 | X, y = make_blobs(centers=4, random_state=8) 12 | y = y % 2 13 | plt.figure() 14 | discrete_scatter(X[:, 0], X[:, 1], y) 15 | plt.legend(["Class 0", "Class 1"], loc="best") 16 | 17 | # learn a decision tree model 18 | tree = DecisionTreeClassifier(random_state=0).fit(X, y) 19 | plot_2d_separator(tree, X, linestyle="dashed") 20 | 21 | # visualize the tree 22 | export_graphviz(tree, out_file="mytree.dot", impurity=False, filled=True) 23 | with open("mytree.dot") as f: 24 | dot_graph = f.read() 25 | print("Feature importances: %s" % tree.feature_importances_) 26 | return graphviz.Source(dot_graph) 27 | -------------------------------------------------------------------------------- /mglearn/plots.py: -------------------------------------------------------------------------------- 1 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 2 | from .plot_interactive_tree import plot_tree_progressive, plot_tree_partition 3 | from .plot_animal_tree import plot_animal_tree 4 | from .plot_rbf_svm_parameters import plot_svm 5 | from .plot_knn_regression import plot_knn_regression 6 | from .plot_knn_classification import plot_knn_classification 7 | from .plot_2d_separator import plot_2d_classification, plot_2d_separator 8 | from .plot_nn_graphs import (plot_logistic_regression_graph, 9 | plot_single_hidden_layer_graph, 10 | plot_two_hidden_layer_graph) 11 | from .plot_linear_regression import plot_linear_regression_wave 12 | from .plot_tree_nonmonotonous import plot_tree_not_monotone 13 | from .plot_scaling import plot_scaling 14 | from .plot_pca import plot_pca_illustration, plot_pca_whitening, plot_pca_faces 15 | from .plot_decomposition import plot_decomposition 16 | from .plot_nmf import plot_nmf_illustration, plot_nmf_faces 17 | from .plot_helpers import cm2, cm3 18 | from .plot_agglomerative import plot_agglomerative, plot_agglomerative_algorithm 19 | from .plot_kmeans import plot_kmeans_algorithm, plot_kmeans_boundaries, plot_kmeans_faces 20 | from .plot_improper_preprocessing import plot_improper_processing, plot_proper_processing 21 | from .plot_cross_validation import (plot_threefold_split, plot_group_kfold, 22 | plot_shuffle_split, plot_cross_validation, 23 | plot_stratified_cross_validation) 24 | 25 | from .plot_grid_search import plot_grid_search_overview, plot_cross_val_selection 26 | from .plot_metrics import (plot_confusion_matrix_illustration, 27 | plot_binary_confusion_matrix, 28 | plot_decision_threshold) 29 | from .plot_dbscan import plot_dbscan 30 | from .plot_ridge import plot_ridge_n_samples 31 | from .plot_kneighbors_regularization import plot_kneighbors_regularization 32 | 33 | __all__ = ['plot_linear_svc_regularization', 34 | 'plot_animal_tree', 'plot_tree_progressive',
35 | 'plot_tree_partition', 'plot_svm', 36 | 'plot_knn_regression', 37 | 'plot_logistic_regression_graph', 38 | 'plot_single_hidden_layer_graph', 39 | 'plot_two_hidden_layer_graph', 40 | 'plot_2d_classification', 41 | 'plot_2d_separator', 42 | 'plot_knn_classification', 43 | 'plot_linear_regression_wave', 44 | 'plot_tree_not_monotone', 45 | 'plot_scaling', 46 | 'plot_pca_illustration', 47 | 'plot_pca_faces', 48 | 'plot_pca_whitening', 49 | 'plot_decomposition', 50 | 'plot_nmf_illustration', 51 | 'plot_nmf_faces', 52 | 'plot_agglomerative', 53 | 'plot_agglomerative_algorithm', 54 | 'plot_kmeans_boundaries', 55 | 'plot_kmeans_algorithm', 56 | 'plot_kmeans_faces', 57 | 'cm3', 'cm2', 'plot_improper_processing', 'plot_proper_processing', 58 | 'plot_group_kfold', 59 | 'plot_shuffle_split', 60 | 'plot_stratified_cross_validation', 61 | 'plot_threefold_split', 62 | 'plot_cross_validation', 63 | 'plot_grid_search_overview', 64 | 'plot_cross_val_selection', 65 | 'plot_confusion_matrix_illustration', 66 | 'plot_binary_confusion_matrix', 67 | 'plot_decision_threshold', 68 | 'plot_dbscan', 69 | 'plot_ridge_n_samples', 70 | 'plot_kneighbors_regularization' 71 | ] 72 | -------------------------------------------------------------------------------- /mglearn/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import make_blobs 3 | from sklearn.tree import export_graphviz 4 | import matplotlib.pyplot as plt 5 | from .plot_2d_separator import (plot_2d_separator, plot_2d_classification, 6 | plot_2d_scores) 7 | from .plot_helpers import cm2 as cm, discrete_scatter 8 | 9 | 10 | def visualize_coefficients(coefficients, feature_names, n_top_features=25): 11 | """Visualize coefficients of a linear model. 12 | 13 | Parameters 14 | ---------- 15 | coefficients : nd-array, shape (n_features,) 16 | Model coefficients. 17 | 18 | feature_names : list or nd-array of strings, shape (n_features,) 19 | Feature names for labeling the coefficients. 20 | 21 | n_top_features : int, default=25 22 | How many features to show. The function will show the largest (most 23 | positive) and smallest (most negative) n_top_features coefficients, 24 | for a total of 2 * n_top_features coefficients.
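Notes ----- coefficients may also be passed as a column vector (for example the coef_ attribute of a scikit-learn linear model); it is flattened before plotting.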
25 | """ 26 | coefficients = coefficients.squeeze() 27 | if coefficients.ndim > 1: 28 | # this is not a row or column vector 29 | raise ValueError("coeffients must be 1d array or column vector, got" 30 | " shape {}".format(coefficients.shape)) 31 | coefficients = coefficients.ravel() 32 | 33 | if len(coefficients) != len(feature_names): 34 | raise ValueError("Number of coefficients {} doesn't match number of" 35 | "feature names {}.".format(len(coefficients), 36 | len(feature_names))) 37 | # get coefficients with large absolute values 38 | coef = coefficients.ravel() 39 | positive_coefficients = np.argsort(coef)[-n_top_features:] 40 | negative_coefficients = np.argsort(coef)[:n_top_features] 41 | interesting_coefficients = np.hstack([negative_coefficients, 42 | positive_coefficients]) 43 | # plot them 44 | plt.figure(figsize=(15, 5)) 45 | colors = [cm(1) if c < 0 else cm(0) 46 | for c in coef[interesting_coefficients]] 47 | plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], 48 | color=colors) 49 | feature_names = np.array(feature_names) 50 | plt.subplots_adjust(bottom=0.3) 51 | plt.xticks(np.arange(1, 1 + 2 * n_top_features), 52 | feature_names[interesting_coefficients], rotation=60, 53 | ha="right") 54 | plt.ylabel("Coefficient magnitude") 55 | plt.xlabel("Feature") 56 | 57 | 58 | def heatmap(values, xlabel, ylabel, xticklabels, yticklabels, cmap=None, 59 | vmin=None, vmax=None, ax=None, fmt="%0.2f"): 60 | if ax is None: 61 | ax = plt.gca() 62 | # plot the mean cross-validation scores 63 | img = ax.pcolor(values, cmap=cmap, vmin=vmin, vmax=vmax) 64 | img.update_scalarmappable() 65 | ax.set_xlabel(xlabel) 66 | ax.set_ylabel(ylabel) 67 | ax.set_xticks(np.arange(len(xticklabels)) + .5) 68 | ax.set_yticks(np.arange(len(yticklabels)) + .5) 69 | ax.set_xticklabels(xticklabels) 70 | ax.set_yticklabels(yticklabels) 71 | ax.set_aspect(1) 72 | 73 | for p, color, value in zip(img.get_paths(), img.get_facecolors(), 74 | img.get_array()): 75 | x, y = p.vertices[:-2, :].mean(0) 76 | if np.mean(color[:3]) > 0.5: 77 | c = 'k' 78 | else: 79 | c = 'w' 80 | ax.text(x, y, fmt % value, color=c, ha="center", va="center") 81 | return img 82 | 83 | 84 | def make_handcrafted_dataset(): 85 | # a carefully hand-designed dataset lol 86 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 87 | y[np.array([7, 27])] = 0 88 | mask = np.ones(len(X), dtype=bool) 89 | mask[np.array([0, 1, 5, 26])] = 0 90 | X, y = X[mask], y[mask] 91 | return X, y 92 | 93 | 94 | def print_topics(topics, feature_names, sorting, topics_per_chunk=6, 95 | n_words=20): 96 | for i in range(0, len(topics), topics_per_chunk): 97 | # for each chunk: 98 | these_topics = topics[i: i + topics_per_chunk] 99 | # maybe we have less than topics_per_chunk left 100 | len_this_chunk = len(these_topics) 101 | # print topic headers 102 | print(("topic {:<8}" * len_this_chunk).format(*these_topics)) 103 | print(("-------- {0:<5}" * len_this_chunk).format("")) 104 | # print top n_words frequent words 105 | for i in range(n_words): 106 | try: 107 | print(("{:<14}" * len_this_chunk).format( 108 | *feature_names[sorting[these_topics, i]])) 109 | except: 110 | pass 111 | print("\n") 112 | 113 | 114 | def get_tree(tree, **kwargs): 115 | try: 116 | # python3 117 | from io import StringIO 118 | except ImportError: 119 | # python2 120 | from StringIO import StringIO 121 | f = StringIO() 122 | export_graphviz(tree, f, **kwargs) 123 | import graphviz 124 | return graphviz.Source(f.getvalue()) 125 | 126 | __all__ = ['plot_2d_separator', 
'plot_2d_classification', 'plot_2d_scores', 127 | 'cm', 'visualize_coefficients', 'print_topics', 'heatmap', 128 | 'discrete_scatter'] 129 | -------------------------------------------------------------------------------- /preamble.py: -------------------------------------------------------------------------------- 1 | from IPython.display import set_matplotlib_formats, display 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import mglearn 6 | from cycler import cycler 7 | 8 | set_matplotlib_formats('pdf', 'png') 9 | plt.rcParams['savefig.dpi'] = 300 10 | plt.rcParams['image.cmap'] = "viridis" 11 | plt.rcParams['image.interpolation'] = "none" 12 | plt.rcParams['savefig.bbox'] = "tight" 13 | plt.rcParams['lines.linewidth'] = 2 14 | plt.rcParams['legend.numpoints'] = 1 15 | plt.rc('axes', prop_cycle=( 16 | cycler('color', mglearn.plot_helpers.cm_cycle.colors) + 17 | cycler('linestyle', ['-', '-', "--", (0, (3, 3)), (0, (1.5, 1.5))]))) 18 | 19 | np.set_printoptions(precision=3, suppress=True) 20 | 21 | pd.set_option("display.max_columns", 8) 22 | pd.set_option('display.precision', 2) 23 | 24 | __all__ = ['np', 'mglearn', 'display', 'plt', 'pd'] 25 | --------------------------------------------------------------------------------
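For quick experimentation outside the notebooks, here is a minimal usage sketch for two of the helpers above (assuming ``mglearn`` is importable; the data is synthetic and purely illustrative):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from mglearn.tools import discrete_scatter, heatmap

# one marker and color per class; returns the Line2D artists for the legend
X, y = make_blobs(centers=3, random_state=42)
discrete_scatter(X[:, 0], X[:, 1], y)
plt.legend(["class 0", "class 1", "class 2"], loc="best")

# annotated matrix plot, e.g. for mean cross-validation scores over a grid
plt.figure()
scores = np.random.RandomState(0).uniform(size=(3, 4))
heatmap(scores, xlabel="gamma", ylabel="C",
        xticklabels=["0.01", "0.1", "1", "10"],
        yticklabels=["0.1", "1", "10"], cmap="viridis")
plt.show()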