├── .gitignore ├── README.md ├── download_data.py ├── notebooks ├── .gitignore ├── 00_Preliminaries.ipynb ├── 01_introduction.ipynb ├── 02_sklearn_data.ipynb ├── 03_machine_learning_101.ipynb ├── 04_houses_regression.ipynb ├── 05_iris_classification.ipynb ├── 06_iris_dimensionality.ipynb ├── 07_iris_clustering.ipynb ├── 08_linearly_separable.ipynb ├── 09_validation_and_testing.ipynb ├── 10_digits_classification.ipynb ├── 11_photoz_regression.ipynb ├── datasets │ ├── __init__.py │ └── galaxy_mags.py ├── figures │ ├── ML_flow_chart.py │ ├── __init__.py │ ├── bias_variance.py │ ├── linear_regression.py │ ├── sdss_filters.py │ ├── sgd_separator.py │ └── svm_gui_frames.py ├── files │ ├── iris_setosa.jpg │ ├── iris_versicolor.jpg │ └── iris_virginica.jpg ├── generate_v2.py └── soln │ ├── boston_decision_tree.py │ ├── iris_kmeans.py │ ├── iris_rpca.py │ └── show_faces.py └── scripts └── svm_gui.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | *.npy 4 | *.npz 5 | notebooks/figures/downloads/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyCon 2013 Scikit-learn Tutorial 2 | ================================ 3 | 4 | **Note: for updated tutorial content, please see** http://github.com/jakevdp/sklearn_tutorial/ 5 | 6 | *Instructor: Jake VanderPlas* 7 | 8 | - email: 9 | - twitter: [@jakevdp](https://twitter.com/jakevdp) 10 | - github: [jakevdp](http://github.com/jakevdp) 11 | 12 | This repository will contain files and other info associated with my PyCon 13 | 2013 scikit-learn tutorial. 14 | 15 | Installation Notes 16 | ------------------ 17 | This tutorial will require recent installations of *numpy*, *scipy*, 18 | *matplotlib*, *scikit-learn*, and *ipython* with ipython notebook. 19 | The last one is important: you should be able to type 20 | 21 | ipython notebook 22 | 23 | in your terminal window and see the notebook panel load in your web browser. 24 | Because Python 3 compatibility is still being ironed-out for these packages 25 | (we're getting close, I promise!) participants should plan to use Python 2.6 26 | or 2.7 for this tutorial. 27 | 28 | For users who do not yet have these packages installed, a relatively 29 | painless way to install all the requirements is to use a package such as 30 | [Anaconda CE](http://store.continuum.io/ "Anaconda CE"), which can be 31 | downloaded and installed for free. 32 | 33 | Downloading the Tutorial Materials 34 | ---------------------------------- 35 | I would highly recommend using git, not only for this tutorial, but for the 36 | general betterment of your life. Once git is installed, you can clone the 37 | material in this tutorial by using the git address shown above: 38 | 39 | git clone git://github.com/jakevdp/sklearn_pycon2013.git 40 | 41 | If you can't or don't want to install git, there is a link above to download 42 | the contents of this repository as a zip file. I may make minor changes to 43 | the repository in the days before the tutorial, however, so cloning the 44 | repository is a much better option. 45 | 46 | Data Downloads 47 | -------------- 48 | The data for this tutorial is not included in the repository. We will be 49 | using several data sets during the tutorial: most are built-in to 50 | scikit-learn, and one is culled from the 51 | [Sloan Digital Sky Survey](http://skyserver.sdss.org/public/en/). 
52 | The tutorial includes code which automatically downloads and caches these 53 | data. Because the wireless network 54 | at conferences can often be spotty, it would be a good idea to download these 55 | data sets before arriving at the conference. 56 | 57 | To cache the required data on your computer, please download the tutorial 58 | materials as described above, and execute the script called 59 | ``download_data.py`` from the top directory. It will cache several datasets 60 | to the appropriate places, and you'll be ready to go when it comes time for 61 | the tutorial! 62 | 63 | 64 | Notebook Listing 65 | ---------------- 66 | These notebooks in this repository can be statically viewed using the 67 | excellent [nbviewer](http://nbviewer.ipython.org) site. They will not 68 | be able to be modified within nbviewer. To modify them, first download 69 | the tutorial repository, change to the notebooks directory, and type 70 | ``ipython notebook``. You should see the list in the ipython notebook 71 | launch page in your web browser. 72 | 73 | - [01_introduction.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/01_introduction.ipynb) 74 | - [02_sklearn_data.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/02_sklearn_data.ipynb) 75 | - [03_machine_learning_101.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/03_machine_learning_101.ipynb) 76 | - [04_houses_regression.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/04_houses_regression.ipynb) 77 | - [05_iris_classification.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/05_iris_classification.ipynb) 78 | - [06_iris_dimensionality.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/06_iris_dimensionality.ipynb) 79 | - [07_iris_clustering.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/07_iris_clustering.ipynb) 80 | - [08_linearly_separable.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/08_linearly_separable.ipynb) 81 | - [09_validation_and_testing.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/09_validation_and_testing.ipynb) 82 | - [10_digits_classification.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/10_digits_classification.ipynb) 83 | - [11_photoz_regression.ipynb](http://nbviewer.ipython.org/urls/raw.github.com/jakevdp/sklearn_pycon2013/master/notebooks/11_photoz_regression.ipynb) 84 | 85 | Note that some of the code in these notebooks will not work outside the 86 | directory structure of this tutorial. 87 | -------------------------------------------------------------------------------- /download_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run this script to make sure data is cached in the appropriate 3 | place on your computer. 4 | 5 | The data are only a few megabytes, but conference wireless is 6 | often not very reliable... 
7 | """ 8 | import os 9 | import sys 10 | from sklearn import datasets 11 | 12 | #------------------------------------------------------------ 13 | # Faces data: this will be stored in the scikit_learn_data 14 | # sub-directory of your home folder 15 | faces = datasets.fetch_olivetti_faces() 16 | print "Successfully fetched olivetti faces data" 17 | 18 | #------------------------------------------------------------ 19 | # SDSS galaxy data: this will be stored in notebooks/datasets/data 20 | sys.path.append(os.path.abspath('notebooks')) 21 | from datasets import fetch_sdss_galaxy_mags 22 | colors = fetch_sdss_galaxy_mags() 23 | print "Successfully fetched SDSS galaxy data" 24 | 25 | 26 | #------------------------------------------------------------ 27 | # SDSS filters & vega spectrum: stored in notebooks/figures/downloads 28 | from figures.sdss_filters import fetch_filter, fetch_vega_spectrum 29 | spectrum = fetch_vega_spectrum() 30 | print "Successfully fetched vega spectrum" 31 | 32 | filters = [fetch_filter(f) for f in 'ugriz'] 33 | print "Successfully fetched SDSS filters" 34 | -------------------------------------------------------------------------------- /notebooks/.gitignore: -------------------------------------------------------------------------------- 1 | *.v2.ipynb -------------------------------------------------------------------------------- /notebooks/00_Preliminaries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "00_Preliminaries" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "An Introduction to scikit-learn: Machine Learning in Python" 16 | ] 17 | }, 18 | { 19 | "cell_type": "heading", 20 | "level": 2, 21 | "metadata": {}, 22 | "source": [ 23 | "Goals of this Tutorial" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "- **Introduce the basics of Machine Learning**, and some skills useful in practice.\n", 31 | "- **Introduce the syntax of scikit-learn**, so that you can make use of the rich toolset available." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "heading", 36 | "level": 2, 37 | "metadata": {}, 38 | "source": [ 39 | "Schedule:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "- **1:20 - 2:50**: Part 1\n", 47 | " + Getting started\n", 48 | " + Loading, representing, and manipulating data\n", 49 | " + Basics of Machine Learning and the scikit-learn syntax\n", 50 | " + Supervised learning\n", 51 | " * _regression_\n", 52 | " * _classification_\n", 53 | " + Unsupervised learning\n", 54 | " * _clustering_\n", 55 | " * _dimensionality reduction_\n", 56 | "\n", 57 | "- **2:50 - 3:10**: short break\n", 58 | "\n", 59 | "- **3:10 - 4:20**: part 2\n", 60 | " + Validation and testing of models\n", 61 | " + break for a short survey\n", 62 | " + examples from astronomy, image classification and others" 63 | ] 64 | }, 65 | { 66 | "cell_type": "heading", 67 | "level": 2, 68 | "metadata": {}, 69 | "source": [ 70 | "Preliminaries" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This tutorial requires the following packages:\n", 78 | "\n", 79 | "- Python version 2.6-2.7\n", 80 | "- `numpy` version 1.5 or later: http://www.numpy.org/\n", 81 | "- `scipy` version 0.9 or later: http://www.scipy.org/\n", 82 | "- `matplotlib` version 1.0 or later: http://matplotlib.org/\n", 83 | "- `scikit-learn` version 0.12 or later: http://scikit-learn.org\n", 84 | "- `ipython` version 0.13 or later, with notebook support: http://ipython.org\n", 85 | "\n", 86 | "The easiest way to get these is to use an all-in-one installer such as\n", 87 | "[Anaconda CE](https://store.continuum.io/) from Continuum or\n", 88 | "[EPD Free](http://www.enthought.com/products/epd_free.php) from Enthought.\n", 89 | "These are available for multiple architectures.\n", 90 | "\n", 91 | "Other options do exist:\n", 92 | "\n", 93 | "- **Linux**: If you're on Linux, you can use the linux distribution tools (by typing, for\n", 94 | "example `apt-get install numpy` or `yum install numpy`.\n", 95 | "\n", 96 | "- **Mac**: If you're on OSX, there are similar tools such as MacPorts or HomeBrew which\n", 97 | "contain pre-compiled versions of these packages.\n", 98 | "\n", 99 | "- **Windows**: Windows can be challenging: the best bet is probably to use one of the package\n", 100 | "installers mentioned above.\n", 101 | "\n", 102 | "You can run the following code to check the versions of the packages on your system:\n", 103 | "\n", 104 | "(in IPython notebook, press `shift` and `return` together to execute the contents of a cell)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "import numpy\n", 112 | "print 'numpy:', numpy.__version__\n", 113 | "\n", 114 | "import scipy\n", 115 | "print 'scipy:', scipy.__version__\n", 116 | "\n", 117 | "import matplotlib\n", 118 | "print 'matplotlib:', matplotlib.__version__\n", 119 | "\n", 120 | "import sklearn\n", 121 | "print 'scikit-learn:', sklearn.__version__" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "heading", 129 | "level": 2, 130 | "metadata": {}, 131 | "source": [ 132 | "Useful Resources" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "- **scikit-learn:** http://scikit-learn.org (see especially the narrative documentation)\n", 140 | "- **matplotlib:** http://matplotlib.org (see especially the gallery section)\n", 141 | "- **IPython:** 
http://ipython.org (also check out http://nbviewer.ipython.org)\n", 142 | "- **astroML:** http://astroML.github.com (shameless plug: this is my project!)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "heading", 147 | "level": 2, 148 | "metadata": {}, 149 | "source": [ 150 | "Survey/Evaluation" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "The PyCon organizers have put together a survey for tutorial attendees.\n", 158 | "\n", 159 | "Near the end of this tutorial, please follow the link below and fill out this survey:\n", 160 | "\n", 161 | "https://www.surveymonkey.com/s/pycon2013_tutorials" 162 | ] 163 | } 164 | ], 165 | "metadata": {} 166 | } 167 | ] 168 | } -------------------------------------------------------------------------------- /notebooks/01_introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "01_introduction" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Introduction to Machine Learning in Python" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "In this section we'll go through some preliminary topics, as well as some of the\n", 23 | "requirements for this tutorial.\n", 24 | "\n", 25 | "By the end of this section you should:\n", 26 | "\n", 27 | "- Know what sort of tasks qualify as Machine Learning problems.\n", 28 | "- See some simple examples of machine learning\n", 29 | "- Know the basics of creating and manipulating numpy arrays.\n", 30 | "- Know the basics of scatter plots in matplotlib." 31 | ] 32 | }, 33 | { 34 | "cell_type": "heading", 35 | "level": 2, 36 | "metadata": {}, 37 | "source": [ 38 | "What is Machine Learning?" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "In this section we will begin to explore the basic principles of machine learning.\n", 46 | "Machine Learning is about building programs with **tunable parameters** (typically an\n", 47 | "array of floating point values) that are adjusted automatically so as to improve\n", 48 | "their behavior by **adapting to previously seen data.**\n", 49 | "\n", 50 | "Machine Learning can be considered a subfield of **Artificial Intelligence** since those\n", 51 | "algorithms can be seen as building blocks to make computers learn to behave more\n", 52 | "intelligently by somehow **generalizing** rather that just storing and retrieving data items\n", 53 | "like a database system would do.\n", 54 | "\n", 55 | "We'll take a look at two very simple machine learning tasks here.\n", 56 | "The first is a **classification** task: the figure shows a\n", 57 | "collection of two-dimensional data, colored according to two different class\n", 58 | "labels. 
A classification algorithm may be used to draw a dividing boundary\n", 59 | "between the two clusters of points:" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "# Start pylab inline mode, so figures will appear in the notebook\n", 67 | "%pylab inline" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [ 77 | "# Import the example plot from the figures directory\n", 78 | "from figures import plot_sgd_separator\n", 79 | "plot_sgd_separator()" 80 | ], 81 | "language": "python", 82 | "metadata": {}, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "This may seem like a trivial task, but it is a simple version of a very important concept.\n", 90 | "By drawing this separating line, we have learned a model which can **generalize** to new\n", 91 | "data: if you were to drop another point onto the plane which is unlabeled, this algorithm\n", 92 | "could now **predict** whether it's a blue or a red point.\n", 93 | "\n", 94 | "If you'd like to see the source code used to generate this, you can either open the\n", 95 | "code in the `figures` directory, or you can load the code using the `%load` magic command:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [ 102 | "%load figures/sgd_separator.py" 103 | ], 104 | "language": "python", 105 | "metadata": {}, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "The next simple task we'll look at is a **regression** task: a simple best-fit line\n", 113 | "to a set of data:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "collapsed": false, 119 | "input": [ 120 | "from figures import plot_linear_regression\n", 121 | "plot_linear_regression()" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "Again, this is an example of fitting a model to data, such that the model can make\n", 132 | "generalizations about new data. The model has been **learned** from the training\n", 133 | "data, and can be used to predict the result of test data:\n", 134 | "here, we might be given an x-value, and the model would\n", 135 | "allow us to predict the y value. Again, this might seem like a trivial problem,\n", 136 | "but it is a basic example of a type of operation that is fundamental to\n", 137 | "machine learning tasks." 138 | ] 139 | }, 140 | { 141 | "cell_type": "heading", 142 | "level": 2, 143 | "metadata": {}, 144 | "source": [ 145 | "Numpy" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Manipulating `numpy` arrays is an important part of doing machine learning\n", 153 | "(or, really, any type of scientific computation) in python. This will likely\n", 154 | "be review for most: we'll quickly go through some of the most important features." 
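One more indexing idiom is worth a quick sketch before the cells below, since it reappears in the sparse-matrix example later: boolean masking. This is a minimal illustration (the array is random, so the printed values will differ from run to run):

    import numpy as np

    X = np.random.random((3, 5))

    # a boolean mask selects the elements that satisfy a condition
    mask = X > 0.5
    print mask
    print X[mask]

    # masks can also be used to modify an array in place
    X[X < 0.5] = 0
    print X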
155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "collapsed": false, 160 | "input": [ 161 | "import numpy as np\n", 162 | "\n", 163 | "# Generating a random array\n", 164 | "X = np.random.random((3, 5)) # a 3 x 5 array\n", 165 | "\n", 166 | "print X" 167 | ], 168 | "language": "python", 169 | "metadata": {}, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "# Accessing elements\n", 177 | "\n", 178 | "# get a single element\n", 179 | "print X[0, 0]\n", 180 | "\n", 181 | "# get a row\n", 182 | "print X[1]\n", 183 | "\n", 184 | "# get a column\n", 185 | "print X[:, 1]" 186 | ], 187 | "language": "python", 188 | "metadata": {}, 189 | "outputs": [] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "collapsed": false, 194 | "input": [ 195 | "# Transposing an array\n", 196 | "print X.T" 197 | ], 198 | "language": "python", 199 | "metadata": {}, 200 | "outputs": [] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "collapsed": false, 205 | "input": [ 206 | "# Turning a row vector into a column vector\n", 207 | "y = np.linspace(0, 12, 5)\n", 208 | "print y\n", 209 | "\n", 210 | "# make into a column vector\n", 211 | "print y[:, np.newaxis]" 212 | ], 213 | "language": "python", 214 | "metadata": {}, 215 | "outputs": [] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "There is much, much more to know, but these few operations are fundamental to what we'll\n", 222 | "do during this tutorial." 223 | ] 224 | }, 225 | { 226 | "cell_type": "heading", 227 | "level": 2, 228 | "metadata": {}, 229 | "source": [ 230 | "Scipy Sparse Matrices" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "We won't make very much use of these in this tutorial, but sparse matrices are very nice\n", 238 | "in some situations. For example, in some machine learning tasks, especially those associated\n", 239 | "with textual analysis, the data may be mostly zeros. Storing all these zeros is very\n", 240 | "inefficient. We can create and manipulate sparse matrices as follows:" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "collapsed": false, 246 | "input": [ 247 | "from scipy import sparse\n", 248 | "\n", 249 | "# Create a random array with a lot of zeros\n", 250 | "X = np.random.random((10, 5))\n", 251 | "print X\n", 252 | "X[X < 0.7] = 0\n", 253 | "print X" 254 | ], 255 | "language": "python", 256 | "metadata": {}, 257 | "outputs": [] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "collapsed": false, 262 | "input": [ 263 | "# turn X into a csr (Compressed-Sparse-Row) matrix\n", 264 | "X_csr = sparse.csr_matrix(X)\n", 265 | "print X_csr" 266 | ], 267 | "language": "python", 268 | "metadata": {}, 269 | "outputs": [] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "collapsed": false, 274 | "input": [ 275 | "# convert the sparse matrix to a dense array\n", 276 | "print X_csr.toarray()" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "The CSR representation can be very efficient for computations, but it is not\n", 287 | "as good for adding elements. 
For that, the LIL (List-In-List) representation\n", 288 | "is better:" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "collapsed": false, 294 | "input": [ 295 | "# Create an empty LIL matrix and add some items\n", 296 | "X_lil = sparse.lil_matrix((5, 5))\n", 297 | "\n", 298 | "for i, j in np.random.randint(0, 5, (15, 2)):\n", 299 | " X_lil[i, j] = i + j\n", 300 | "\n", 301 | "print X_lil\n", 302 | "print X_lil.toarray()" 303 | ], 304 | "language": "python", 305 | "metadata": {}, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Often, once an LIL matrix is created, it is useful to convert it to a CSR format\n", 313 | "(many scikit-learn algorithms require CSR or CSC format)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "collapsed": false, 319 | "input": [ 320 | "X_csr = X_lil.tocsr()\n", 321 | "print X_csr" 322 | ], 323 | "language": "python", 324 | "metadata": {}, 325 | "outputs": [] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "There are several other sparse formats that can be useful for various problems:\n", 332 | "\n", 333 | "- `CSC` (compressed sparse column)\n", 334 | "- `BSR` (block sparse row)\n", 335 | "- `COO` (coordinate)\n", 336 | "- `DIA` (diagonal)\n", 337 | "- `DOK` (dictionary of keys)\n", 338 | "\n", 339 | "The ``scipy.sparse`` submodule also has a lot of functions for sparse matrices\n", 340 | "including linear algebra, sparse solvers, graph algorithms, and much more." 341 | ] 342 | }, 343 | { 344 | "cell_type": "heading", 345 | "level": 2, 346 | "metadata": {}, 347 | "source": [ 348 | "Matplotlib" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "Another important part of machine learning is visualization of data. The most common\n", 356 | "tool for this in Python is `matplotlib`. It is an extremely flexible package, but\n", 357 | "we will go over some basics here.\n", 358 | "\n", 359 | "First, something special to IPython notebook. We can turn on the \"IPython inline\" mode,\n", 360 | "which will make plots show up inline in the notebook." 
361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "collapsed": false, 366 | "input": [ 367 | "%pylab inline" 368 | ], 369 | "language": "python", 370 | "metadata": {}, 371 | "outputs": [] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "collapsed": false, 376 | "input": [ 377 | "import matplotlib.pyplot as plt" 378 | ], 379 | "language": "python", 380 | "metadata": {}, 381 | "outputs": [] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "collapsed": false, 386 | "input": [ 387 | "# plotting a line\n", 388 | "\n", 389 | "x = np.linspace(0, 10, 100)\n", 390 | "plt.plot(x, np.sin(x))" 391 | ], 392 | "language": "python", 393 | "metadata": {}, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "collapsed": false, 399 | "input": [ 400 | "# scatter-plot points\n", 401 | "\n", 402 | "x = np.random.normal(size=500)\n", 403 | "y = np.random.normal(size=500)\n", 404 | "plt.scatter(x, y)" 405 | ], 406 | "language": "python", 407 | "metadata": {}, 408 | "outputs": [] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "collapsed": false, 413 | "input": [ 414 | "# showing images\n", 415 | "x = np.linspace(1, 12, 100)\n", 416 | "y = x[:, np.newaxis]\n", 417 | "\n", 418 | "im = y * np.sin(x) * np.cos(y)\n", 419 | "print im.shape" 420 | ], 421 | "language": "python", 422 | "metadata": {}, 423 | "outputs": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "collapsed": false, 428 | "input": [ 429 | "# imshow - note that origin is at the top-left!\n", 430 | "plt.imshow(im)" 431 | ], 432 | "language": "python", 433 | "metadata": {}, 434 | "outputs": [] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "collapsed": false, 439 | "input": [ 440 | "# Contour plot - note that origin here is at the bottom-left!\n", 441 | "plt.contour(im)" 442 | ], 443 | "language": "python", 444 | "metadata": {}, 445 | "outputs": [] 446 | } 447 | ], 448 | "metadata": {} 449 | } 450 | ] 451 | } -------------------------------------------------------------------------------- /notebooks/02_sklearn_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "02_sklearn_data" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Representation and Visualization of Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Machine learning is about creating models from data: for that reason, we'll start by\n", 23 | "discussing how data can be represented in order to be understood by the computer. Along\n", 24 | "with this, we'll build on our matplotlib examples from the previous section and show some\n", 25 | "examples of how to visualize data.\n", 26 | "\n", 27 | "By the end of this section you should:\n", 28 | "\n", 29 | "- Know the internal data representation of scikit-learn.\n", 30 | "- Know how to use scikit-learn's dataset loaders to load example data.\n", 31 | "- Know how to turn image & text data into data matrices for learning.\n", 32 | "- Know how to use matplotlib to help visualize different types of data." 
33 | ] 34 | }, 35 | { 36 | "cell_type": "heading", 37 | "level": 2, 38 | "metadata": {}, 39 | "source": [ 40 | "Data in scikit-learn" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Data in scikit-learn, with very few exceptions, is assumed to be stored as a\n", 48 | "**two-dimensional array**, of size `[n_samples, n_features]`." 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Most machine learning algorithms implemented in scikit-learn expect data to be stored in a\n", 56 | "**two-dimensional array or matrix**. The arrays can be\n", 57 | "either ``numpy`` arrays, or in some cases ``scipy.sparse`` matrices.\n", 58 | "The size of the array is expected to be `[n_samples, n_features]`\n", 59 | "\n", 60 | "- **n_samples:** The number of samples: each sample is an item to process (e.g. classify).\n", 61 | " A sample can be a document, a picture, a sound, a video, an astronomical object,\n", 62 | " a row in database or CSV file,\n", 63 | " or whatever you can describe with a fixed set of quantitative traits.\n", 64 | "- **n_features:** The number of features or distinct traits that can be used to describe each\n", 65 | " item in a quantitative manner. Features are generally real-valued, but may be boolean or\n", 66 | " discrete-valued in some cases.\n", 67 | "\n", 68 | "The number of features must be fixed in advance. However it can be very high dimensional\n", 69 | "(e.g. millions of features) with most of them being zeros for a given sample. This is a case\n", 70 | "where `scipy.sparse` matrices can be useful, in that they are\n", 71 | "much more memory-efficient than numpy arrays." 72 | ] 73 | }, 74 | { 75 | "cell_type": "heading", 76 | "level": 3, 77 | "metadata": {}, 78 | "source": [ 79 | "A Simple Example: the Iris Dataset" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "As an example of a simple dataset, we're going to take a look at the iris data stored by scikit-learn.\n", 87 | "The data consists of measurements of three different species of irises. There are three species of iris\n", 88 | "in the dataset, which we can picture here:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "collapsed": false, 94 | "input": [ 95 | "from IPython.core.display import Image, display\n", 96 | "display(Image(filename='files/iris_setosa.jpg'))\n", 97 | "print \"Iris Setosa\\n\"\n", 98 | "\n", 99 | "display(Image(filename='files/iris_versicolor.jpg'))\n", 100 | "print \"Iris Versicolor\\n\"\n", 101 | "\n", 102 | "display(Image(filename='files/iris_virginica.jpg'))\n", 103 | "print \"Iris Virginica\"" 104 | ], 105 | "language": "python", 106 | "metadata": {}, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "heading", 111 | "level": 3, 112 | "metadata": {}, 113 | "source": [ 114 | "Quick Question:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "**If we want to design an algorithm to recognize iris species, what might the data be?**\n", 122 | "\n", 123 | "Remember: we need a 2D array of size `[n_samples x n_features]`.\n", 124 | "\n", 125 | "- What would the `n_samples` refer to?\n", 126 | "\n", 127 | "- What might the `n_features` refer to?\n", 128 | "\n", 129 | "Remember that there must be a **fixed** number of features for each sample, and feature\n", 130 | "number ``i`` must be a similar kind of quantity for each sample." 
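As a rough sketch of the answer, the layout scikit-learn expects would look something like this, with one row per flower and one column per measurement (the numbers below are invented purely for illustration):

    import numpy as np

    # three hypothetical flowers (samples), four measurements each (features)
    X = np.array([[5.1, 3.5, 1.4, 0.2],
                  [6.7, 3.0, 5.2, 2.3],
                  [5.9, 3.0, 4.2, 1.5]])

    # one class label per sample; 0, 1, 2 encode the three species
    y = np.array([0, 2, 1])

    print X.shape   # (n_samples, n_features) = (3, 4)
    print y.shape   # (n_samples,) = (3,)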
131 | ] 132 | }, 133 | { 134 | "cell_type": "heading", 135 | "level": 3, 136 | "metadata": {}, 137 | "source": [ 138 | "Loading the Iris Data with Scikit-learn" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Scikit-learn has a very straightforward set of data on these iris species. The data consist of\n", 146 | "the following:\n", 147 | "\n", 148 | "- Features in the Iris dataset:\n", 149 | "\n", 150 | " 1. sepal length in cm\n", 151 | " 2. sepal width in cm\n", 152 | " 3. petal length in cm\n", 153 | " 4. petal width in cm\n", 154 | "\n", 155 | "- Target classes to predict:\n", 156 | "\n", 157 | " 1. Iris Setosa\n", 158 | " 2. Iris Versicolour\n", 159 | " 3. Iris Virginica" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "``scikit-learn`` embeds a copy of the iris CSV file along with a helper function to load it into numpy arrays:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "collapsed": false, 172 | "input": [ 173 | "from sklearn.datasets import load_iris\n", 174 | "iris = load_iris()" 175 | ], 176 | "language": "python", 177 | "metadata": {}, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "The resulting dataset is a ``Bunch`` object: you can see what's available using\n", 185 | "the method ``keys()``:" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "collapsed": false, 191 | "input": [ 192 | "iris.keys()" 193 | ], 194 | "language": "python", 195 | "metadata": {}, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "The features of each sample flower are stored in the ``data`` attribute of the dataset:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "collapsed": false, 208 | "input": [ 209 | "n_samples, n_features = iris.data.shape\n", 210 | "print n_samples\n", 211 | "print n_features\n", 212 | "print iris.data[0]" 213 | ], 214 | "language": "python", 215 | "metadata": {}, 216 | "outputs": [] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "The information about the class of each sample is stored in the ``target`` attribute of the dataset:" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "collapsed": false, 228 | "input": [ 229 | "print iris.data.shape\n", 230 | "print iris.target.shape" 231 | ], 232 | "language": "python", 233 | "metadata": {}, 234 | "outputs": [] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "collapsed": false, 239 | "input": [ 240 | "print iris.target" 241 | ], 242 | "language": "python", 243 | "metadata": {}, 244 | "outputs": [] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "The names of the classes are stored in the last attribute, namely ``target_names``:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "collapsed": false, 256 | "input": [ 257 | "print iris.target_names" 258 | ], 259 | "language": "python", 260 | "metadata": {}, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "This data is four dimensional, but we can visualize two of the dimensions\n", 268 | "at a time using a simple scatter-plot. 
Again, we'll start by enabling\n", 269 | "pylab inline mode:" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "collapsed": false, 275 | "input": [ 276 | "# note: this also imports numpy as np, imports matplotlib.pyplot as plt, and others\n", 277 | "%pylab inline" 278 | ], 279 | "language": "python", 280 | "metadata": {}, 281 | "outputs": [] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "collapsed": false, 286 | "input": [ 287 | "x_index = 0\n", 288 | "y_index = 1\n", 289 | "\n", 290 | "# this formatter will label the colorbar with the correct target names\n", 291 | "formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])\n", 292 | "\n", 293 | "plt.scatter(iris.data[:, x_index], iris.data[:, y_index], c=iris.target)\n", 294 | "plt.colorbar(ticks=[0, 1, 2], format=formatter)\n", 295 | "plt.xlabel(iris.feature_names[x_index])\n", 296 | "plt.ylabel(iris.feature_names[y_index])" 297 | ], 298 | "language": "python", 299 | "metadata": {}, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "heading", 304 | "level": 3, 305 | "metadata": {}, 306 | "source": [ 307 | "Quick Exercise:" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "**Change** `x_index` **and** `y_index` **in the above script\n", 315 | "and find a combination of two parameters\n", 316 | "which maximally separate the three classes.**\n", 317 | "\n", 318 | "This exercise is a preview of **dimensionality reduction**, which we'll see later." 319 | ] 320 | }, 321 | { 322 | "cell_type": "heading", 323 | "level": 2, 324 | "metadata": {}, 325 | "source": [ 326 | "Other Available Data" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "You can see which datasets are available by using ipython's tab-completion feature. Simply type\n", 334 | "\n", 335 | " ``datasets.fetch_``\n", 336 | "\n", 337 | "or\n", 338 | "\n", 339 | " ``datasets.load_``\n", 340 | "\n", 341 | "and then press the tab key. This will give you a drop-down menu which lists all the datasets that can be fetched." 
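If tab-completion is unavailable for some reason, a small sketch like the following gives a similar listing programmatically:

    from sklearn import datasets

    # small datasets shipped with scikit-learn
    print [name for name in dir(datasets) if name.startswith('load_')]

    # larger datasets downloaded on first use
    print [name for name in dir(datasets) if name.startswith('fetch_')]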
342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "collapsed": false, 347 | "input": [ 348 | "from sklearn import datasets" 349 | ], 350 | "language": "python", 351 | "metadata": {}, 352 | "outputs": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "collapsed": false, 357 | "input": [], 358 | "language": "python", 359 | "metadata": {}, 360 | "outputs": [] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "The data downloaded using the ``fetch_`` scripts are stored locally,\n", 367 | "within a subdirectory of your home directory.\n", 368 | "You can use the following to determine where it is:" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "collapsed": false, 374 | "input": [ 375 | "from sklearn.datasets import get_data_home\n", 376 | "get_data_home()" 377 | ], 378 | "language": "python", 379 | "metadata": {}, 380 | "outputs": [] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "collapsed": false, 385 | "input": [ 386 | "!ls /Users/jakevdp/scikit_learn_data/" 387 | ], 388 | "language": "python", 389 | "metadata": {}, 390 | "outputs": [] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "Be warned: many of these datasets are quite large, and can take a long time to download!\n", 397 | "(especially on Conference wifi).\n", 398 | "\n", 399 | "If you start a download within the IPython notebook\n", 400 | "and you want to kill it, you can use ipython's \"kernel interrupt\" feature, available in the menu or using\n", 401 | "the shortcut ``Ctrl-m i``.\n", 402 | "\n", 403 | "You can press ``Ctrl-m h`` for a list of all ``ipython`` keyboard shortcuts." 404 | ] 405 | }, 406 | { 407 | "cell_type": "heading", 408 | "level": 2, 409 | "metadata": {}, 410 | "source": [ 411 | "Loading Digits Data" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Now we'll take a look at another dataset, one where we have to put a bit\n", 419 | "more thought into how to represent the data. We can explore the data in\n", 420 | "a similar manner as above:" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "collapsed": false, 426 | "input": [ 427 | "from sklearn.datasets import load_digits\n", 428 | "digits = load_digits()" 429 | ], 430 | "language": "python", 431 | "metadata": {}, 432 | "outputs": [] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "collapsed": false, 437 | "input": [ 438 | "digits.keys()" 439 | ], 440 | "language": "python", 441 | "metadata": {}, 442 | "outputs": [] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "collapsed": false, 447 | "input": [ 448 | "n_samples, n_features = digits.data.shape\n", 449 | "print (n_samples, n_features)" 450 | ], 451 | "language": "python", 452 | "metadata": {}, 453 | "outputs": [] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "collapsed": false, 458 | "input": [ 459 | "print digits.data[0]\n", 460 | "print digits.target" 461 | ], 462 | "language": "python", 463 | "metadata": {}, 464 | "outputs": [] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "The target here is just the digit represented by the data. The data is an array of\n", 471 | "length 64... but what does this data mean?" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "There's a clue in the fact that we have two versions of the data array:\n", 479 | "``data`` and ``images``. 
Let's take a look at them:" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "collapsed": false, 485 | "input": [ 486 | "print digits.data.shape\n", 487 | "print digits.images.shape" 488 | ], 489 | "language": "python", 490 | "metadata": {}, 491 | "outputs": [] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "We can see that they're related by a simple reshaping:" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "collapsed": false, 503 | "input": [ 504 | "print np.all(digits.images.reshape((1797, 64)) == digits.data)" 505 | ], 506 | "language": "python", 507 | "metadata": {}, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "*Aside... numpy and memory efficiency:*\n", 515 | "\n", 516 | "*You might wonder whether duplicating the data is a problem. In this case, the memory\n", 517 | "overhead is very small. Even though the arrays are different shapes, they point to the\n", 518 | "same memory block, which we can see by doing a bit of digging into the guts of numpy:*" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "collapsed": false, 524 | "input": [ 525 | "print digits.data.__array_interface__['data']\n", 526 | "print digits.images.__array_interface__['data']" 527 | ], 528 | "language": "python", 529 | "metadata": {}, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "*The long integer here is a memory address: the fact that the two are the same tells\n", 537 | "us that the two arrays are looking at the same data.*" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "Let's visualize the data. It's little bit more involved than the simple scatter-plot\n", 545 | "we used above, but we can do it rather quickly." 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "collapsed": false, 551 | "input": [ 552 | "# set up the figure\n", 553 | "fig = plt.figure(figsize=(6, 6)) # figure size in inches\n", 554 | "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n", 555 | "\n", 556 | "# plot the digits: each image is 8x8 pixels\n", 557 | "for i in range(64):\n", 558 | " ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])\n", 559 | " ax.imshow(digits.images[i], cmap=plt.cm.binary)\n", 560 | " \n", 561 | " # label the image with the target value\n", 562 | " ax.text(0, 7, str(digits.target[i]))" 563 | ], 564 | "language": "python", 565 | "metadata": {}, 566 | "outputs": [] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "We see now what the features mean. Each feature is a real-valued quantity representing the\n", 573 | "darkness of a pixel in an 8x8 image of a hand-written digit.\n", 574 | "\n", 575 | "Even though each sample has data that is inherently two-dimensional, the data matrix flattens\n", 576 | "this 2D data into a **single vector**, which can be contained in one **row** of the data matrix." 
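To make the flattening concrete for a single sample, a short check along these lines (reusing the ``digits`` object loaded above) shows the correspondence between one 8x8 image and one 64-element row:

    # one 8x8 image...
    first_image = digits.images[0]
    print first_image.shape             # (8, 8)

    # ...is stored as one row of 64 features in the data matrix
    first_row = digits.data[0]
    print first_row.shape               # (64,)

    # reshaping that row recovers the original image
    print np.all(first_row.reshape(8, 8) == first_image)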
577 | ] 578 | }, 579 | { 580 | "cell_type": "heading", 581 | "level": 2, 582 | "metadata": {}, 583 | "source": [ 584 | "Exercise: working with the faces dataset" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "Here we'll take a moment for you to explore the datasets yourself.\n", 592 | "Later on we'll be using the Olivetti faces dataset.\n", 593 | "Take a moment to fetch the data (about 1.4MB), and visualize the faces.\n", 594 | "You can copy the code used to visualize the digits above, and modify it for this data." 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "collapsed": false, 600 | "input": [ 601 | "from sklearn.datasets import fetch_olivetti_faces" 602 | ], 603 | "language": "python", 604 | "metadata": {}, 605 | "outputs": [] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "collapsed": false, 610 | "input": [ 611 | "# fetch the faces data\n" 612 | ], 613 | "language": "python", 614 | "metadata": {}, 615 | "outputs": [] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "collapsed": false, 620 | "input": [ 621 | "# Use a script like above to plot the faces image data.\n", 622 | "# hint: plt.cm.bone is a good colormap for this data\n" 623 | ], 624 | "language": "python", 625 | "metadata": {}, 626 | "outputs": [] 627 | } 628 | ], 629 | "metadata": {} 630 | } 631 | ] 632 | } -------------------------------------------------------------------------------- /notebooks/03_machine_learning_101.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "03_machine_learning_101" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Machine Learning 101: General Concepts" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Here is where we start diving into the field of machine learning.\n", 23 | "\n", 24 | "By the end of this section you will\n", 25 | "\n", 26 | "- Know the basic categories of supervised learning, including classification and regression problems.\n", 27 | "- Know the basic categories of unsupervised learning, including dimensionality reduction and clustering.\n", 28 | "- Know the basic syntax of the Scikit-learn **estimator** interface.\n", 29 | "- Know how features are extracted from real-world data.\n", 30 | "\n", 31 | "In addition, we will go over several basic tools within scikit-learn which can be used to accomplish the above tasks." 32 | ] 33 | }, 34 | { 35 | "cell_type": "heading", 36 | "level": 2, 37 | "metadata": {}, 38 | "source": [ 39 | "Quick Review:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "We saw before the basic definition of Machine Learning:\n", 47 | "\n", 48 | "Machine Learning (ML) is about building programs with **tunable parameters** (typically an\n", 49 | "array of floating point values) that are adjusted automatically so as to improve\n", 50 | "their behavior by **adapting to previously seen data.**\n", 51 | "\n", 52 | "In most ML applications, the data is in a 2D array of shape ``[n_samples x n_features]``,\n", 53 | "where the number of features is the same for each object, and each feature column refers\n", 54 | "to a related piece of information about each sample." 
55 | ] 56 | }, 57 | { 58 | "cell_type": "heading", 59 | "level": 2, 60 | "metadata": {}, 61 | "source": [ 62 | "Supervised Learning, Unsupervised Learning, and Scikit-learn Estimators" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Machine learning can be broken into two broad regimes:\n", 70 | "*supervised learning* and *unsupervised learning*.\n", 71 | "We\u2019ll introduce these concepts here, and discuss them in more detail below." 72 | ] 73 | }, 74 | { 75 | "cell_type": "heading", 76 | "level": 3, 77 | "metadata": {}, 78 | "source": [ 79 | "Supervised Learning" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "In **Supervised Learning**, we have a dataset consisting of both features and labels.\n", 87 | "The task is to construct an estimator which is able to predict the label of an object\n", 88 | "given the set of features. A relatively simple example is predicting the species of \n", 89 | "iris given a set of measurements of its flower. This is a relatively simple task. \n", 90 | "Some more complicated examples are:\n", 91 | "\n", 92 | "- given a multicolor image of an object through a telescope, determine\n", 93 | " whether that object is a star, a quasar, or a galaxy.\n", 94 | "- given a photograph of a person, identify the person in the photo.\n", 95 | "- given a list of movies a person has watched and their personal rating\n", 96 | " of the movie, recommend a list of movies they would like\n", 97 | " (So-called *recommender systems*: a famous example is the [Netflix Prize](http://en.wikipedia.org/wiki/Netflix_prize)).\n", 98 | "\n", 99 | "What these tasks have in common is that there is one or more unknown\n", 100 | "quantities associated with the object which needs to be determined from other\n", 101 | "observed quantities.\n", 102 | "\n", 103 | "Supervised learning is further broken down into two categories, **classification** and **regression**.\n", 104 | "In classification, the label is discrete, while in regression, the label is continuous. For example,\n", 105 | "in astronomy, the task of determining whether an object is a star, a galaxy, or a quasar is a\n", 106 | "classification problem: the label is from three distinct categories. On the other hand, we might\n", 107 | "wish to estimate the age of an object based on such observations: this would be a regression problem,\n", 108 | "because the label (age) is a continuous quantity." 109 | ] 110 | }, 111 | { 112 | "cell_type": "heading", 113 | "level": 3, 114 | "metadata": {}, 115 | "source": [ 116 | "Unsupervised Learning" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "**Unsupervised Learning** addresses a different sort of problem. Here the data has no labels,\n", 124 | "and we are interested in finding similarities between the objects in question. In a sense,\n", 125 | "you can think of unsupervised learning as a means of discovering labels from the data itself.\n", 126 | "Unsupervised learning comprises tasks such as *dimensionality reduction*, *clustering*, and\n", 127 | "*density estimation*. For example, in the iris data discussed above, we can used unsupervised\n", 128 | "methods to determine combinations of the measurements which best display the structure of the\n", 129 | "data. As we\u2019ll see below, such a projection of the data can be used to visualize the\n", 130 | "four-dimensional dataset in two dimensions. 
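As a rough sketch of what such a projection looks like in code (dimensionality reduction is covered properly in a later notebook), the four iris measurements can be reduced to two derived dimensions like this:

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA

    iris = load_iris()

    # learn a 2-dimensional projection from the 4 measured features
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(iris.data)

    print X_2d.shape   # (150, 2): same samples, fewer dimensions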
Some more involved unsupervised learning problems are:\n", 131 | "\n", 132 | "- given detailed observations of distant galaxies, determine which features or combinations of\n", 133 | " features are most important in distinguishing between galaxies.\n", 134 | "- given a mixture of two sound sources (for example, a person talking over some music),\n", 135 | " separate the two (this is called the [blind source separation](http://en.wikipedia.org/wiki/Blind_signal_separation) problem).\n", 136 | "- given a video, isolate a moving object and categorize in relation to other moving objects which have been seen.\n", 137 | "\n", 138 | "Sometimes the two may even be combined: e.g. Unsupervised learning can be used to find useful\n", 139 | "features in heterogeneous data, and then these features can be used within a supervised\n", 140 | "framework." 141 | ] 142 | }, 143 | { 144 | "cell_type": "heading", 145 | "level": 3, 146 | "metadata": {}, 147 | "source": [ 148 | "Scikit-learn's interface" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "In scikit-learn, almost all operations are done through an estimator object.\n", 156 | "\n", 157 | "For example, a linear regression estimator can be instantiated as follows:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "collapsed": false, 163 | "input": [ 164 | "from sklearn.linear_model import LinearRegression\n", 165 | "model = LinearRegression()" 166 | ], 167 | "language": "python", 168 | "metadata": {}, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "collapsed": false, 174 | "input": [ 175 | "print model" 176 | ], 177 | "language": "python", 178 | "metadata": {}, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Scikit-learn strives to have a uniform interface across all methods,\n", 186 | "and we\u2019ll see examples of these below. Given a scikit-learn *estimator*\n", 187 | "object named `model`, the following methods are available:\n", 188 | "\n", 189 | "- *Available in all Estimators*\n", 190 | " + `model.fit()` : fit training data. For supervised learning applications,\n", 191 | " this accepts two arguments: the data `X` and the labels `y` (e.g. `model.fit(X, y)`).\n", 192 | " For unsupervised learning applications, this accepts only a single argument,\n", 193 | " the data `X` (e.g. `model.fit(X)`).\n", 194 | "- *Available in supervised estimators*\n", 195 | " + `model.predict()` : given a trained model, predict the label of a new set of data.\n", 196 | " This method accepts one argument, the new data `X_new` (e.g. `model.predict(X_new)`),\n", 197 | " and returns the learned label for each object in the array.\n", 198 | " + `model.predict_proba()` : For classification problems, some estimators also provide\n", 199 | " this method, which returns the probability that a new observation has each categorical label.\n", 200 | " In this case, the label with the highest probability is returned by `model.predict()`.\n", 201 | " + `model.score()` : for classification or regression problems, most (all?) estimators implement\n", 202 | " a score method. 
Scores are between 0 and 1, with a larger score indicating a better fit.\n", 203 | "- *Available in unsupervised estimators*\n", 204 | " + `model.transform()` : given an unsupervised model, transform new data into the new basis.\n", 205 | " This also accepts one argument `X_new`, and returns the new representation of the data based\n", 206 | " on the unsupervised model.\n", 207 | " + `model.fit_transform()` : some estimators implement this method,\n", 208 | " which more efficiently performs a fit and a transform on the same input data." 209 | ] 210 | }, 211 | { 212 | "cell_type": "heading", 213 | "level": 3, 214 | "metadata": {}, 215 | "source": [ 216 | "Diagrams of Supervised and Unsupervised Learning" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "collapsed": false, 222 | "input": [ 223 | "%pylab inline" 224 | ], 225 | "language": "python", 226 | "metadata": {}, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "collapsed": false, 232 | "input": [ 233 | "from figures import plot_supervised_chart, plot_unsupervised_chart" 234 | ], 235 | "language": "python", 236 | "metadata": {}, 237 | "outputs": [] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "collapsed": false, 242 | "input": [ 243 | "plot_supervised_chart(annotate=False)" 244 | ], 245 | "language": "python", 246 | "metadata": {}, 247 | "outputs": [] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "collapsed": false, 252 | "input": [ 253 | "plot_supervised_chart(annotate=True)" 254 | ], 255 | "language": "python", 256 | "metadata": {}, 257 | "outputs": [] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "collapsed": false, 262 | "input": [ 263 | "plot_unsupervised_chart()" 264 | ], 265 | "language": "python", 266 | "metadata": {}, 267 | "outputs": [] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "*(Aside: these charts are generated in matplotlib. You can see the code using the %load magic)*" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "collapsed": false, 279 | "input": [ 280 | "%load figures/ML_flow_chart.py" 281 | ], 282 | "language": "python", 283 | "metadata": {}, 284 | "outputs": [], 285 | "prompt_number": 12 286 | }, 287 | { 288 | "cell_type": "heading", 289 | "level": 2, 290 | "metadata": {}, 291 | "source": [ 292 | "Returning to Feature Extraction" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "Recall that previously, we have seen two types of features:\n", 300 | "\n", 301 | "- The iris dataset had measured features: (the lengths of petals, sepals, etc.)\n", 302 | "- The digits and faces datasets were pixel values (the images were pre-aligned)\n", 303 | "\n", 304 | "How might we handle other types of features?" 
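Before turning to feature extraction, the fit/predict/score pattern summarized above can be sketched end-to-end on the iris data. This evaluates on the training data only to keep the sketch short; proper validation is the subject of a later notebook:

    from sklearn.datasets import load_iris
    from sklearn.neighbors import KNeighborsClassifier

    iris = load_iris()

    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(iris.data, iris.target)            # supervised fit: fit(X, y)

    y_pred = model.predict(iris.data)            # predicted label for each sample
    print model.score(iris.data, iris.target)    # mean accuracy, between 0 and 1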
305 | ] 306 | }, 307 | { 308 | "cell_type": "heading", 309 | "level": 3, 310 | "metadata": {}, 311 | "source": [ 312 | "Categorical Features" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "Sometimes we have categorical features: for example, imagine the dataset included\n", 320 | "the colors:\n", 321 | "\n", 322 | " color in [red, blue, purple]\n", 323 | "\n", 324 | "Often it is best for categorical features to have their own dimenions:\n", 325 | "\n", 326 | "The enriched iris feature set would hence be in this case:\n", 327 | "\n", 328 | "- sepal length in cm\n", 329 | "- sepal width in cm\n", 330 | "- petal length in cm\n", 331 | "- petal width in cm\n", 332 | "- color#purple (1.0 or 0.0)\n", 333 | "- color#blue (1.0 or 0.0)\n", 334 | "- color#red (1.0 or 0.0)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "heading", 339 | "level": 3, 340 | "metadata": {}, 341 | "source": [ 342 | "Unstructured data" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "Most often, data does not come in a nice, structured, CSV file where every\n", 350 | "column measures the same thing. In this case, we must be more imaginitive\n", 351 | "in how we extract features.\n", 352 | "\n", 353 | "Here is an overview of strategies to turn unstructed data items into arrays of numerical features." 354 | ] 355 | }, 356 | { 357 | "cell_type": "heading", 358 | "level": 4, 359 | "metadata": {}, 360 | "source": [ 361 | "Text documents:\t" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "- Count the frequency of each word or pair of consecutive words in each document. This approach is called **Bag of Words**\n", 369 | "\n", 370 | "*Note:* we include other file formats such as HTML and PDF in this category:\n", 371 | "an ad-hoc preprocessing step is required to extract the plain text in\n", 372 | "UTF-8 encoding for instance.\n", 373 | "\n", 374 | "For a tutorial on text processing in scikit-learn, see\n", 375 | "http://scikit-learn.github.com/scikit-learn-tutorial/working_with_text_data.html" 376 | ] 377 | }, 378 | { 379 | "cell_type": "heading", 380 | "level": 4, 381 | "metadata": {}, 382 | "source": [ 383 | "Images:\t" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "- Rescale the picture to a fixed size and take all the raw pixels values (with or without luminosity normalization)\n", 391 | "\n", 392 | "- Take some transformation of the signal (gradients in each pixel, wavelets transforms...)\n", 393 | "\n", 394 | "- Compute the Euclidean, Manhattan or cosine similarities of the sample to a set reference prototype images aranged\n", 395 | " in a code book. The code book may have been previously extracted from the same dataset using an unsupervised\n", 396 | " learning algorithm on the raw pixel signal. Each feature value is the distance to one element of the code book.\n", 397 | "\n", 398 | "- Perform local feature extraction: split the picture into small regions and perform feature extraction locally in each area,\n", 399 | " Then combine all the features of the individual areas into a single array." 
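Returning to the categorical-color idea above for a concrete sketch: scikit-learn's ``DictVectorizer`` performs exactly this kind of expansion, giving each category its own 0/1 column. The measurements here are invented purely for illustration:

    from sklearn.feature_extraction import DictVectorizer

    measurements = [
        {'sepal length': 5.1, 'sepal width': 3.5, 'color': 'red'},
        {'sepal length': 6.7, 'sepal width': 3.0, 'color': 'blue'},
    ]

    vec = DictVectorizer()
    X = vec.fit_transform(measurements).toarray()

    print vec.get_feature_names()   # e.g. 'color=blue', 'color=red', ...
    print X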
400 | ] 401 | }, 402 | { 403 | "cell_type": "heading", 404 | "level": 4, 405 | "metadata": {}, 406 | "source": [ 407 | "Sounds:\t" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "Same type of strategies as for images; the difference its it's a 1D rather than 2D space.\n", 415 | "\n", 416 | "For more information on feature extraction in scikit-learn, see\n", 417 | "http://scikit-learn.org/stable/modules/feature_extraction.html" 418 | ] 419 | }, 420 | { 421 | "cell_type": "heading", 422 | "level": 2, 423 | "metadata": {}, 424 | "source": [ 425 | "Looking Ahead" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "In the next couple notebooks, we will explore a simple examples of classification,\n", 433 | "regression, dimensionality reduction, and clustering using the datasets we've\n", 434 | "seen." 435 | ] 436 | } 437 | ], 438 | "metadata": {} 439 | } 440 | ] 441 | } -------------------------------------------------------------------------------- /notebooks/04_houses_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "04_houses_regression" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Supervised Learning: Regression of Housing Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "By the end of this section you will\n", 23 | "\n", 24 | "- Know how to instantiate a scikit-learn regression model\n", 25 | "- Know how to train a regressor by calling the `fit(...)` method\n", 26 | "- Know how to predict new labels by calling the `predict(...)` method\n", 27 | "\n", 28 | "Here we'll do a short example of a regression problem: learning a continuous value\n", 29 | "from a set of features.\n", 30 | "\n", 31 | "We'll use the simple Boston house prices set, available in scikit-learn. This\n", 32 | "records measurements of 13 attributes of housing markets around Boston, as well\n", 33 | "as the median price. The question is: can you predict the price of a new\n", 34 | "market given its attributes?\n", 35 | "\n", 36 | "First we'll load the dataset:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "collapsed": false, 42 | "input": [ 43 | "from sklearn.datasets import load_boston\n", 44 | "data = load_boston()\n", 45 | "print data.keys()" 46 | ], 47 | "language": "python", 48 | "metadata": {}, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "We can see that there are just over 500 data points:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "collapsed": false, 61 | "input": [ 62 | "print data.data.shape\n", 63 | "print data.target.shape" 64 | ], 65 | "language": "python", 66 | "metadata": {}, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "The ``DESCR`` variable has a long description of the dataset:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "collapsed": false, 79 | "input": [ 80 | "print data.DESCR" 81 | ], 82 | "language": "python", 83 | "metadata": {}, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "It often helps to quickly visualize pieces of the data using histograms, scatter plots,\n", 91 | "or other plot types. 
Here we'll load pylab and show a histogram of the target values:\n", 92 | "the median price in each neighborhood." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "collapsed": false, 98 | "input": [ 99 | "%pylab inline" 100 | ], 101 | "language": "python", 102 | "metadata": {}, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "collapsed": false, 108 | "input": [ 109 | "plt.hist(data.target)\n", 110 | "plt.xlabel('price ($1000s)')\n", 111 | "plt.ylabel('count')" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "**Quick Exercise:** Try some scatter plots of the features versus the target.\n", 122 | "\n", 123 | "Are there any features that seem to have a strong correlation with the\n", 124 | "target value? Any that don't?\n", 125 | "\n", 126 | "Remember, you can get at the data columns using:\n", 127 | "\n", 128 | " column_i = data.data[:, i]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "collapsed": false, 134 | "input": [], 135 | "language": "python", 136 | "metadata": {}, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "This is a manual version of a technique called **feature selection**.\n", 144 | "\n", 145 | "Sometimes, in Machine Learning it is useful to use \n", 146 | "feature selection to decide which features are most useful for a\n", 147 | "particular problem. Automated methods exist which quantify this sort\n", 148 | "of exercise of choosing the most informative features. We won't cover\n", 149 | "feature selection in this tutorial, but you can read about it elsewhere." 150 | ] 151 | }, 152 | { 153 | "cell_type": "heading", 154 | "level": 2, 155 | "metadata": {}, 156 | "source": [ 157 | "Predicting Home Prices: a Simple Linear Regression" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Now we'll use ``scikit-learn`` to perform a simple linear regression\n", 165 | "on the housing data. There are many possibilities of regressors to\n", 166 | "use. A particularly simple one is ``LinearRegression``: this is\n", 167 | "basically a wrapper around an ordinary least squares calculation.\n", 168 | "\n", 169 | "We'll set it up like this:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "from sklearn.linear_model import LinearRegression\n", 177 | "\n", 178 | "clf = LinearRegression()\n", 179 | "\n", 180 | "clf.fit(data.data, data.target)" 181 | ], 182 | "language": "python", 183 | "metadata": {}, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "collapsed": false, 189 | "input": [ 190 | "predicted = clf.predict(data.data)" 191 | ], 192 | "language": "python", 193 | "metadata": {}, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "collapsed": false, 199 | "input": [ 200 | "plt.scatter(data.target, predicted)\n", 201 | "plt.plot([0, 50], [0, 50], '--k')\n", 202 | "plt.axis('tight')\n", 203 | "plt.xlabel('True price ($1000s)')\n", 204 | "plt.ylabel('Predicted price ($1000s)')" 205 | ], 206 | "language": "python", 207 | "metadata": {}, 208 | "outputs": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "The prediction at least correlates with the true price, though there\n", 215 | "are clearly some biases. 
We could imagine evaluating the performance\n", 216 | "of the regressor by, say, computing the RMS residuals between the\n", 217 | "true and predicted price. There are some subtleties in this, however,\n", 218 | "which we'll cover in a later section." 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "There are many examples of regression-type problems in machine learning\n", 226 | "\n", 227 | "- **Sales:** given consumer data, predict how much they will spend\n", 228 | "- **Advertising:** given information about a user, predict the click-through rate for a web ad.\n", 229 | "- **Collaborative Filtering:** given a collection of user-ratings for movies, predict preferences for other movies & users\n", 230 | "- **Astronomy:** given observations of galaxies, predict their mass or redshift\n", 231 | "\n", 232 | "And much, much more." 233 | ] 234 | }, 235 | { 236 | "cell_type": "heading", 237 | "level": 2, 238 | "metadata": {}, 239 | "source": [ 240 | "Exercise: Decision Tree Regression" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "There are many other types of regressors available in scikit-learn:\n", 248 | "we'll try one more here.\n", 249 | "\n", 250 | "**Use the DecisionTreeRegressor class to fit the housing data**.\n", 251 | "\n", 252 | "You can copy and paste some of the above code, replacing `LinearRegression`\n", 253 | "with `DecisionTreeRegressor`." 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "collapsed": false, 259 | "input": [ 260 | "from sklearn.tree import DecisionTreeRegressor\n", 261 | "# Instantiate the model, fit the results, and scatter in vs. out" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "collapsed": false, 270 | "input": [], 271 | "language": "python", 272 | "metadata": {}, 273 | "outputs": [] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "Do you see anything surprising in the results?\n", 280 | "\n", 281 | "The Decision Tree classifier is an example of an *instance-based* algorithm.\n", 282 | "Rather than try to determine a model that best fits the data, an instance-based\n", 283 | "algorithm in some way matches unknown data to the known catalog of training points.\n", 284 | "\n", 285 | "How does this fact explain the results you saw here?" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "We'll return to the subject of Decision trees at a later point in the tutorial." 
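One possible sketch of the decision-tree exercise, which also computes the RMS residual mentioned earlier (a reference solution lives in ``soln/boston_decision_tree.py`` in this repository; the variable names here are illustrative):

    # One possible solution sketch for the DecisionTreeRegressor exercise.
    import numpy as np
    import matplotlib.pyplot as plt  # already available as plt via %pylab inline
    from sklearn.datasets import load_boston
    from sklearn.tree import DecisionTreeRegressor

    data = load_boston()

    tree = DecisionTreeRegressor()
    tree.fit(data.data, data.target)
    predicted = tree.predict(data.data)

    plt.scatter(data.target, predicted)
    plt.plot([0, 50], [0, 50], '--k')
    plt.xlabel('True price ($1000s)')
    plt.ylabel('Predicted price ($1000s)')

    # RMS residual on the *training* data -- see the caveat above about
    # evaluating a model on the data it was trained on.
    print(np.sqrt(np.mean((data.target - predicted) ** 2)))

Because an unconstrained decision tree can essentially memorize the training set, the points will typically fall almost exactly on the diagonal and the RMS residual will be close to zero; this is the instance-based behavior discussed above.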
293 | ] 294 | } 295 | ], 296 | "metadata": {} 297 | } 298 | ] 299 | } -------------------------------------------------------------------------------- /notebooks/05_iris_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "05_iris_classification" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Supervised Learning: Classification of Iris Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "By the end of this section you will\n", 23 | "\n", 24 | "- Know how to instantiate a scikit-learn classifier\n", 25 | "- Know how to train a classifier by calling the `fit(...)` method\n", 26 | "- Know how to predict new labels by calling the `predict(...)` method\n", 27 | "\n", 28 | "In this example we will perform classification of the iris data with several different classifiers." 29 | ] 30 | }, 31 | { 32 | "cell_type": "heading", 33 | "level": 2, 34 | "metadata": {}, 35 | "source": [ 36 | "Linear Support Vector Classifier (SVC)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "First we'll load the iris data as we did before:" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "collapsed": false, 49 | "input": [ 50 | "from sklearn.datasets import load_iris\n", 51 | "iris = load_iris()" 52 | ], 53 | "language": "python", 54 | "metadata": {}, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "In the iris dataset example, suppose we are assigned the task to guess\n", 62 | "the class of an individual flower given the measurements of petals and\n", 63 | "sepals. This is a *classification* task, hence we have:" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "collapsed": false, 69 | "input": [ 70 | "X = iris.data\n", 71 | "y = iris.target\n", 72 | "\n", 73 | "print X.shape\n", 74 | "print y.shape" 75 | ], 76 | "language": "python", 77 | "metadata": {}, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Once the data has this format it is trivial to train a classifier, for instance a support vector machine with a linear kernel:" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "collapsed": false, 90 | "input": [ 91 | "from sklearn.svm import LinearSVC" 92 | ], 93 | "language": "python", 94 | "metadata": {}, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "``LinearSVC`` is an example of a scikit-learn classifier. If you're curious about how it is used, you can use ``ipython``'s ``\"?\"`` magic function to see the documentation:" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "collapsed": false, 107 | "input": [], 108 | "language": "python", 109 | "metadata": {}, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "The first thing to do is to create an instance of the classifier. 
This can be done simply by calling the class name, with any arguments that the object accepts:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "collapsed": false, 122 | "input": [ 123 | "clf = LinearSVC(loss = 'l2')" 124 | ], 125 | "language": "python", 126 | "metadata": {}, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "``clf`` is a statistical model that has parameters that control the learning algorithm (those parameters are sometimes called the *hyperparameters*). Those hyperparameters can be supplied by the user in the constructor of the model. We will explain later how to choose a good combination using either simple empirical rules or data driven selection:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "collapsed": false, 139 | "input": [ 140 | "print clf" 141 | ], 142 | "language": "python", 143 | "metadata": {}, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "By default the model parameters are not initialized. They will be tuned automatically from the data by calling the ``fit`` method with the data ``X`` and labels ``y``:" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "collapsed": false, 156 | "input": [ 157 | "clf = clf.fit(X, y)" 158 | ], 159 | "language": "python", 160 | "metadata": {}, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "We can now see some of the fit parameters within the classifier object.\n", 168 | "\n", 169 | "**In scikit-learn, parameters defined by training have a trailing underscore.**" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "clf.coef_" 177 | ], 178 | "language": "python", 179 | "metadata": {}, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "collapsed": false, 185 | "input": [ 186 | "clf.intercept_" 187 | ], 188 | "language": "python", 189 | "metadata": {}, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "Once the model is trained, it can be used to predict the most likely outcome on unseen data. 
For instance let us define a list of simple sample that looks like the first sample of the iris dataset:" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "collapsed": false, 202 | "input": [ 203 | "X_new = [[ 5.0, 3.6, 1.3, 0.25]]\n", 204 | "\n", 205 | "clf.predict(X_new)" 206 | ], 207 | "language": "python", 208 | "metadata": {}, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "All classification tasks involve predicting an unknown category based on observed features.\n", 216 | "\n", 217 | "Some examples of interested classification tasks:\n", 218 | "\n", 219 | "- **E-mail classification:** label email as spam, normal, priority mail\n", 220 | "- **Language identification:** label documents as English, Spanish, German, etc.\n", 221 | "- **News articles categorization:** label articles as business, technology, sports...\n", 222 | "- **Sentiment analysis in customer feedback:** label feedback as negative, neutral, positive\n", 223 | "- **Face verification in pictures:** label images as same / different person\n", 224 | "- **Speaker verification in voice recordings:** label recording as same / different person\n", 225 | "- **Astronomical Sources:** label object as star / quasar / galaxy" 226 | ] 227 | }, 228 | { 229 | "cell_type": "heading", 230 | "level": 2, 231 | "metadata": {}, 232 | "source": [ 233 | "Exercise: Using a Different Classifier" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Now we'll take a few minutes and try out another learning model. Because of ``scikit-learn``'s uniform interface, the syntax is identical to that of ``LinearSVC`` above.\n", 241 | "\n", 242 | "There are many possibilities of classifiers; you could try any of the methods discussed at . Alternatively, you can explore what's available in ``scikit-learn`` using just the tab-completion feature. For example, import the ``linear_model`` submodule:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "collapsed": false, 248 | "input": [ 249 | "from sklearn import linear_model" 250 | ], 251 | "language": "python", 252 | "metadata": {}, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "And use the tab completion to find what's available. Type ``linear_model.`` and then the tab key to see an interactive list of the functions within this submodule. The ones which begin with capital letters are the models which are available." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "collapsed": false, 265 | "input": [], 266 | "language": "python", 267 | "metadata": {}, 268 | "outputs": [] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "Now select a new classifier and try out a classification of the iris data.\n", 275 | "\n", 276 | "Some good choices are\n", 277 | "\n", 278 | "- ``sklearn.naive_bayes.GaussianNB`` :\n", 279 | " Gaussian Naive Bayes model. 
This is an unsophisticated model which can be trained very quickly.\n", 280 | " It is often used to obtain baseline results before moving to a more sophisticated classifier.\n", 281 | "\n", 282 | "- ``sklearn.svm.LinearSVC`` :\n", 283 | " Support Vector Machines without kernels based on liblinear\n", 284 | "\n", 285 | "- ``sklearn.svm.SVC`` :\n", 286 | " Support Vector Machines with kernels based on libsvm\n", 287 | "\n", 288 | "- ``sklearn.linear_model.LogisticRegression`` :\n", 289 | " Regularized Logistic Regression based on liblinear\n", 290 | "\n", 291 | "- ``sklearn.linear_model.SGDClassifier`` :\n", 292 | " Regularized linear models (SVM or logistic regression) using a Stochastic Gradient Descent algorithm written in Cython\n", 293 | "\n", 294 | "- ``sklearn.neighbors.NeighborsClassifier`` :\n", 295 | " k-Nearest Neighbors classifier based on the ball tree datastructure for low dimensional data and brute force search for high dimensional data\n", 296 | "\n", 297 | "- ``sklearn.tree.DecisionTreeClassifier`` :\n", 298 | " A classifier based on a series of binary decisions. This is another very fast classifier, which can be very powerful.\n", 299 | "\n", 300 | "Choose one of the above, import it, and use the ``?`` feature to learn about it." 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "collapsed": false, 306 | "input": [], 307 | "language": "python", 308 | "metadata": {}, 309 | "outputs": [] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "Now instantiate this model as we did with ``LinearSVC`` above." 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "collapsed": false, 321 | "input": [], 322 | "language": "python", 323 | "metadata": {}, 324 | "outputs": [] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Now use our data ``X`` and ``y`` to train the model, using the ``fit(...)`` method" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "collapsed": false, 336 | "input": [], 337 | "language": "python", 338 | "metadata": {}, 339 | "outputs": [] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "Now call the ``predict`` method, and find the classification of ``X_new``." 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "collapsed": false, 351 | "input": [], 352 | "language": "python", 353 | "metadata": {}, 354 | "outputs": [] 355 | }, 356 | { 357 | "cell_type": "heading", 358 | "level": 2, 359 | "metadata": {}, 360 | "source": [ 361 | "Probabilistic Prediction" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "Some models have additional prediction modes. For example, if ``clf`` is a ``LogisticRegression`` classifier, then it is possible to do a probibilistic prediction for any point. 
This can be done through the ``predict_proba`` function:" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "collapsed": false, 374 | "input": [ 375 | "from sklearn.linear_model import LogisticRegression\n", 376 | "clf2 = LogisticRegression()\n", 377 | "clf2.fit(X, y)\n", 378 | "print clf2.predict_proba(X_new)" 379 | ], 380 | "language": "python", 381 | "metadata": {}, 382 | "outputs": [] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "The result gives the probability (between zero and one) that the test point comes from any of the three classes.\n", 389 | "\n", 390 | "This means that the model estimates that the sample in X_new has:\n", 391 | "\n", 392 | "- 90% likelyhood to belong to the \u2018setosa\u2019 class (``target = 0``)\n", 393 | "- 9% likelyhood to belong to the \u2018versicolor\u2019 class (``target = 1``)\n", 394 | "- < 1% likelyhood to belong to the \u2018virginica\u2019 class (``target = 2``)\n", 395 | "\n", 396 | "Of course, the predict method that outputs the label id of the most likely outcome is also available:" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "collapsed": false, 402 | "input": [ 403 | "clf2.predict(X_new)" 404 | ], 405 | "language": "python", 406 | "metadata": {}, 407 | "outputs": [] 408 | }, 409 | { 410 | "cell_type": "heading", 411 | "level": 2, 412 | "metadata": {}, 413 | "source": [ 414 | "Evaluating the Model" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "Predicting a new value is nice, but how do we guage how well we've done?\n", 422 | "We'll explore this in more depth later, but here's a quick taste now.\n", 423 | "\n", 424 | "Let's get a rough evaluation our model by using\n", 425 | "it to predict the values of the training data:" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "collapsed": false, 431 | "input": [ 432 | "y_model = clf2.predict(X)" 433 | ], 434 | "language": "python", 435 | "metadata": {}, 436 | "outputs": [] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "collapsed": false, 441 | "input": [ 442 | "print y_model == y" 443 | ], 444 | "language": "python", 445 | "metadata": {}, 446 | "outputs": [] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "We see that most of the predictions are correct!\n", 453 | "\n", 454 | "Be careful, though: what we've done here is not a very good model evaluation\n", 455 | "scheme. In a later section we'll introduce a set of techniques called\n", 456 | "*Cross-validation*, which treats model evaluation a little bit more carefully." 457 | ] 458 | } 459 | ], 460 | "metadata": {} 461 | } 462 | ] 463 | } -------------------------------------------------------------------------------- /notebooks/06_iris_dimensionality.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "06_iris_dimensionality" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Unsupervised Learning: Dimensionality Reduction and Visualization" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Previously we worked on visualizing the iris data by plotting\n", 23 | "pairs of dimensions by trial and error, until we arrived at\n", 24 | "the best pair of dimensions for our dataset. 
Here we will\n", 25 | "use an unsupervised *dimensionality reduction* algorithm\n", 26 | "to accomplish this more automatically." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "By the end of this section you will\n", 34 | "\n", 35 | "- Know how to instantiate and train an unsupervised dimensionality reduction algorithm:\n", 36 | " Principal Component Analysis (PCA)\n", 37 | "- Know how to use PCA to visualize high-dimensional data" 38 | ] 39 | }, 40 | { 41 | "cell_type": "heading", 42 | "level": 2, 43 | "metadata": {}, 44 | "source": [ 45 | "Dimensionality Reduction: PCA" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Dimensionality reduction is the task of deriving a set of new\n", 53 | "artificial features that is smaller than the original feature\n", 54 | "set while retaining most of the variance of the original data.\n", 55 | "Here we'll use a common but powerful dimensionality reduction\n", 56 | "technique called Principal Component Analysis (PCA).\n", 57 | "We'll perform PCA on the iris dataset that we saw before:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": false, 63 | "input": [ 64 | "from sklearn.datasets import load_iris\n", 65 | "iris = load_iris()\n", 66 | "X = iris.data\n", 67 | "y = iris.target" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "PCA is performed using linear combinations of the original features\n", 78 | "using a truncated Singular Value Decomposition of the matrix X so\n", 79 | "as to project the data onto a base of the top singular vectors.\n", 80 | "If the number of retained components is 2 or 3, PCA can be used\n", 81 | "to visualize the dataset." 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "collapsed": false, 87 | "input": [ 88 | "from sklearn.decomposition import PCA\n", 89 | "pca = PCA(n_components=2, whiten=True)\n", 90 | "pca.fit(X)" 91 | ], 92 | "language": "python", 93 | "metadata": {}, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Once fitted, the pca model exposes the singular vectors in the components_ attribute:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "collapsed": false, 106 | "input": [ 107 | "pca.components_" 108 | ], 109 | "language": "python", 110 | "metadata": {}, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Other attributes are available as well:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "collapsed": false, 123 | "input": [ 124 | "pca.explained_variance_ratio_" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "collapsed": false, 133 | "input": [ 134 | "pca.explained_variance_ratio_.sum()" 135 | ], 136 | "language": "python", 137 | "metadata": {}, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Let us project the iris dataset along those first two dimensions:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "collapsed": false, 150 | "input": [ 151 | "X_pca = pca.transform(X)" 152 | ], 153 | "language": "python", 154 | "metadata": {}, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "PCA `normalizes` and `whitens` the data, which means that the data\n", 162 | "is now centered on both components with unit variance:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "collapsed": false, 168 | "input": [ 169 | "import numpy as np\n", 170 | "np.round(X_pca.mean(axis=0), decimals=5)" 171 | ], 172 | "language": "python", 173 | "metadata": {}, 174 | "outputs": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "collapsed": false, 179 | "input": [ 180 | "np.round(X_pca.std(axis=0), decimals=5)" 181 | ], 182 | "language": "python", 183 | "metadata": {}, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "Furthermore, the samples components do no longer carry any linear correlation:" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "collapsed": false, 196 | "input": [ 197 | "np.corrcoef(X_pca.T)" 198 | ], 199 | "language": "python", 200 | "metadata": {}, 201 | "outputs": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "We can visualize the projection using pylab, but first\n", 208 | "let's make sure our ipython notebook is in pylab inline mode" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "collapsed": false, 214 | "input": [ 215 | "%pylab inline" 216 | ], 217 | "language": "python", 218 | "metadata": {}, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "Now we can visualize the results using the following utility function:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "collapsed": false, 231 | "input": [ 232 | "import pylab as pl\n", 233 | "from itertools import cycle\n", 234 | "\n", 235 | "def plot_PCA_2D(data, target, target_names):\n", 236 | " colors = cycle('rgbcmykw')\n", 237 | " target_ids = 
range(len(target_names))\n", 238 | " pl.figure()\n", 239 | " for i, c, label in zip(target_ids, colors, target_names):\n", 240 | " pl.scatter(data[target == i, 0], data[target == i, 1],\n", 241 | " c=c, label=label)\n", 242 | " pl.legend()" 243 | ], 244 | "language": "python", 245 | "metadata": {}, 246 | "outputs": [] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Now calling this function for our data, we see the plot:" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "collapsed": false, 258 | "input": [ 259 | "plot_PCA_2D(X_pca, iris.target, iris.target_names)" 260 | ], 261 | "language": "python", 262 | "metadata": {}, 263 | "outputs": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "Note that this projection was determined *without* any information about the\n", 270 | "labels (represented by the colors): this is the sense in which the learning\n", 271 | "is **unsupervised**. Nevertheless, we see that the projection gives us insight\n", 272 | "into the distribution of the different flowers in parameter space: notably,\n", 273 | "*iris setosa* is much more distinct than the other two species." 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "Note also that the default implementation of PCA computes the\n", 281 | "singular value decomposition (SVD) of the full\n", 282 | "data matrix, which is not scalable when both ``n_samples`` and\n", 283 | "``n_features`` are big (more that a few thousands).\n", 284 | "If you are interested in a number of components that is much\n", 285 | "smaller than both ``n_samples`` and ``n_features``, consider using\n", 286 | "`sklearn.decomposition.RandomizedPCA` instead." 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "Other dimensionality reduction techniques which are useful to know about:\n", 294 | "\n", 295 | "- `sklearn.decomposition.PCA`:
\n", 296 | " Principal Component Analysis\n", 297 | "- `sklearn.decomposition.RandomizedPCA`:
\n", 298 | " fast non-exact PCA implementation based on a randomized algorithm\n", 299 | "- `sklearn.decomposition.SparsePCA`:
\n", 300 | " PCA variant including L1 penalty for sparsity\n", 301 | "- `sklearn.decomposition.FastICA`:
\n", 302 | " Independent Component Analysis\n", 303 | "- `sklearn.decomposition.NMF`:
\n", 304 | " non-negative matrix factorization\n", 305 | "- `sklearn.manifold.LocallyLinearEmbedding`:
\n", 306 | " nonlinear manifold learning technique based on local neighborhood geometry\n", 307 | "- `sklearn.manifold.IsoMap`:
\n", 308 | " nonlinear manifold learning technique based on a sparse graph algorithm" 309 | ] 310 | }, 311 | { 312 | "cell_type": "heading", 313 | "level": 2, 314 | "metadata": {}, 315 | "source": [ 316 | "Exercise:" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "Repeat the above dimensionality reduction with\n", 324 | "``sklearn.decomposition.RandomizedPCA``.\n", 325 | "\n", 326 | "You can re-use the ``plot_PCA_2D`` function from above.\n", 327 | "Are the results similar to those from standard PCA?" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "collapsed": false, 333 | "input": [], 334 | "language": "python", 335 | "metadata": {}, 336 | "outputs": [] 337 | } 338 | ], 339 | "metadata": {} 340 | } 341 | ] 342 | } -------------------------------------------------------------------------------- /notebooks/07_iris_clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "07_iris_clustering" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Clustering of Iris Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Clustering is the task of gathering samples into groups of similar\n", 23 | "samples according to some predefined similarity or dissimilarity\n", 24 | "measure (such as the Euclidean distance).\n", 25 | "In this section we will explore a basic clustering task on the\n", 26 | "iris data." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "By the end of this section you will\n", 34 | "\n", 35 | "- Know how to instantiate and train KMeans, an unsupervised clustering algorithm\n", 36 | "- Know several other interesting clustering algorithms within scikit-learn" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Let's re-use the results of the 2D PCA of the iris dataset in order to\n", 44 | "explore clustering. 
First we need to repeat some of the code from the\n", 45 | "previous notebook" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [ 52 | "# make sure ipython inline mode is activated\n", 53 | "%pylab inline" 54 | ], 55 | "language": "python", 56 | "metadata": {}, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "collapsed": false, 62 | "input": [ 63 | "# all of this is copied from the previous notebook, '06_iris_dimensionality' \n", 64 | "from sklearn.datasets import load_iris\n", 65 | "from sklearn.decomposition import PCA\n", 66 | "import pylab as pl\n", 67 | "from itertools import cycle\n", 68 | "\n", 69 | "iris = load_iris()\n", 70 | "X = iris.data\n", 71 | "y = iris.target\n", 72 | "\n", 73 | "pca = PCA(n_components=2, whiten=True).fit(X)\n", 74 | "X_pca = pca.transform(X)\n", 75 | "\n", 76 | "def plot_2D(data, target, target_names):\n", 77 | " colors = cycle('rgbcmykw')\n", 78 | " target_ids = range(len(target_names))\n", 79 | " pl.figure()\n", 80 | " for i, c, label in zip(target_ids, colors, target_names):\n", 81 | " pl.scatter(data[target == i, 0], data[target == i, 1],\n", 82 | " c=c, label=label)\n", 83 | " pl.legend()" 84 | ], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "To remind ourselves what we're looking at, let's again plot the PCA components\n", 94 | "we defined in the last notebook:" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "collapsed": false, 100 | "input": [ 101 | "plot_2D(X_pca, iris.target, iris.target_names)" 102 | ], 103 | "language": "python", 104 | "metadata": {}, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Now we will use one of the simplest clustering algorithms, K-means.\n", 112 | "This is an iterative algorithm which searches for three cluster\n", 113 | "centers such that the distance from each point to its cluster is\n", 114 | "minimizied. First, let's step back for a second,\n", 115 | "look at the above plot, and think about what this will do.\n", 116 | "The algorithm will look for three cluster centers, and label the\n", 117 | "points according to which cluster center they're closest to.\n", 118 | "\n", 119 | "**Question:** what would you expect the output to look like?" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "collapsed": false, 125 | "input": [ 126 | "from sklearn.cluster import KMeans\n", 127 | "from numpy.random import RandomState\n", 128 | "rng = RandomState(42)\n", 129 | "\n", 130 | "kmeans = KMeans(n_clusters=3, random_state=rng)\n", 131 | "kmeans.fit(X_pca)" 132 | ], 133 | "language": "python", 134 | "metadata": {}, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [ 141 | "import numpy as np\n", 142 | "np.round(kmeans.cluster_centers_, decimals=2)" 143 | ], 144 | "language": "python", 145 | "metadata": {}, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "The ``labels_`` attribute of the K means estimator contains the ID of the\n", 153 | "cluster that each point is assigned to." 
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "collapsed": false, 159 | "input": [ 160 | "kmeans.labels_" 161 | ], 162 | "language": "python", 163 | "metadata": {}, 164 | "outputs": [] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "The K-means algorithm has been used to infer cluster labels for the\n", 171 | "points. Let's call the ``plot_2D`` function again, but color the points\n", 172 | "based on the cluster labels rather than the iris species." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "collapsed": false, 178 | "input": [ 179 | "plot_2D(X_pca, kmeans.labels_, [\"c0\", \"c1\", \"c2\"])\n", 180 | "plt.title('K-Means labels')\n", 181 | "\n", 182 | "plot_2D(X_pca, iris.target, iris.target_names)\n", 183 | "plt.title('True labels')" 184 | ], 185 | "language": "python", 186 | "metadata": {}, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "heading", 191 | "level": 2, 192 | "metadata": {}, 193 | "source": [ 194 | "Some Notable Clustering Routines" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "The following are two well-known clustering algorithms. Like most unsupervised learning\n", 202 | "models in the scikit, they expect the data to be clustered to have the shape `(n_samples, n_features)`:\n", 203 | "\n", 204 | "- `sklearn.cluster.KMeans`:
\n", 205 | " The simplest, yet effective clustering algorithm. Needs to be provided with the\n", 206 | " number of clusters in advance, and assumes that the data is normalized as input\n", 207 | " (but use a PCA model as preprocessor).\n", 208 | "- `sklearn.cluster.MeanShift`:
\n", 209 | " Can find better looking clusters than KMeans but is not scalable to high number of samples.\n", 210 | "- `sklearn.cluster.DBSCAN`:
\n", 211 | " Can detect irregularly shaped clusters based on density, i.e. sparse regions in\n", 212 | " the input space are likely to become inter-cluster boundaries. Can also detect\n", 213 | " outliers (samples that are not part of a cluster).\n", 214 | "\n", 215 | "Other clustering algorithms do not work with a data array of shape (n_samples, n_features)\n", 216 | "but directly with a precomputed affinity matrix of shape (n_samples, n_samples):\n", 217 | "\n", 218 | "- `sklearn.cluster.AffinityPropagation`:
\n", 219 | " Clustering algorithm based on message passing between data points.\n", 220 | "- `sklearn.cluster.SpectralClustering`:
\n", 221 | " KMeans applied to a projection of the normalized graph Laplacian: finds\n", 222 | " normalized graph cuts if the affinity matrix is interpreted as an adjacency matrix of a graph.\n", 223 | "- `sklearn.cluster.Ward`:
\n", 224 | " Ward implements hierarchical clustering based on the Ward algorithm,\n", 225 | " a variance-minimizing approach. At each step, it minimizes the sum of\n", 226 | " squared differences within all clusters (inertia criterion).\n", 227 | "- `sklearn.cluster.DBSCAN`:
\n", 228 | " DBSCAN can work with either an array of samples or an affinity matrix." 229 | ] 230 | }, 231 | { 232 | "cell_type": "heading", 233 | "level": 2, 234 | "metadata": {}, 235 | "source": [ 236 | "Some Applications of Clustering" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Here are some common applications of clustering algorithms:\n", 244 | "\n", 245 | "- Building customer profiles for market analysis\n", 246 | "- Grouping related web news (e.g. Google News) and web search results\n", 247 | "- Grouping related stock quotes for investment portfolio management\n", 248 | "- Can be used as a preprocessing step for recommender systems\n", 249 | "- Can be used to build a code book of prototype samples for unsupervised feature extraction for supervised learning algorithms\n" 250 | ] 251 | }, 252 | { 253 | "cell_type": "heading", 254 | "level": 2, 255 | "metadata": {}, 256 | "source": [ 257 | "Exercise" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Perform the K-Means cluster search again, but this time learn the\n", 265 | "clusters using the full data matrix ``X``, rather than the projected\n", 266 | "matrix ``X_pca``.\n", 267 | "\n", 268 | "Does this change the results?\n", 269 | "\n", 270 | "Plot the results (you can still use X_pca for visualization, but plot\n", 271 | "the labels derived from the full 4-D set).\n", 272 | "Do the 4D K-means labels look closer to the true labels?" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "collapsed": false, 278 | "input": [], 279 | "language": "python", 280 | "metadata": {}, 281 | "outputs": [] 282 | } 283 | ], 284 | "metadata": {} 285 | } 286 | ] 287 | } -------------------------------------------------------------------------------- /notebooks/08_linearly_separable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "08_linearly_separable" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Quick Example: Linearly Separable Data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "*Much of the code here is from the scikit-learn documentation:* http://scikit-learn.org/stable/auto_examples/applications/svm_gui.html" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Some supervised learning problems can be solved by very simple models\n", 30 | "(called generalized linear models) depending on the data. Others simply don\u2019t.\n", 31 | "\n", 32 | "To grasp the difference between the two cases, we'll run the interactive\n", 33 | "graphical example found in the figures directory. 
To do this, you can\n", 34 | "open a terminal and run the file ``svm_gui.py``" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | " >$ cd scripts\n", 42 | " >$ python svm_gui.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Put some data points belonging to one of the two target\n", 50 | "classes (\u2018white\u2019 or \u2018black\u2019) using left click and right click.\n", 51 | "Choose some parameters of a Support Vector Machine to be trained\n", 52 | "on this toy dataset (`n_samples` is the number of clicks, `n_features` is 2).\n", 53 | "Click the Fit but to train the model and see the decision boundary.\n", 54 | "The accurracy of the model is displayed on stdout." 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "The following figures demonstrate one case where a linear model can perfectly separate the two classes while the other is not linearly separable (a model with a gaussian kernel is required in that case)." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "collapsed": false, 67 | "input": [ 68 | "%pylab inline" 69 | ], 70 | "language": "python", 71 | "metadata": {}, 72 | "outputs": [] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "collapsed": false, 77 | "input": [ 78 | "from figures.svm_gui_frames import plot_linear_model, plot_rbf_model\n", 79 | "plot_linear_model()" 80 | ], 81 | "language": "python", 82 | "metadata": {}, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "This figure shows a linear Support Vector Machine trained to perfectly separate two sets of data points labeled as white and black in a 2D space." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "collapsed": false, 95 | "input": [ 96 | "plot_rbf_model()" 97 | ], 98 | "language": "python", 99 | "metadata": {}, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "This shows a Support Vector Machine with gaussian kernel trained to separate\n", 107 | "2 sets of data points labeled as white and black in a 2D space. This dataset\n", 108 | "would not have been seperated by a simple linear model." 109 | ] 110 | }, 111 | { 112 | "cell_type": "heading", 113 | "level": 3, 114 | "metadata": {}, 115 | "source": [ 116 | "Exercise 1:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "Use the GUI to fit a model that is able to solve the *XOR problem*\n", 124 | "using the GUI: the XOR problem is composed of 4 samples:\n", 125 | "\n", 126 | "- 2 white samples in the top-left and bottom-right corners\n", 127 | "- 2 black samples in the bottom-left and top-right corners\n", 128 | "\n", 129 | "Question: is the XOR problem linearly separable?" 130 | ] 131 | }, 132 | { 133 | "cell_type": "heading", 134 | "level": 3, 135 | "metadata": {}, 136 | "source": [ 137 | "Exercise 2:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Use the GUI to construct a problem with less than 10 points\n", 145 | "where the predictive accuracy of the best linear model is 50%." 
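The two exercises are meant to be done in the GUI, but the XOR setup from Exercise 1 can also be sketched directly in code; the coordinates and labels below are a hypothetical stand-in for the clicked points:

    # Sketch of Exercise 1 in code: the four XOR points, fit with a linear
    # kernel and with a gaussian (RBF) kernel SVM.
    import numpy as np
    from sklearn.svm import SVC

    X = np.array([[0., 1.], [1., 0.],   # "white" points: top-left, bottom-right
                  [0., 0.], [1., 1.]])  # "black" points: bottom-left, top-right
    y = np.array([0, 0, 1, 1])

    for kernel in ['linear', 'rbf']:
        clf = SVC(kernel=kernel)
        clf.fit(X, y)
        print("%s kernel: training accuracy = %.2f" % (kernel, clf.score(X, y)))

Comparing the two training accuracies answers the question of whether the XOR problem is linearly separable.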
146 | ] 147 | }, 148 | { 149 | "cell_type": "heading", 150 | "level": 2, 151 | "metadata": {}, 152 | "source": [ 153 | "Notes:" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "- The higher the dimension of the feature space, the more likely the data is\n", 161 | " linearly separable." 162 | ] 163 | } 164 | ], 165 | "metadata": {} 166 | } 167 | ] 168 | } -------------------------------------------------------------------------------- /notebooks/09_validation_and_testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "09_validation_and_testing" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Measuring Classification Performance: Validation & Testing" 16 | ] 17 | }, 18 | { 19 | "cell_type": "heading", 20 | "level": 2, 21 | "metadata": {}, 22 | "source": [ 23 | "Checking Performance on the Iris Dataset" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Previously, we looked at a simplistic example of how to test the performance\n", 31 | "of a classifier. Using the iris data set, it looked something like this:" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "collapsed": false, 37 | "input": [ 38 | "# Get the data\n", 39 | "from sklearn.datasets import load_iris\n", 40 | "iris = load_iris()\n", 41 | "X = iris.data\n", 42 | "y = iris.target" 43 | ], 44 | "language": "python", 45 | "metadata": {}, 46 | "outputs": [] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [ 52 | "# Instantiate and train the classifier\n", 53 | "from sklearn.svm import LinearSVC\n", 54 | "clf = LinearSVC(loss = 'l2')\n", 55 | "clf.fit(X, y)" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "collapsed": false, 64 | "input": [ 65 | "# Check input vs. output labels\n", 66 | "y_pred = clf.predict(X)\n", 67 | "print (y_pred == y)" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "**Question:** what might be the problem with this approach?" 78 | ] 79 | }, 80 | { 81 | "cell_type": "heading", 82 | "level": 2, 83 | "metadata": {}, 84 | "source": [ 85 | "A Better Approach: Cross-Validation" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "Learning the parameters of a prediction function and testing it on the\n", 93 | "same data is a methodological mistake: a model that would just repeat\n", 94 | "the labels of the samples that it has just seen would have a perfect\n", 95 | "score but would fail to predict anything useful on yet-unseen data.\n", 96 | "\n", 97 | "To avoid over-fitting, we have to define two different sets:\n", 98 | "\n", 99 | "- a training set X_train, y_train which is used for learning the parameters of a predictive model\n", 100 | "- a testing set X_test, y_test which is used for evaluating the fitted predictive model\n", 101 | "\n", 102 | "In scikit-learn such a random split can be quickly computed with the\n", 103 | "`train_test_split` helper function. 
It can be used this way:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "collapsed": false, 109 | "input": [ 110 | "from sklearn import cross_validation\n", 111 | "X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.25, random_state=0)\n", 112 | "\n", 113 | "print X.shape, X_train.shape, X_test.shape" 114 | ], 115 | "language": "python", 116 | "metadata": {}, 117 | "outputs": [] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "Now we train on the training data, and test on the testing data:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "collapsed": false, 129 | "input": [ 130 | "clf = LinearSVC(loss='l2').fit(X_train, y_train)\n", 131 | "y_pred = clf.predict(X_test)\n", 132 | "print (y_pred == y_test)" 133 | ], 134 | "language": "python", 135 | "metadata": {}, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "There is an issue here, however:\n", 143 | "by defining these two sets, we drastically reduce the number\n", 144 | "of samples which can be used for learning the model, and the results\n", 145 | "can depend on a particular random choice for the pair of (train, test) sets.\n", 146 | "\n", 147 | "A solution is to split the whole data several consecutive times in different\n", 148 | "train set and test set, and to return the averaged value of the prediction\n", 149 | "scores obtained with the different sets. Such a procedure is called **cross-validation**.\n", 150 | "This approach can be computationally expensive, but does not waste too much data\n", 151 | "(as it is the case when fixing an arbitrary test set), which is a major advantage\n", 152 | "in problem such as inverse inference where the number of samples is very small.\n", 153 | "\n", 154 | "We'll explore cross-validation a bit later, but\n", 155 | "you can find much more information on cross-validation in scikit-learn here:\n", 156 | "http://scikit-learn.org/dev/modules/cross_validation.html\n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "heading", 161 | "level": 2, 162 | "metadata": {}, 163 | "source": [ 164 | "Diving Deeper: Hyperparameters, Over-fitting, and Under-fitting" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "*The content in this section is adapted from Andrew Ng's excellent\n", 172 | "Coursera course, available here:* https://www.coursera.org/course/ml" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "The issues associated with validation and \n", 180 | "cross-validation are some of the most important\n", 181 | "aspects of the practice of machine learning. Selecting the optimal model\n", 182 | "for your data is vital, and is a piece of the problem that is not often\n", 183 | "appreciated by machine learning practitioners.\n", 184 | "\n", 185 | "Of core importance is the following question:\n", 186 | "\n", 187 | "**If our estimator is underperforming, how should we move forward?**\n", 188 | "\n", 189 | "- Use simpler or more complicated model?\n", 190 | "- Add more features to each observed data point?\n", 191 | "- Add more training samples?\n", 192 | "\n", 193 | "The answer is often counter-intuitive. 
In particular, **Sometimes using a\n", 194 | "more complicated model will give _worse_ results.** Also, **Sometimes adding\n", 195 | "training data will not improve your results.** The ability to determine\n", 196 | "what steps will improve your model is what separates the successful machine\n", 197 | "learning practitioners from the unsuccessful." 198 | ] 199 | }, 200 | { 201 | "cell_type": "heading", 202 | "level": 3, 203 | "metadata": {}, 204 | "source": [ 205 | "A Simple Regression Problem" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "For this section, we'll work with a simple 1D regression problem. This will help us to\n", 213 | "easily visualize the data and the model, and the results generalize easily to higher-dimensional\n", 214 | "datasets. We'll explore **polynomial regression**: the fitting of a polynomial to points.\n", 215 | "Though this can be accomplished within scikit-learn (the machinery is in `sklearn.linear_model`),\n", 216 | "for simplicity we'll use `numpy.polyfit` and `numpy.polyval`:" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "collapsed": false, 222 | "input": [ 223 | "%pylab inline" 224 | ], 225 | "language": "python", 226 | "metadata": {}, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "collapsed": false, 232 | "input": [ 233 | "import numpy as np\n", 234 | "\n", 235 | "x = 10 * np.random.random(20)\n", 236 | "y = 0.5 * x ** 2 - x + 1\n", 237 | "\n", 238 | "p = np.polyfit(x, y, deg=2)\n", 239 | "print p" 240 | ], 241 | "language": "python", 242 | "metadata": {}, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "As we can see, polyfit fits a polynomial to one-dimensional data. We can\n", 250 | "visualize this to see the result:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "collapsed": false, 256 | "input": [ 257 | "x_new = np.linspace(-1, 12, 1000)\n", 258 | "y_new = np.polyval(p, x_new)\n", 259 | "\n", 260 | "plt.scatter(x, y)\n", 261 | "plt.plot(x_new, y_new)" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "We've chosen the model to use through the *hyperparameter* `deg`.\n", 272 | "\n", 273 | "A *hyperparameter* is a parameter that determines the type of\n", 274 | "model we use: for example, `deg=1` gives a linear model, `deg=2`\n", 275 | "gives a 2nd-order polynomial, etc." 276 | ] 277 | }, 278 | { 279 | "cell_type": "heading", 280 | "level": 3, 281 | "metadata": {}, 282 | "source": [ 283 | "Adding some noise" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "Now, what if the data is not a perfect polynomial? Below, we'll take the above\n", 291 | "problem and add a small\n", 292 | "amount of Gaussian scatter in ``y``. Here we'll take the additional step of computing\n", 293 | "the RMS error of the resulting model on the input data." 
294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "collapsed": false, 299 | "input": [ 300 | "np.random.seed(42)\n", 301 | "x = 10 * np.random.random(20)\n", 302 | "y = 0.5 * x ** 2 - x + 1 + np.random.normal(0, 2, x.shape)\n", 303 | "\n", 304 | "# ---> Change the degree here\n", 305 | "p = np.polyfit(x, y, deg=2)\n", 306 | "x_new = np.linspace(0, 10, 100)\n", 307 | "y_new = np.polyval(p, x_new)\n", 308 | "\n", 309 | "plt.scatter(x, y)\n", 310 | "plt.plot(x_new, y_new)\n", 311 | "plt.ylim(-10, 50)\n", 312 | "print \"RMS error = %.4g\" % np.sqrt(np.mean((y - np.polyval(p, x)) ** 2))" 313 | ], 314 | "language": "python", 315 | "metadata": {}, 316 | "outputs": [] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "**What happens to the fit and the RMS error as the degree is increased?**" 323 | ] 324 | }, 325 | { 326 | "cell_type": "heading", 327 | "level": 2, 328 | "metadata": {}, 329 | "source": [ 330 | "Learning Curves and the Bias/Variance Tradeoff" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "One way to address this issue is to use what are often called **Learning Curves**.\n", 338 | "Given a particular dataset and a model we'd like to fit (e.g. a polynomial), we'd\n", 339 | "like to tune our value of the *hyperparameter* `d` to give us the best fit.\n", 340 | "\n", 341 | "We'll imagine we have a simple regression problem: given the size of a house, we'd\n", 342 | "like to predict how much it's worth. We'll fit it with our polynomial regression\n", 343 | "model.\n", 344 | "\n", 345 | "Run the following code to see an example plot:" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "collapsed": false, 351 | "input": [ 352 | "from figures import plot_bias_variance\n", 353 | "plot_bias_variance(8)" 354 | ], 355 | "language": "python", 356 | "metadata": {}, 357 | "outputs": [] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "In the above figure, we see fits for three different values of `d`.\n", 364 | "For `d = 1`, the data is under-fit. This means that the model is too\n", 365 | "simplistic: no straight line will ever be a good fit to this data. In\n", 366 | "this case, we say that the model suffers from high bias. The model\n", 367 | "itself is biased, and this will be reflected in the fact that the data\n", 368 | "is poorly fit. At the other extreme, for `d = 6` the data is over-fit.\n", 369 | "This means that the model has too many free parameters (6 in this case)\n", 370 | "which can be adjusted to perfectly fit the training data. If we add a\n", 371 | "new point to this plot, though, chances are it will be very far from\n", 372 | "the curve representing the degree-6 fit. In this case, we say that the\n", 373 | "model suffers from high variance. The reason for this label is that if\n", 374 | "any of the input points are varied slightly, it could result in an\n", 375 | "extremely different model.\n", 376 | "\n", 377 | "In the middle, for `d = 2`, we have found a good mid-point. It fits\n", 378 | "the data fairly well, and does not suffer from the bias and variance\n", 379 | "problems seen in the figures on either side. What we would like is a\n", 380 | "way to quantitatively identify bias and variance, and optimize the\n", 381 | "metaparameters (in this case, the polynomial degree d) in order to\n", 382 | "determine the best algorithm. This can be done through a process\n", 383 | "called cross-validation." 
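For scikit-learn estimators this whole procedure is automated by ``cross_val_score``; below is a minimal sketch on the Boston housing data from earlier (the polynomial example that follows builds the same idea by hand so that every step is visible):

    # Minimal sketch: k-fold cross-validation with a scikit-learn estimator.
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.cross_validation import cross_val_score

    data = load_boston()
    scores = cross_val_score(LinearRegression(), data.data, data.target, cv=5)
    print(scores)         # one score per fold
    print(scores.mean())  # averaged cross-validation score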
384 | ] 385 | }, 386 | { 387 | "cell_type": "heading", 388 | "level": 3, 389 | "metadata": {}, 390 | "source": [ 391 | "Validation Curves" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "We'll create a dataset like in the example above, and use this to test our\n", 399 | "validation scheme. First we'll define some utility routines:" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "collapsed": false, 405 | "input": [ 406 | "def test_func(x, err=0.5):\n", 407 | " return np.random.normal(10 - 1. / (x + 0.1), err)\n", 408 | "\n", 409 | "def compute_error(x, y, p):\n", 410 | " yfit = np.polyval(p, x)\n", 411 | " return np.sqrt(np.mean((y - yfit) ** 2))" 412 | ], 413 | "language": "python", 414 | "metadata": {}, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "collapsed": false, 420 | "input": [ 421 | "from sklearn.cross_validation import train_test_split\n", 422 | "\n", 423 | "N = 200\n", 424 | "f_crossval = 0.5\n", 425 | "error = 1.0\n", 426 | "\n", 427 | "# randomly sample the data\n", 428 | "np.random.seed(1)\n", 429 | "x = np.random.random(N)\n", 430 | "y = test_func(x, error)\n", 431 | "\n", 432 | "# split into training, validation, and testing sets.\n", 433 | "xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=f_crossval)\n", 434 | "\n", 435 | "# show the training and cross-validation sets\n", 436 | "plt.scatter(xtrain, ytrain, color='red')\n", 437 | "plt.scatter(xtest, ytest, color='blue')" 438 | ], 439 | "language": "python", 440 | "metadata": {}, 441 | "outputs": [] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "In order to quantify the effects of bias and variance and construct\n", 448 | "the best possible estimator, we will split our training data into\n", 449 | "a *training set* and a *validation set*. As a general rule, the\n", 450 | "training set should be about 60% of the samples.\n", 451 | "\n", 452 | "The general idea is as follows. The model parameters (in our case,\n", 453 | "the coefficients of the polynomials) are learned using the training\n", 454 | "set as above. The error is evaluated on the cross-validation set,\n", 455 | "and the meta-parameters (in our case, the degree of the polynomial)\n", 456 | "are adjusted so that this cross-validation error is minimized.\n", 457 | "Finally, the labels are predicted for the test set. These labels\n", 458 | "are used to evaluate how well the algorithm can be expected to\n", 459 | "perform on unlabeled data.\n", 460 | "\n", 461 | "The cross-validation error of our polynomial classifier can be visualized\n", 462 | "by plotting the error as a function of the polynomial degree d. 
We can do\n", 463 | "this as follows:" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "collapsed": false, 469 | "input": [ 470 | "# suppress warnings from Polyfit\n", 471 | "import warnings\n", 472 | "warnings.filterwarnings('ignore', message='Polyfit*')\n", 473 | "\n", 474 | "degrees = np.arange(21)\n", 475 | "train_err = np.zeros(len(degrees))\n", 476 | "validation_err = np.zeros(len(degrees))\n", 477 | "\n", 478 | "for i, d in enumerate(degrees):\n", 479 | " p = np.polyfit(xtrain, ytrain, d)\n", 480 | "\n", 481 | " train_err[i] = compute_error(xtrain, ytrain, p)\n", 482 | " validation_err[i] = compute_error(xtest, ytest, p)\n", 483 | "\n", 484 | "fig, ax = plt.subplots()\n", 485 | "\n", 486 | "ax.plot(degrees, validation_err, lw=2, label = 'cross-validation error')\n", 487 | "ax.plot(degrees, train_err, lw=2, label = 'training error')\n", 488 | "ax.plot([0, 20], [error, error], '--k', label='intrinsic error')\n", 489 | "\n", 490 | "ax.legend(loc=0)\n", 491 | "ax.set_xlabel('degree of fit')\n", 492 | "ax.set_ylabel('rms error')" 493 | ], 494 | "language": "python", 495 | "metadata": {}, 496 | "outputs": [] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "This figure compactly shows the reason that cross-validation is\n", 503 | "important. On the left side of the plot, we have a very low-degree\n", 504 | "polynomial, which under-fits the data. This leads to a very high\n", 505 | "error for both the training set and the cross-validation set. On\n", 506 | "the far right side of the plot, we have a very high degree\n", 507 | "polynomial, which over-fits the data. This can be seen in the fact\n", 508 | "that the training error is very low, while the cross-validation\n", 509 | "error is very high. Plotted for comparison is the intrinsic error\n", 510 | "(this is the scatter artificially added to the data).\n", 511 | "For this toy dataset,\n", 512 | "error = 1.0 is the best we can hope to attain. Choosing `d=6` in\n", 513 | "this case gets us very close to the optimal error.\n", 514 | "\n", 515 | "The astute reader will realize that something is amiss here: in\n", 516 | "the above plot, `d = 6` gives the best results. But in the previous\n", 517 | "plot, we found that `d = 6` vastly over-fits the data. What\u2019s going\n", 518 | "on here? The difference is the **number of training points** used.\n", 519 | "In the previous example, there were only eight training points.\n", 520 | "In this example, we have 100. As a general rule of thumb, the more\n", 521 | "training points used, the more complicated a model can be used.\n", 522 | "But how can you determine for a given model whether more training\n", 523 | " points will be helpful? A useful diagnostic for this is the learning curve." 524 | ] 525 | }, 526 | { 527 | "cell_type": "heading", 528 | "level": 3, 529 | "metadata": {}, 530 | "source": [ 531 | "Learning Curves" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "A learning curve is a plot of the training and cross-validation\n", 539 | "error as a function of the number of training points. Note that\n", 540 | "when we train on a small subset of the training data, the training\n", 541 | "error is computed using this subset, not the full training set.\n", 542 | "These plots can give a quantitative view into how beneficial it\n", 543 | "will be to add training samples." 
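Before moving on, note that the best degree from the validation-curve cell above can also be read off programmatically rather than by eye. A minimal sketch, assuming the `degrees`, `train_err`, and `validation_err` arrays from that cell are still defined:

    i_best = np.argmin(validation_err)
    print "degree with lowest cross-validation error: %i" % degrees[i_best]
    print "training error there:   %.3f" % train_err[i_best]
    print "validation error there: %.3f" % validation_err[i_best]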
544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "collapsed": false, 549 | "input": [ 550 | "# suppress warnings from Polyfit\n", 551 | "import warnings\n", 552 | "warnings.filterwarnings('ignore', message='Polyfit*')\n", 553 | "\n", 554 | "def plot_learning_curve(d):\n", 555 | " sizes = np.linspace(2, N, 50).astype(int)\n", 556 | " train_err = np.zeros(sizes.shape)\n", 557 | " crossval_err = np.zeros(sizes.shape)\n", 558 | "\n", 559 | " for i, size in enumerate(sizes):\n", 560 | " # Train on only the first `size` points\n", 561 | " p = np.polyfit(xtrain[:size], ytrain[:size], d)\n", 562 | " \n", 563 | " # Validation error is on the *entire* validation set\n", 564 | " crossval_err[i] = compute_error(xtest, ytest, p)\n", 565 | " \n", 566 | " # Training error is on only the points used for training\n", 567 | " train_err[i] = compute_error(xtrain[:size], ytrain[:size], p)\n", 568 | "\n", 569 | " fig, ax = plt.subplots()\n", 570 | " ax.plot(sizes, crossval_err, lw=2, label='validation error')\n", 571 | " ax.plot(sizes, train_err, lw=2, label='training error')\n", 572 | " ax.plot([0, N], [error, error], '--k', label='intrinsic error')\n", 573 | "\n", 574 | " ax.set_xlabel('training set size')\n", 575 | " ax.set_ylabel('rms error')\n", 576 | " \n", 577 | " ax.legend(loc=0)\n", 578 | " \n", 579 | " ax.set_xlim(0, 99)\n", 580 | "\n", 581 | " ax.set_title('d = %i' % d)" 582 | ], 583 | "language": "python", 584 | "metadata": {}, 585 | "outputs": [] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": {}, 590 | "source": [ 591 | "Now that we've defined this function, let's plot an example learning curve:" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "collapsed": false, 597 | "input": [ 598 | "plot_learning_curve(d=1)" 599 | ], 600 | "language": "python", 601 | "metadata": {}, 602 | "outputs": [] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "Here we show the learning curve for `d = 1`. From the above\n", 609 | "discussion, we know that `d = 1` is a high-bias estimator which\n", 610 | "under-fits the data. This is indicated by the fact that both the\n", 611 | "training and validation errors are very high. If this is\n", 612 | "the case, adding more training data will not help matters: both\n", 613 | "lines have converged to a relatively high error.\n", 614 | "\n", 615 | "**When the learning curves have converged, we need a more sophisticated\n", 616 | "model or more features to improve the error.**\n", 617 | "\n", 618 | "*(equivalently we can decrease regularization, which we won't discuss in this tutorial)*" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "collapsed": false, 624 | "input": [ 625 | "plot_learning_curve(d=20)\n", 626 | "plt.ylim(0, 15)" 627 | ], 628 | "language": "python", 629 | "metadata": {}, 630 | "outputs": [] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": {}, 635 | "source": [ 636 | "Here we show the learning curve for `d = 20`. From the above\n", 637 | "discussion, we know that `d = 20` is a high-variance estimator\n", 638 | "which over-fits the data. This is indicated by the fact that the\n", 639 | "training error is much less than the validation error. As\n", 640 | "we add more samples to this training set, the training error will\n", 641 | "continue to climb, while the cross-validation error will continue\n", 642 | "to decrease, until they meet in the middle. 
In this case, our\n", 643 | "intrinsic error was set to 1.0, and we can infer that adding more\n", 644 | "data will allow the estimator to very closely match the best\n", 645 | "possible cross-validation error.\n", 646 | "\n", 647 | "**When the learning curves have not converged, it indicates that the\n", 648 | "model is too complicated for the amount of data we have. We should\n", 649 | "either find more training data, or use a simpler model.**\n", 650 | "\n", 651 | "*(equivalently we can increase __regularization__, which we won't discuss in this tutorial)*" 652 | ] 653 | }, 654 | { 655 | "cell_type": "heading", 656 | "level": 2, 657 | "metadata": {}, 658 | "source": [ 659 | "Summary" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": {}, 665 | "source": [ 666 | "We\u2019ve seen above that an under-performing algorithm can be due\n", 667 | "to two possible situations: high bias (under-fitting) and high\n", 668 | "variance (over-fitting). In order to evaluate our algorithm, we\n", 669 | "set aside a portion of our training data for cross-validation.\n", 670 | "Using the technique of learning curves, we can train on progressively\n", 671 | "larger subsets of the data, evaluating the training error and\n", 672 | "cross-validation error to determine whether our algorithm has\n", 673 | "high variance or high bias. But what do we do with this information?" 674 | ] 675 | }, 676 | { 677 | "cell_type": "heading", 678 | "level": 3, 679 | "metadata": {}, 680 | "source": [ 681 | "High Bias" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "If our algorithm shows high **bias**, the following actions might help:\n", 689 | "\n", 690 | "- **Add more features**. In our example of predicting home prices,\n", 691 | " it may be helpful to make use of information such as the neighborhood\n", 692 | " the house is in, the year the house was built, the size of the lot, etc.\n", 693 | " Adding these features to the training and test sets can improve\n", 694 | " a high-bias estimator\n", 695 | "- **Use a more sophisticated model**. Adding complexity to the model can\n", 696 | " help improve on bias. For a polynomial fit, this can be accomplished\n", 697 | " by increasing the degree d. Each learning technique has its own\n", 698 | " methods of adding complexity.\n", 699 | "- **Use fewer samples**. Though this will not improve the classification,\n", 700 | " a high-bias algorithm can attain nearly the same error with a smaller\n", 701 | " training sample. For algorithms which are computationally expensive,\n", 702 | " reducing the training sample size can lead to very large improvements\n", 703 | " in speed.\n", 704 | "- **Decrease regularization**. Regularization is a technique used to impose\n", 705 | " simplicity in some machine learning models, by adding a penalty term that\n", 706 | " depends on the characteristics of the parameters. If a model has high bias,\n", 707 | " decreasing the effect of regularization can lead to better results." 708 | ] 709 | }, 710 | { 711 | "cell_type": "heading", 712 | "level": 3, 713 | "metadata": {}, 714 | "source": [ 715 | "High Variance" 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "If our algorithm shows **high variance**, the following actions might help:\n", 723 | "\n", 724 | "- **Use fewer features**. 
Using a feature selection technique may be\n", 725 | " useful, and decrease the over-fitting of the estimator.\n", 726 | "- **Use more training samples**. Adding training samples can reduce\n", 727 | " the effect of over-fitting, and lead to improvements in a high\n", 728 | " variance estimator.\n", 729 | "- **Increase Regularization**. Regularization is designed to prevent\n", 730 | " over-fitting. In a high-variance model, increasing regularization\n", 731 | " can lead to better results.\n", 732 | "\n", 733 | "These choices become very important in real-world situations. For example,\n", 734 | "due to limited telescope time, astronomers must seek a balance between\n", 735 | "observing a large number of objects, and observing a large number of\n", 736 | "features for each object. Determining which is more important for a\n", 737 | "particular learning task can inform the observing strategy that the\n", 738 | "astronomer employs. In a later exercise, we will explore the use of\n", 739 | "learning curves for the photometric redshift problem." 740 | ] 741 | }, 742 | { 743 | "cell_type": "heading", 744 | "level": 3, 745 | "metadata": {}, 746 | "source": [ 747 | "More Sophisticated Methods" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "There are a lot more options for performing validation and model testing.\n", 755 | "In particular, there are several schemes for cross-validation, in which\n", 756 | "the model is fit multiple times with different training and test sets.\n", 757 | "The details are different, but the principles are the same as what we've\n", 758 | "seen here.\n", 759 | "\n", 760 | "For more information see the ``sklearn.cross_validation`` module documentation,\n", 761 | "and the information on the scikit-learn website." 762 | ] 763 | }, 764 | { 765 | "cell_type": "heading", 766 | "level": 2, 767 | "metadata": {}, 768 | "source": [ 769 | "One Last Caution" 770 | ] 771 | }, 772 | { 773 | "cell_type": "markdown", 774 | "metadata": {}, 775 | "source": [ 776 | "Using validation schemes to determine hyper-parameters means that we are\n", 777 | "fitting the hyper-parameters to the particular validation set. In the same\n", 778 | "way that parameters can be over-fit to the training set, hyperparameters can\n", 779 | "be over-fit to the validation set. Because of this, the validation error\n", 780 | "tends to under-predict the classification error of new data.\n", 781 | "\n", 782 | "For this reason, it is recommended to split the data into three sets:\n", 783 | "\n", 784 | "- The **training set**, used to train the model (usually ~60% of the data)\n", 785 | "- The **validation set**, used to validate the model (usually ~20% of the data)\n", 786 | "- The **test set**, used to evaluate the expected error of the validated model (usually ~20% of the data)\n", 787 | "\n", 788 | "This may seem excessive, and many machine learning practitioners ignore the need\n", 789 | "for a test set. But if your goal is to predict the error of a model on unknown\n", 790 | "data, using a test set is vital." 
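As a concrete illustration of the three-way split described above, `train_test_split` can simply be called twice. This is a sketch only (the 60/20/20 fractions are a convention, not a requirement), using the same `sklearn.cross_validation` interface as the rest of this notebook and random data standing in for a real problem:

    import numpy as np
    from sklearn.cross_validation import train_test_split

    np.random.seed(0)
    X = np.random.random((1000, 5))   # placeholder features
    y = np.random.random(1000)        # placeholder targets

    # first hold out 20% of the data as the final test set
    X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2)

    # then split the remainder into 60% train / 20% validation (of the original data)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25)

    print len(y_train), len(y_val), len(y_test)   # roughly 600, 200, 200

Hyperparameters are tuned by comparing models on the validation set; the test set is touched exactly once, at the very end.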
791 | ] 792 | } 793 | ], 794 | "metadata": {} 795 | } 796 | ] 797 | } -------------------------------------------------------------------------------- /notebooks/10_digits_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "10_digits_classification" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Classification of Handwritten Digits" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "In this section we'll apply scikit-learn to the classification of handwritten\n", 23 | "digits. This will go a bit beyond the iris classification we saw before: we'll\n", 24 | "discuss some of the metrics which can be used in evaluating the effectiveness\n", 25 | "of a classification model, see an example of K-fold cross-validation, and\n", 26 | "present a more involved exercise.\n", 27 | "\n", 28 | "We'll work with the handwritten digits dataset which we saw in an earlier\n", 29 | "section of the tutorial." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "collapsed": false, 35 | "input": [ 36 | "from sklearn.datasets import load_digits\n", 37 | "digits = load_digits()" 38 | ], 39 | "language": "python", 40 | "metadata": {}, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "We'll re-use some of our code from before to visualize the data and remind us what\n", 48 | "we're looking at:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "%pylab inline" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "collapsed": false, 64 | "input": [ 65 | "# copied from notebook 02_sklearn_data.ipynb\n", 66 | "fig = plt.figure(figsize=(6, 6)) # figure size in inches\n", 67 | "fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)\n", 68 | "\n", 69 | "# plot the digits: each image is 8x8 pixels\n", 70 | "for i in range(64):\n", 71 | " ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])\n", 72 | " ax.imshow(digits.images[i], cmap=plt.cm.binary)\n", 73 | " \n", 74 | " # label the image with the target value\n", 75 | " ax.text(0, 7, str(digits.target[i]))" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "heading", 83 | "level": 2, 84 | "metadata": {}, 85 | "source": [ 86 | "Visualizing the Data" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "A good first-step for many problems is to visualize the data using one of the\n", 94 | "*Dimensionality Reduction* techniques we saw earlier. We'll start with the\n", 95 | "most straightforward one, Principal Component Analysis (PCA).\n", 96 | "\n", 97 | "PCA seeks orthogonal linear combinations of the features which show the greatest\n", 98 | "variance, and as such, can help give you a good idea of the structure of the\n", 99 | "data set. Here we'll use `RandomizedPCA`, because it's faster for large `N`." 
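A quick way to gauge how much of the structure survives a two-component projection is to look at the explained variance of the components. Here is a short sketch using the ordinary `PCA` estimator (shown for illustration; the exact numbers will depend on your scikit-learn version):

    from sklearn.decomposition import PCA
    from sklearn.datasets import load_digits

    digits = load_digits()
    pca = PCA(n_components=2).fit(digits.data)

    print pca.explained_variance_ratio_         # fraction of the variance in each component
    print pca.explained_variance_ratio_.sum()   # total variance captured by the 2D projection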
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "from sklearn.decomposition import RandomizedPCA\n", 107 | "pca = RandomizedPCA(n_components=2)\n", 108 | "proj = pca.fit_transform(digits.data)" 109 | ], 110 | "language": "python", 111 | "metadata": {}, 112 | "outputs": [] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "collapsed": false, 117 | "input": [ 118 | "plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)\n", 119 | "plt.colorbar()" 120 | ], 121 | "language": "python", 122 | "metadata": {}, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Here we see that the digits do cluster fairly well, so we can expect even\n", 130 | "a fairly naive classification scheme to do a decent job separating them.\n", 131 | "\n", 132 | "A weakness of PCA is that it produces a linear dimensionality reduction:\n", 133 | "this may miss some interesting relationships in the data. If we want to\n", 134 | "see a nonlinear mapping of the data, we can use one of the several\n", 135 | "methods in the `manifold` module. Here we'll use Isomap (a concatenation\n", 136 | "of Isometric Mapping) which is a manifold learning method based on\n", 137 | "graph theory:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "collapsed": false, 143 | "input": [ 144 | "from sklearn.manifold import Isomap\n", 145 | "iso = Isomap(n_neighbors=5, n_components=2)\n", 146 | "proj = iso.fit_transform(digits.data)" 147 | ], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "collapsed": false, 155 | "input": [ 156 | "plt.scatter(proj[:, 0], proj[:, 1], c=digits.target)\n", 157 | "plt.colorbar()" 158 | ], 159 | "language": "python", 160 | "metadata": {}, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "It can be fun to explore the various manifold learning methods available,\n", 168 | "and how the output depends on the various parameters used to tune the\n", 169 | "projection.\n", 170 | "In any case, these visualizations show us that there is hope: even a simple\n", 171 | "classifier should be able to adequately identify the members of the various\n", 172 | "classes." 173 | ] 174 | }, 175 | { 176 | "cell_type": "heading", 177 | "level": 2, 178 | "metadata": {}, 179 | "source": [ 180 | "Gaussian Naive Bayes Classification" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "For most classification problems, it's nice to have a simple, fast, go-to\n", 188 | "method to provide a quick baseline classification. If the simple and fast\n", 189 | "method is sufficient, then we don't have to waste CPU cycles on more complex\n", 190 | "models. If not, we can use the results of the simple method to give us\n", 191 | "clues about our data.\n", 192 | "\n", 193 | "One good method to keep in mind is Gaussian Naive Bayes. It is a *generative*\n", 194 | "classifier which fits an axis-aligned multi-dimensional Gaussian distribution to\n", 195 | "each training label, and uses this to quickly give a rough classification. It\n", 196 | "is generally not sufficiently accurate for real-world data, but (especially in\n", 197 | "high dimensions) can perform surprisingly well." 
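To make the phrase "fits an axis-aligned multi-dimensional Gaussian distribution to each training label" concrete, here is a rough NumPy-only sketch of the idea. It ignores class priors and the numerical safeguards a real implementation needs, so treat it as an illustration of the model rather than a replacement for `GaussianNB`:

    import numpy as np

    def gaussian_nb_fit(X, y):
        """Estimate a per-class mean and variance for every feature."""
        classes = np.unique(y)
        means = np.array([X[y == c].mean(axis=0) for c in classes])
        variances = np.array([X[y == c].var(axis=0) + 1e-9 for c in classes])
        return classes, means, variances

    def gaussian_nb_predict(X, classes, means, variances):
        """Assign each sample to the class whose Gaussian gives the highest log-likelihood."""
        log_like = np.array([-0.5 * np.sum(np.log(2 * np.pi * var)
                                           + (X - mu) ** 2 / var, axis=1)
                             for mu, var in zip(means, variances)])
        return classes[np.argmax(log_like, axis=0)]

Fitting this on the digits training split and comparing its predictions with those of `GaussianNB` should give very similar answers, since the digits classes are roughly balanced and the scikit-learn version adds little beyond class priors and more careful variance handling.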
198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "collapsed": false, 203 | "input": [ 204 | "from sklearn.naive_bayes import GaussianNB\n", 205 | "from sklearn import cross_validation" 206 | ], 207 | "language": "python", 208 | "metadata": {}, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "collapsed": false, 214 | "input": [ 215 | "# split the data into training and validation sets\n", 216 | "data_train, data_test, target_train, target_test = cross_validation.train_test_split(digits.data, digits.target)\n", 217 | "\n", 218 | "# train the model\n", 219 | "clf = GaussianNB()\n", 220 | "clf.fit(data_train, target_train)\n", 221 | "\n", 222 | "# predict the labels of the test data\n", 223 | "predicted = clf.predict(data_test)\n", 224 | "expected = target_test" 225 | ], 226 | "language": "python", 227 | "metadata": {}, 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Previously we've done something like\n", 235 | "\n", 236 | " print (predicted == expected)\n", 237 | "\n", 238 | "as a rough evaluation of our model. Here we'll do something more sophisticated:\n", 239 | "scikit-learn includes a ``metrics`` module which contains several metrics for\n", 240 | "evaluating classifiers like this. One of the most useful combines several of\n", 241 | "the metrics and prints a table showing the results:" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "collapsed": false, 247 | "input": [ 248 | "from sklearn import metrics\n", 249 | "print metrics.classification_report(expected, predicted)" 250 | ], 251 | "language": "python", 252 | "metadata": {}, 253 | "outputs": [] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "Another enlightening metric for this sort of task is a\n", 260 | "*confusion matrix*: it helps us visualize which labels are\n", 261 | "being interchanged in the classification errors:" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "collapsed": false, 267 | "input": [ 268 | "print metrics.confusion_matrix(expected, predicted)" 269 | ], 270 | "language": "python", 271 | "metadata": {}, 272 | "outputs": [] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "We see here that in particular, a lot of twos are being mistakenly labeled eights.\n", 279 | "\n", 280 | "Previously we mentioned cross-validation. Here, rather than having a single training\n", 281 | "and test set, we divide the data into `K` subsets, and perform `K` different classifications,\n", 282 | "each time training on `K - 1` of the subsets and validating on the one left out.\n", 283 | "\n", 284 | "The tools to accomplish this are also in the `cross_validation` submodule." 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "collapsed": false, 290 | "input": [ 291 | "cv = cross_validation.KFold(digits.data.shape[0], 5, shuffle=True, random_state=0)\n", 292 | "\n", 293 | "clf = GaussianNB()\n", 294 | "\n", 295 | "print cross_validation.cross_val_score(clf, digits.data, digits.target, cv=cv)" 296 | ], 297 | "language": "python", 298 | "metadata": {}, 299 | "outputs": [] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "These metrics show us that the simplistic Gaussian Naive Bayes classifier is giving us correct\n", 306 | "classifications of about 4 out of 5 digits. 
This is probably not sufficient: imagine the chaos\n", 307 | "at the post office if their zipcode scanning software misread one out of five digits!\n", 308 | "\n", 309 | "We can do better... but how?" 310 | ] 311 | }, 312 | { 313 | "cell_type": "heading", 314 | "level": 2, 315 | "metadata": {}, 316 | "source": [ 317 | "Exercise" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "Here we come to the first major exercise of the tutorial, which will take some thought and\n", 325 | "creativity. It will combine and extend several of the ideas we've worked through today.\n", 326 | "The question is this:\n", 327 | "\n", 328 | "- **Given the digits data and the Gaussian Naive Bayes classifier, would it be better for us to\n", 329 | "invest effort in gathering more training samples, or seeking a more sophisticated\n", 330 | "model?**\n", 331 | "\n", 332 | "- **What is the best possible classification score you could expect from Gaussian Naive Bayes?**\n", 333 | "\n", 334 | "Some things to keep in mind:\n", 335 | "\n", 336 | "- we can use the ideas of *learning curves* to answer this question\n", 337 | "- previously, we used learning curves with RMS error in a regression model.\n", 338 | " Here, we'll have to decide on some suitable metric for classification.\n", 339 | "- We can compute the learning curves on classification **loss** (i.e. larger loss is worse)\n", 340 | " or classification **score** (i.e. larger score is better).\n", 341 | "- Classification is complicated by the fact that we generally have one score per class.\n", 342 | " Whether you use the (weighted) average of these, or take the min or max, depends on your\n", 343 | " ultimate goal.\n", 344 | "\n", 345 | "Once you're finished with this, you may wish to try out some other classifiers on the data\n", 346 | "and see how the results compare. Another extremely simple and fast classifier is\n", 347 | "``sklearn.tree.DecisionTreeClassifier``. Several more sophisticated (and therefore\n", 348 | "slower) classifiers can be found in the Support Vector Machines module, ``sklearn.svm``." 349 | ] 350 | } 351 | ], 352 | "metadata": {} 353 | } 354 | ] 355 | } -------------------------------------------------------------------------------- /notebooks/11_photoz_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "11_photoz_regression" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Regression Example: Photometric Redshifts" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Here we will explore a regression problem important in the field of astronomy:\n", 23 | "Data in Astronomy is most often in one of two forms: spectral data, and photometric\n", 24 | "data. Spectra are high-resolution measurements of the energy of a source as a\n", 25 | "function of wavelength. 
Photometry, usually measured in a logarithmic scale\n", 26 | "called *magnitudes*, can be thought of as the integral of the spectrum through\n", 27 | "a broad filter.\n", 28 | "\n", 29 | "Run the following code to see an example of a stellar spectrum (in particular,\n", 30 | "the spectrum of the star Vega) along with the five filters used in the\n", 31 | "Sloan Digital Sky Survey:" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "collapsed": false, 37 | "input": [ 38 | "%pylab inline" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "collapsed": false, 47 | "input": [ 48 | "from figures import plot_sdss_filters\n", 49 | "plot_sdss_filters()" 50 | ], 51 | "language": "python", 52 | "metadata": {}, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "heading", 57 | "level": 2, 58 | "metadata": {}, 59 | "source": [ 60 | "Photometric Redshifts" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "One interesting regression problem which appears often in Astronomy is the photometric\n", 68 | "determination of the galaxy redshift.\n", 69 | "\n", 70 | "In the current standard cosmological model, the universe began nearly 14 billion years ago,\n", 71 | "in an explosive event commonly known as the Big Bang. Since then, the very fabric\n", 72 | "of space has been expanding, so that distant galaxies appear to be moving away from\n", 73 | "us at very high speeds. The uniformity of this expansion means that there is a\n", 74 | "relationship between the distance to a galaxy, and the speed that it appears to be\n", 75 | "receeding from us (this relationship is known as Hubble\u2019s Law, named after Edwin Hubble).\n", 76 | "This recession speed leads to a shift in the frequency of photons, very similar to the\n", 77 | "more familiar doppler shift that causes the pitch of a siren to change as an emergency\n", 78 | "vehicle passes by. If a galaxy or star were moving toward us, its light would be shifted\n", 79 | "to higher frequencies, or blue-shifted. Because the universe is expanding away from us,\n", 80 | "distant galaxies appear to be red-shifted: their photons are shifted to lower frequencies.\n", 81 | "\n", 82 | "In cosmology, the redshift is measured with the parameter $z$, defined in terms of the\n", 83 | "observed wavelength $\\lambda_{obs}$ and the emitted wavelength $\\lambda_{em}$:\n", 84 | "\n", 85 | "$\\lambda_{obs} = (1 + z)\\lambda_{em}$\n", 86 | "\n", 87 | "When a spectrum can be obtained, determining the redshift is rather straight-forward:\n", 88 | "if you can localize the spectral fingerprint of a common element, such as hydrogen,\n", 89 | "then the redshift can be computed using simple arithmetic. But the task becomes much\n", 90 | "more difficult when only photometric observations are available.\n", 91 | "\n", 92 | "Because of the spectrum shift, an identical source at different redshifts will have a\n", 93 | "different color through each pair of filters. See the following figure:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "collapsed": false, 99 | "input": [ 100 | "from figures import plot_redshifts\n", 101 | "plot_redshifts()" 102 | ], 103 | "language": "python", 104 | "metadata": {}, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "This again shows the spectrum of the star Vega ($\\alpha$-Lyr), but\n", 112 | "at three different redshifts. 
The SDSS ugriz filters are shown in gray for reference.\n", 113 | "\n", 114 | "At redshift z=0.0, the spectrum is bright in the u and g filters,\n", 115 | "but dim in the i and z filters. At redshift z=0.8, the opposite\n", 116 | "is the case. This suggests the possibility of determining redshift\n", 117 | "from photometry alone. The situation is complicated by the fact that\n", 118 | "each individual source has unique spectral characteristics, but\n", 119 | "nevertheless, these photometric redshifts are often used in astronomical applications." 120 | ] 121 | }, 122 | { 123 | "cell_type": "heading", 124 | "level": 2, 125 | "metadata": {}, 126 | "source": [ 127 | "Motivation: Dark Energy, Dark Matter, and the Fate of the Universe" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "The photometric redshift problem is very important. Future astronomical\n", 135 | "surveys hope to image trillions of very faint galaxies, and use this data \n", 136 | "to inform our view of the universe as a whole: its history, its geometry, \n", 137 | "and its fate. Obtaining an accurate estimate of the redshift to each of \n", 138 | "these galaxies is a pivotal part of this task. Because these surveys will \n", 139 | "image so many extremely faint galaxies, there is no possibility of obtaining \n", 140 | "a spectrum for each one. Thus sophisticated photometric redshift codes will \n", 141 | "be required to advance our understanding of the Universe, including more \n", 142 | "precisely understanding the nature of the dark energy that is currently \n", 143 | "accelerating the cosmic expansion." 144 | ] 145 | }, 146 | { 147 | "cell_type": "heading", 148 | "level": 2, 149 | "metadata": {}, 150 | "source": [ 151 | "Decision Tree Regression" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Here we will address the photometric redshift problem using 50,000 observations\n", 159 | "from the Sloan Digital Sky Survey. This example draws from examples available\n", 160 | "through the ``astroML`` python package, which can be found here:\n", 161 | "\n", 162 | "- http://astroml.github.com/book_figures/chapter9/fig_photoz_tree.html\n", 163 | "- http://astroml.github.com/book_figures/chapter9/fig_photoz_forest.html\n", 164 | "\n", 165 | "We'll start by downloading the data. This fetch function actually generates\n", 166 | "an SQL query and downloads the data from the SDSS database. The results\n", 167 | "will be cached locally so that subsequent calls to the function don't result\n", 168 | "in another download." 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "collapsed": false, 174 | "input": [ 175 | "import numpy as np\n", 176 | "from datasets import fetch_sdss_galaxy_mags" 177 | ], 178 | "language": "python", 179 | "metadata": {}, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "collapsed": false, 185 | "input": [ 186 | "# This will download a ~3MB file the first time you call the function\n", 187 | "data = fetch_sdss_galaxy_mags()\n", 188 | "\n", 189 | "print data.shape\n", 190 | "print data.dtype" 191 | ], 192 | "language": "python", 193 | "metadata": {}, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Next we'll extract the data. Because the relative magnitudes are easier to\n", 201 | "calibrate than the absolute magnitude, we'll work with what astronomers call\n", 202 | "*colors*, the difference of two magnitudes. 
Because the magnitudes are related\n", 203 | "to the logarithm of the flux, the colors can be thought of as normalized magnitudes." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "collapsed": false, 209 | "input": [ 210 | "redshift = data['redshift']\n", 211 | "mags = np.vstack([data[f] for f in 'ugriz']).transpose()\n", 212 | "colors = mags[:, :-1] - mags[:, 1:]\n", 213 | "print colors.shape" 214 | ], 215 | "language": "python", 216 | "metadata": {}, 217 | "outputs": [] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "We'll split the data into training and validation sets, and do a decision tree\n", 224 | "fit to the data:" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "collapsed": false, 230 | "input": [ 231 | "from sklearn import cross_validation\n", 232 | "ctrain, ctest, ztrain, ztest = cross_validation.train_test_split(colors, redshift)\n", 233 | "\n", 234 | "from sklearn.tree import DecisionTreeRegressor\n", 235 | "clf = DecisionTreeRegressor()" 236 | ], 237 | "language": "python", 238 | "metadata": {}, 239 | "outputs": [] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "Let's start by running a 4-fold cross validation and see how we're doing. The\n", 246 | "cross validation here prints the r2 score, which lies between 0 and 1. The\n", 247 | "closer the score is to 1, the better:" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "collapsed": false, 253 | "input": [ 254 | "print cross_validation.cross_val_score(clf, colors, redshift, cv=4)" 255 | ], 256 | "language": "python", 257 | "metadata": {}, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Another way we can visualize the results is to scatter-plot the input versus the output:" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "collapsed": false, 270 | "input": [ 271 | "# We'll use this function several times below\n", 272 | "def plot_redshifts(ztrue, zpred):\n", 273 | " \"\"\"scatter-plot the true vs predicted redshifts\"\"\"\n", 274 | " fig, ax = plt.subplots(figsize=(8, 8))\n", 275 | " ax.plot(ztrue, zpred, '.k')\n", 276 | " \n", 277 | " # plot trend lines, +/- 0.1 in z\n", 278 | " ax.plot([0, 3], [0, 3], '--k')\n", 279 | " ax.plot([0, 3], [0.2, 3.2], ':k')\n", 280 | " ax.plot([0.2, 3.2], [0, 3], ':k')\n", 281 | " \n", 282 | " ax.text(2.9, 0.1,\n", 283 | " \"RMS = %.2g\" % np.sqrt(np.mean((ztrue - zpred) ** 2)),\n", 284 | " ha='right', va='bottom')\n", 285 | "\n", 286 | " ax.set_xlim(0, 3)\n", 287 | " ax.set_ylim(0, 3)\n", 288 | " \n", 289 | " ax.set_xlabel('True redshift')\n", 290 | " ax.set_ylabel('Predicted redshift')" 291 | ], 292 | "language": "python", 293 | "metadata": {}, 294 | "outputs": [] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "collapsed": false, 299 | "input": [ 300 | "clf = DecisionTreeRegressor()\n", 301 | "clf.fit(ctrain, ztrain)\n", 302 | "zpred = clf.predict(ctest)" 303 | ], 304 | "language": "python", 305 | "metadata": {}, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "collapsed": false, 311 | "input": [ 312 | "plot_redshifts(ztest, zpred)" 313 | ], 314 | "language": "python", 315 | "metadata": {}, 316 | "outputs": [] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "We see several things here: first, there are some regions of redshift space where\n", 323 | "the results are not very precise: they can vary by $\\pm 0.2$ or so.\n", 324 
| "\n", 325 | "Second, there are many *catastrophic outliers*: values where the prediction is\n", 326 | "completely wrong. Both these sources of error are important to minimize, or\n", 327 | "at the very least, statistically characterize. We'll explore some ways to do\n", 328 | "this below." 329 | ] 330 | }, 331 | { 332 | "cell_type": "heading", 333 | "level": 2, 334 | "metadata": {}, 335 | "source": [ 336 | "Optimizing the Model" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "The question of how to improve the model goes back to the discussion of\n", 344 | "Learning Curves from earlier. We'll start by attempting to optimize the\n", 345 | "model itself through cross-validation.\n", 346 | "\n", 347 | "The decision tree regressors have a few hyperparameters, but one of the\n", 348 | "more important is the *depth*. This tells how many times the data set\n", 349 | "is split in the process of computing a fit. Here we'll plot the validation\n", 350 | "curve for the maximum depth:" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "collapsed": false, 356 | "input": [ 357 | "# we'll explore results for max_depth from 1 to 20\n", 358 | "max_depth_array = np.arange(1, 21)\n", 359 | "train_error = np.zeros(len(max_depth_array))\n", 360 | "test_error = np.zeros(len(max_depth_array))" 361 | ], 362 | "language": "python", 363 | "metadata": {}, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "collapsed": false, 369 | "input": [ 370 | "for i, max_depth in enumerate(max_depth_array):\n", 371 | " clf = DecisionTreeRegressor(max_depth=max_depth)\n", 372 | " clf.fit(ctrain, ztrain)\n", 373 | "\n", 374 | " ztrain_pred = clf.predict(ctrain)\n", 375 | " ztest_pred = clf.predict(ctest)\n", 376 | "\n", 377 | " train_error[i] = np.sqrt(np.mean((ztrain_pred - ztrain) ** 2))\n", 378 | " test_error[i] = np.sqrt(np.mean((ztest_pred - ztest) ** 2))" 379 | ], 380 | "language": "python", 381 | "metadata": {}, 382 | "outputs": [] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "collapsed": false, 387 | "input": [ 388 | "plt.plot(max_depth_array, train_error, label='training')\n", 389 | "plt.plot(max_depth_array, test_error, label='validation')\n", 390 | "plt.grid()\n", 391 | "plt.legend(loc=3)\n", 392 | "plt.xlabel('max_depth')\n", 393 | "plt.ylabel('error')" 394 | ], 395 | "language": "python", 396 | "metadata": {}, 397 | "outputs": [] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "We see a very clean curve which looks much like what we'd expect: the\n", 404 | "training error decreases consistently as the model over-fits it more\n", 405 | "and more, while the validation error turns over at some optimal value.\n", 406 | "\n", 407 | "Note that scikit-learn has a set of functions which automate this sort\n", 408 | "of calculation: it's in the ``grid_search`` model. 
As of scikit-learn version 0.13,\n", 409 | "the interface for grid search is still evolving, so the following code\n", 410 | "may have to be adjusted when newer scikit-learn versions are released:" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "collapsed": false, 416 | "input": [ 417 | "from sklearn.grid_search import GridSearchCV\n", 418 | "clf = DecisionTreeRegressor()\n", 419 | "grid = GridSearchCV(clf, param_grid=dict(max_depth=max_depth_array))\n", 420 | "grid.fit(colors, redshift)\n", 421 | "print grid.best_params_" 422 | ], 423 | "language": "python", 424 | "metadata": {}, 425 | "outputs": [] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "The grid search uses a full cross-validation rather than a single validation set:\n", 432 | "this is what leads to the larger optimal value of `max_depth`." 433 | ] 434 | }, 435 | { 436 | "cell_type": "heading", 437 | "level": 3, 438 | "metadata": {}, 439 | "source": [ 440 | "Plotting the optimal model" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "We can read from the above plot that the optimal RMS is obtained with a\n", 448 | "max depth of about 7. Let's see what this looks like:" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "collapsed": false, 454 | "input": [ 455 | "clf = DecisionTreeRegressor(max_depth=7)\n", 456 | "clf.fit(ctrain, ztrain)\n", 457 | "zpred = clf.predict(ctest)\n", 458 | "plot_redshifts(ztest, zpred)" 459 | ], 460 | "language": "python", 461 | "metadata": {}, 462 | "outputs": [] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "Ugh... not pretty.\n", 469 | "\n", 470 | "Even though *by eye* this looks like a much worse fit, than we had above, it actually\n", 471 | "does have a better RMS residual (0.21 vs. 0.27). This is a good illustration that\n", 472 | "**the form of the loss or score function is very important**." 473 | ] 474 | }, 475 | { 476 | "cell_type": "heading", 477 | "level": 2, 478 | "metadata": {}, 479 | "source": [ 480 | "An Alternative Loss Function" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "We said above that along with being concerned about RMS error, we're also\n", 488 | "concerned about the level of outliers in the data. With that in mind, we\n", 489 | "could propose another loss function particular to this problem: the fraction\n", 490 | "of points with an absolute deviation greater than 0.2 (that is, outside the\n", 491 | "dotted lines in the scatter-plot).\n", 492 | "\n", 493 | "We'll define the function to compute this, and then create the validation\n", 494 | "curve for this metric:" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "collapsed": false, 500 | "input": [ 501 | "def outlier_fraction(y_pred, y_true, cutoff=0.2):\n", 502 | " return np.sum((abs(y_pred - y_true) > cutoff)) * 1. 
/ len(y_pred)" 503 | ], 504 | "language": "python", 505 | "metadata": {}, 506 | "outputs": [] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "collapsed": false, 511 | "input": [ 512 | "train_outfrac = np.zeros(len(max_depth_array))\n", 513 | "test_outfrac = np.zeros(len(max_depth_array))\n", 514 | "\n", 515 | "for i, max_depth in enumerate(max_depth_array):\n", 516 | " clf = DecisionTreeRegressor(max_depth=max_depth)\n", 517 | " clf.fit(ctrain, ztrain)\n", 518 | "\n", 519 | " ztrain_pred = clf.predict(ctrain)\n", 520 | " ztest_pred = clf.predict(ctest)\n", 521 | "\n", 522 | " train_outfrac[i] = outlier_fraction(ztrain_pred, ztrain)\n", 523 | " test_outfrac[i] = outlier_fraction(ztest_pred, ztest)" 524 | ], 525 | "language": "python", 526 | "metadata": {}, 527 | "outputs": [] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "collapsed": false, 532 | "input": [ 533 | "plt.plot(max_depth_array, train_outfrac)\n", 534 | "plt.plot(max_depth_array, test_outfrac)\n", 535 | "plt.grid()" 536 | ], 537 | "language": "python", 538 | "metadata": {}, 539 | "outputs": [] 540 | }, 541 | { 542 | "cell_type": "markdown", 543 | "metadata": {}, 544 | "source": [ 545 | "This outlier-based loss function settles on a much deeper tree. Let's\n", 546 | "see what the result looks like with a max depth of 20:" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "collapsed": false, 552 | "input": [ 553 | "clf = DecisionTreeRegressor(max_depth=20)\n", 554 | "clf.fit(ctrain, ztrain)\n", 555 | "zpred = clf.predict(ctest)\n", 556 | "plot_redshifts(ztest, zpred)" 557 | ], 558 | "language": "python", 559 | "metadata": {}, 560 | "outputs": [] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "Unfortunately, this is about as far as we can go with a simple decision tree\n", 567 | "trained on this data. There are two more possibilities we can pursue, though:\n", 568 | "\n", 569 | "- Optimize the data: observe more samples, or observe more features of each sample\n", 570 | " (using learning curves to determine which is better)\n", 571 | "- Optimize the model: use a more sophisticated estimator \n", 572 | "\n", 573 | "These approaches will be exercises below." 574 | ] 575 | }, 576 | { 577 | "cell_type": "heading", 578 | "level": 2, 579 | "metadata": {}, 580 | "source": [ 581 | "Exercise: Optimizing the Data" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "Earlier we showed how *learning curves* can be used to determine the best course\n", 589 | "of action when a model is under-performing: should we gather more samples? Gather\n", 590 | "more features? 
Seek a more sophisticated model?\n", 591 | "\n", 592 | "The goal of this exercise is to use learning curves to answer this question:\n", 593 | "**how should astronomers spend their resources when trying to improve photometric redshifts?**" 594 | ] 595 | }, 596 | { 597 | "cell_type": "heading", 598 | "level": 2, 599 | "metadata": {}, 600 | "source": [ 601 | "Exercise: Better Results Through Ensemble Methods" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "One way to improve upon Decision Trees is to use *Ensemble Methods*.\n", 609 | "Ensemble methods (also known as *boosting* and *bagging*) use ensembles\n", 610 | "of randomized estimators which can prevent over-fitting and lead to very\n", 611 | "powerful learning algorithms.\n", 612 | "\n", 613 | "It is interesting to see how small an RMS you can attain on the photometric\n", 614 | "redshift problem using a more sophisticated method. Try one of the following:\n", 615 | "\n", 616 | "- ``sklearn.ensemble.RandomForestRegressor``\n", 617 | "- ``sklearn.ensemble.GradientBoostingRegressor``\n", 618 | "- ``sklearn.ensemble.ExtraTreesRegressor``\n", 619 | "\n", 620 | "You can read more about each of these methods in the scikit-learn documentation:\n", 621 | "\n", 622 | "- http://scikit-learn.org/stable/modules/ensemble.html\n", 623 | "\n", 624 | "Each method has hyperparameters which need to be determined using cross-validation\n", 625 | "steps like those above. Can you use ensemble methods to reduce the rms error for\n", 626 | "the test set down below 0.1?\n", 627 | "\n", 628 | "Here you can adjust several hyperparameters, but the important ones will be\n", 629 | "the number of estimators ``n_estimators`` as well as the maximum depth\n", 630 | "``max_depth`` that we saw above." 631 | ] 632 | } 633 | ], 634 | "metadata": {} 635 | } 636 | ] 637 | } -------------------------------------------------------------------------------- /notebooks/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from galaxy_mags import fetch_sdss_galaxy_mags 2 | -------------------------------------------------------------------------------- /notebooks/datasets/galaxy_mags.py: -------------------------------------------------------------------------------- 1 | # This download script comes from astroML: http://astroml.github.com 2 | import os 3 | import urllib 4 | import numpy as np 5 | 6 | #---------------------------------------------------------------------- 7 | # Tools for querying the SDSS database using SQL 8 | PUBLIC_URL = 'http://skyserver.sdss.org/dr7/en/tools/search/x_sql.asp' 9 | DEFAULT_FMT = 'csv' 10 | 11 | 12 | def remove_sql_comments(sql): 13 | """Strip SQL comments starting with --""" 14 | return ' \n'.join(map(lambda x: x.split('--')[0], sql.split('\n'))) 15 | 16 | 17 | def sql_query(sql_str, url=PUBLIC_URL, format='csv'): 18 | """Execute query 19 | 20 | Parameters 21 | ---------- 22 | sql_str : string 23 | valid sql query 24 | 25 | url: string (optional) 26 | query url. 
Default is http://cas.sdss.org query script 27 | 28 | format: string (default='csv') 29 | query output format 30 | 31 | Returns 32 | ------- 33 | F: file object 34 | results of the query 35 | """ 36 | sql_str = remove_sql_comments(sql_str) 37 | params = urllib.urlencode(dict(cmd=sql_str, format=format)) 38 | return urllib.urlopen(url + '?%s' % params) 39 | 40 | 41 | SPECCLASS = ['UNKNOWN', 'STAR', 'GALAXY', 'QSO', 42 | 'HIZ_QSO', 'SKY', 'STAR_LATE', 'GAL_EM'] 43 | 44 | NOBJECTS = 50000 45 | 46 | GAL_MAGS_DTYPE = [('u', float), 47 | ('g', float), 48 | ('r', float), 49 | ('i', float), 50 | ('z', float), 51 | ('specClass', int), 52 | ('redshift', float), 53 | ('redshift_err', float)] 54 | 55 | ARCHIVE_FILE = 'sdss_galaxy_mags.npy' 56 | 57 | DATA_HOME = os.path.join(os.path.dirname(os.path.abspath(__file__)), 58 | 'data') 59 | 60 | def fetch_sdss_galaxy_mags(data_home=DATA_HOME, download_if_missing=True): 61 | """Loader for SDSS galaxy magnitudes. 62 | 63 | This function directly queries the sdss SQL database at 64 | http://cas.sdss.org/ 65 | 66 | Parameters 67 | ---------- 68 | data_home : optional, default=None 69 | Specify another download and cache folder for the datasets. By default 70 | all scikit learn data is stored in '~/astroML_data' subfolders. 71 | 72 | download_if_missing : optional, default=True 73 | If False, raise a IOError if the data is not locally available 74 | instead of trying to download the data from the source site. 75 | 76 | Returns 77 | ------- 78 | data : recarray, shape = (10000,) 79 | record array containing magnitudes and redshift for each galaxy 80 | """ 81 | if not os.path.exists(data_home): 82 | os.makedirs(data_home) 83 | 84 | archive_file = os.path.join(data_home, ARCHIVE_FILE) 85 | 86 | query_text = ('\n'.join( 87 | ("SELECT TOP %i" % NOBJECTS, 88 | " p.u, p.g, p.r, p.i, p.z, s.specClass, s.z, s.zerr", 89 | "FROM PhotoObj AS p", 90 | " JOIN SpecObj AS s ON s.bestobjid = p.objid", 91 | "WHERE ", 92 | " p.u BETWEEN 0 AND 19.6", 93 | " AND p.g BETWEEN 0 AND 20", 94 | " AND s.specClass > 1 -- not UNKNOWN or STAR", 95 | " AND s.specClass <> 5 -- not SKY", 96 | " AND s.specClass <> 6 -- not STAR_LATE"))) 97 | 98 | if not os.path.exists(archive_file): 99 | if not download_if_missing: 100 | raise IOError('data not present on disk. ' 101 | 'set download_if_missing=True to download') 102 | 103 | print "querying for %i objects" % NOBJECTS 104 | print query_text 105 | output = sql_query(query_text) 106 | print "finished." 107 | 108 | data = np.loadtxt(output, delimiter=',', 109 | skiprows=1, dtype=GAL_MAGS_DTYPE) 110 | np.save(archive_file, data) 111 | 112 | else: 113 | data = np.load(archive_file) 114 | 115 | return data 116 | -------------------------------------------------------------------------------- /notebooks/figures/ML_flow_chart.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tutorial Diagrams 3 | ----------------- 4 | 5 | This script plots the flow-charts used in the scikit-learn tutorials. 
6 | """ 7 | 8 | import numpy as np 9 | import pylab as pl 10 | from matplotlib.patches import Circle, Rectangle, Polygon, Arrow, FancyArrow 11 | 12 | def create_base(box_bg = '#CCCCCC', 13 | arrow1 = '#88CCFF', 14 | arrow2 = '#88FF88', 15 | supervised=True): 16 | fig = pl.figure(figsize=(9, 6), facecolor='w') 17 | ax = pl.axes((0, 0, 1, 1), 18 | xticks=[], yticks=[], frameon=False) 19 | ax.set_xlim(0, 9) 20 | ax.set_ylim(0, 6) 21 | 22 | patches = [Rectangle((0.3, 3.6), 1.5, 1.8, zorder=1, fc=box_bg), 23 | Rectangle((0.5, 3.8), 1.5, 1.8, zorder=2, fc=box_bg), 24 | Rectangle((0.7, 4.0), 1.5, 1.8, zorder=3, fc=box_bg), 25 | 26 | Rectangle((2.9, 3.6), 0.2, 1.8, fc=box_bg), 27 | Rectangle((3.1, 3.8), 0.2, 1.8, fc=box_bg), 28 | Rectangle((3.3, 4.0), 0.2, 1.8, fc=box_bg), 29 | 30 | Rectangle((0.3, 0.2), 1.5, 1.8, fc=box_bg), 31 | 32 | Rectangle((2.9, 0.2), 0.2, 1.8, fc=box_bg), 33 | 34 | Circle((5.5, 3.5), 1.0, fc=box_bg), 35 | 36 | Polygon([[5.5, 1.7], 37 | [6.1, 1.1], 38 | [5.5, 0.5], 39 | [4.9, 1.1]], fc=box_bg), 40 | 41 | FancyArrow(2.3, 4.6, 0.35, 0, fc=arrow1, 42 | width=0.25, head_width=0.5, head_length=0.2), 43 | 44 | FancyArrow(3.75, 4.2, 0.5, -0.2, fc=arrow1, 45 | width=0.25, head_width=0.5, head_length=0.2), 46 | 47 | FancyArrow(5.5, 2.4, 0, -0.4, fc=arrow1, 48 | width=0.25, head_width=0.5, head_length=0.2), 49 | 50 | FancyArrow(2.0, 1.1, 0.5, 0, fc=arrow2, 51 | width=0.25, head_width=0.5, head_length=0.2), 52 | 53 | FancyArrow(3.3, 1.1, 1.3, 0, fc=arrow2, 54 | width=0.25, head_width=0.5, head_length=0.2), 55 | 56 | FancyArrow(6.2, 1.1, 0.8, 0, fc=arrow2, 57 | width=0.25, head_width=0.5, head_length=0.2)] 58 | 59 | if supervised: 60 | patches += [Rectangle((0.3, 2.4), 1.5, 0.5, zorder=1, fc=box_bg), 61 | Rectangle((0.5, 2.6), 1.5, 0.5, zorder=2, fc=box_bg), 62 | Rectangle((0.7, 2.8), 1.5, 0.5, zorder=3, fc=box_bg), 63 | FancyArrow(2.3, 2.9, 2.0, 0, fc=arrow1, 64 | width=0.25, head_width=0.5, head_length=0.2), 65 | Rectangle((7.3, 0.85), 1.5, 0.5, fc=box_bg)] 66 | else: 67 | patches += [Rectangle((7.3, 0.2), 1.5, 1.8, fc=box_bg)] 68 | 69 | for p in patches: 70 | ax.add_patch(p) 71 | 72 | pl.text(1.45, 4.9, "Training\nText,\nDocuments,\nImages,\netc.", 73 | ha='center', va='center', fontsize=14) 74 | 75 | pl.text(3.6, 4.9, "Feature\nVectors", 76 | ha='left', va='center', fontsize=14) 77 | 78 | pl.text(5.5, 3.5, "Machine\nLearning\nAlgorithm", 79 | ha='center', va='center', fontsize=14) 80 | 81 | pl.text(1.05, 1.1, "New Text,\nDocument,\nImage,\netc.", 82 | ha='center', va='center', fontsize=14) 83 | 84 | pl.text(3.3, 1.7, "Feature\nVector", 85 | ha='left', va='center', fontsize=14) 86 | 87 | pl.text(5.5, 1.1, "Predictive\nModel", 88 | ha='center', va='center', fontsize=12) 89 | 90 | if supervised: 91 | pl.text(1.45, 3.05, "Labels", 92 | ha='center', va='center', fontsize=14) 93 | 94 | pl.text(8.05, 1.1, "Expected\nLabel", 95 | ha='center', va='center', fontsize=14) 96 | pl.text(8.8, 5.8, "Supervised Learning Model", 97 | ha='right', va='top', fontsize=18) 98 | 99 | else: 100 | pl.text(8.05, 1.1, 101 | "Likelihood\nor Cluster ID\nor Better\nRepresentation", 102 | ha='center', va='center', fontsize=12) 103 | pl.text(8.8, 5.8, "Unsupervised Learning Model", 104 | ha='right', va='top', fontsize=18) 105 | 106 | 107 | 108 | def plot_supervised_chart(annotate=False): 109 | create_base(supervised=True) 110 | if annotate: 111 | fontdict = dict(color='r', weight='bold', size=14) 112 | pl.text(1.9, 4.55, 'X = vec.fit_transform(input)', 113 | fontdict=fontdict, 114 | rotation=20, ha='left', 
va='bottom') 115 | pl.text(3.7, 3.2, 'clf.fit(X, y)', 116 | fontdict=fontdict, 117 | rotation=20, ha='left', va='bottom') 118 | pl.text(1.7, 1.5, 'X_new = vec.transform(input)', 119 | fontdict=fontdict, 120 | rotation=20, ha='left', va='bottom') 121 | pl.text(6.1, 1.5, 'y_new = clf.predict(X_new)', 122 | fontdict=fontdict, 123 | rotation=20, ha='left', va='bottom') 124 | 125 | def plot_unsupervised_chart(): 126 | create_base(supervised=False) 127 | 128 | 129 | if __name__ == '__main__': 130 | plot_supervised_chart(False) 131 | plot_supervised_chart(True) 132 | plot_unsupervised_chart() 133 | pl.show() 134 | 135 | 136 | -------------------------------------------------------------------------------- /notebooks/figures/__init__.py: -------------------------------------------------------------------------------- 1 | from sgd_separator import plot_sgd_separator 2 | from linear_regression import plot_linear_regression 3 | from ML_flow_chart import plot_supervised_chart, plot_unsupervised_chart 4 | from bias_variance import plot_bias_variance 5 | from sdss_filters import plot_sdss_filters, plot_redshifts 6 | -------------------------------------------------------------------------------- /notebooks/figures/bias_variance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def test_func(x, err=0.5): 6 | return np.random.normal(10 - 1. / (x + 0.1), err) 7 | 8 | 9 | def compute_error(x, y, p): 10 | yfit = np.polyval(p, x) 11 | return np.sqrt(np.mean((y - yfit) ** 2)) 12 | 13 | 14 | def plot_bias_variance(N=8, random_seed=42, err=0.5): 15 | np.random.seed(random_seed) 16 | x = 10 ** np.linspace(-2, 0, N) 17 | y = test_func(x) 18 | 19 | xfit = np.linspace(-0.2, 1.2, 1000) 20 | 21 | titles = ['d = 1 (under-fit; high bias)', 22 | 'd = 2', 23 | 'd = 6 (over-fit; high variance)'] 24 | degrees = [1, 2, 6] 25 | 26 | fig = plt.figure(figsize = (9, 3.5)) 27 | fig.subplots_adjust(left = 0.06, right=0.98, 28 | bottom=0.15, top=0.85, 29 | wspace=0.05) 30 | for i, d in enumerate(degrees): 31 | ax = fig.add_subplot(131 + i, xticks=[], yticks=[]) 32 | ax.scatter(x, y, marker='x', c='k', s=50) 33 | 34 | p = np.polyfit(x, y, d) 35 | yfit = np.polyval(p, xfit) 36 | ax.plot(xfit, yfit, '-b') 37 | 38 | ax.set_xlim(-0.2, 1.2) 39 | ax.set_ylim(0, 12) 40 | ax.set_xlabel('house size') 41 | if i == 0: 42 | ax.set_ylabel('price') 43 | 44 | ax.set_title(titles[i]) 45 | 46 | if __name__ == '__main__': 47 | plot_bias_variance() 48 | plt.show() 49 | -------------------------------------------------------------------------------- /notebooks/figures/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LinearRegression 4 | 5 | 6 | def plot_linear_regression(): 7 | a = 0.5 8 | b = 1.0 9 | 10 | # x from 0 to 10 11 | x = 30 * np.random.random(20) 12 | 13 | # y = a*x + b with noise 14 | y = a * x + b + np.random.normal(size=x.shape) 15 | 16 | # create a linear regression classifier 17 | clf = LinearRegression() 18 | clf.fit(x[:, None], y) 19 | 20 | # predict y from the data 21 | x_new = np.linspace(0, 30, 100) 22 | y_new = clf.predict(x_new[:, None]) 23 | 24 | # plot the results 25 | ax = plt.axes() 26 | ax.scatter(x, y) 27 | ax.plot(x_new, y_new) 28 | 29 | ax.set_xlabel('x') 30 | ax.set_ylabel('y') 31 | 32 | ax.axis('tight') 33 | 34 | 35 | if __name__ == '__main__': 36 | 
plot_linear_regression() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /notebooks/figures/sdss_filters.py: -------------------------------------------------------------------------------- 1 | """ 2 | SDSS Filters 3 | ------------ 4 | 5 | This example downloads and plots the filters from the Sloan Digital Sky 6 | Survey, along with a reference spectrum. 7 | """ 8 | import os 9 | import urllib2 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from matplotlib.patches import Arrow 14 | 15 | DOWNLOAD_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 16 | 'downloads') 17 | REFSPEC_URL = 'http://www.astro.washington.edu/users/ivezic/DMbook/data/1732526_nic_002.ascii' 18 | FILTER_URL = 'http://www.sdss.org/dr7/instruments/imager/filters/%s.dat' 19 | 20 | def fetch_filter(filt): 21 | assert filt in 'ugriz' 22 | url = FILTER_URL % filt 23 | 24 | if not os.path.exists(DOWNLOAD_DIR): 25 | os.makedirs(DOWNLOAD_DIR) 26 | 27 | loc = os.path.join(DOWNLOAD_DIR, '%s.dat' % filt) 28 | if not os.path.exists(loc): 29 | print "downloading from %s" % url 30 | F = urllib2.urlopen(url) 31 | open(loc, 'w').write(F.read()) 32 | 33 | F = open(loc) 34 | 35 | data = np.loadtxt(F) 36 | return data 37 | 38 | 39 | def fetch_vega_spectrum(): 40 | if not os.path.exists(DOWNLOAD_DIR): 41 | os.makedirs(DOWNLOAD_DIR) 42 | 43 | refspec_file = os.path.join(DOWNLOAD_DIR, REFSPEC_URL.split('/')[-1]) 44 | 45 | if not os.path.exists(refspec_file): 46 | print "downloading from %s" % REFSPEC_URL 47 | F = urllib2.urlopen(REFSPEC_URL) 48 | open(refspec_file, 'w').write(F.read()) 49 | 50 | F = open(refspec_file) 51 | 52 | data = np.loadtxt(F) 53 | return data 54 | 55 | 56 | def plot_sdss_filters(): 57 | Xref = fetch_vega_spectrum() 58 | Xref[:, 1] /= 2.1 * Xref[:, 1].max() 59 | 60 | #---------------------------------------------------------------------- 61 | # Plot filters in color with a single spectrum 62 | fig, ax = plt.subplots() 63 | ax.plot(Xref[:, 0], Xref[:, 1], '-k', lw=2) 64 | 65 | for f,c in zip('ugriz', 'bgrmk'): 66 | X = fetch_filter(f) 67 | ax.fill(X[:, 0], X[:, 1], ec=c, fc=c, alpha=0.4) 68 | 69 | kwargs = dict(fontsize=20, ha='center', va='center', alpha=0.5) 70 | ax.text(3500, 0.02, 'u', color='b', **kwargs) 71 | ax.text(4600, 0.02, 'g', color='g', **kwargs) 72 | ax.text(6100, 0.02, 'r', color='r', **kwargs) 73 | ax.text(7500, 0.02, 'i', color='m', **kwargs) 74 | ax.text(8800, 0.02, 'z', color='k', **kwargs) 75 | 76 | ax.set_xlim(3000, 11000) 77 | 78 | ax.set_title('SDSS Filters and Reference Spectrum') 79 | ax.set_xlabel('Wavelength (Angstroms)') 80 | ax.set_ylabel('normalized flux / filter transmission') 81 | 82 | 83 | def plot_redshifts(): 84 | Xref = fetch_vega_spectrum() 85 | Xref[:, 1] /= 2.1 * Xref[:, 1].max() 86 | 87 | #---------------------------------------------------------------------- 88 | # Plot filters in gray with several redshifted spectra 89 | fig, ax = plt.subplots() 90 | 91 | redshifts = [0.0, 0.4, 0.8] 92 | colors = 'bgr' 93 | 94 | for z, c in zip(redshifts, colors): 95 | plt.plot((1. 
+ z) * Xref[:, 0], Xref[:, 1], color=c) 96 | 97 | ax.add_patch(Arrow(4200, 0.47, 1300, 0, lw=0, width=0.05, color='r')) 98 | ax.add_patch(Arrow(5800, 0.47, 1250, 0, lw=0, width=0.05, color='r')) 99 | 100 | ax.text(3800, 0.49, 'z = 0.0', fontsize=14, color=colors[0]) 101 | ax.text(5500, 0.49, 'z = 0.4', fontsize=14, color=colors[1]) 102 | ax.text(7300, 0.49, 'z = 0.8', fontsize=14, color=colors[2]) 103 | 104 | for f in 'ugriz': 105 | X = fetch_filter(f) 106 | ax.fill(X[:, 0], X[:, 1], ec='k', fc='k', alpha=0.2) 107 | 108 | kwargs = dict(fontsize=20, color='gray', ha='center', va='center') 109 | ax.text(3500, 0.02, 'u', **kwargs) 110 | ax.text(4600, 0.02, 'g', **kwargs) 111 | ax.text(6100, 0.02, 'r', **kwargs) 112 | ax.text(7500, 0.02, 'i', **kwargs) 113 | ax.text(8800, 0.02, 'z', **kwargs) 114 | 115 | ax.set_xlim(3000, 11000) 116 | ax.set_ylim(0, 0.55) 117 | 118 | ax.set_title('Redshifting of a Spectrum') 119 | ax.set_xlabel('Observed Wavelength (Angstroms)') 120 | ax.set_ylabel('normalized flux / filter transmission') 121 | 122 | 123 | if __name__ == '__main__': 124 | plot_sdss_filters() 125 | plot_redshifts() 126 | plt.show() 127 | -------------------------------------------------------------------------------- /notebooks/figures/sgd_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import SGDClassifier 4 | from sklearn.datasets.samples_generator import make_blobs 5 | 6 | def plot_sgd_separator(): 7 | # we create 50 separable points 8 | X, Y = make_blobs(n_samples=50, centers=2, 9 | random_state=0, cluster_std=0.60) 10 | 11 | # fit the model 12 | clf = SGDClassifier(loss="hinge", alpha=0.01, 13 | n_iter=200, fit_intercept=True) 14 | clf.fit(X, Y) 15 | 16 | # plot the line, the points, and the nearest vectors to the plane 17 | xx = np.linspace(-1, 5, 10) 18 | yy = np.linspace(-1, 5, 10) 19 | 20 | X1, X2 = np.meshgrid(xx, yy) 21 | Z = np.empty(X1.shape) 22 | for (i, j), val in np.ndenumerate(X1): 23 | x1 = val 24 | x2 = X2[i, j] 25 | p = clf.decision_function([x1, x2]) 26 | Z[i, j] = p[0] 27 | levels = [-1.0, 0.0, 1.0] 28 | linestyles = ['dashed', 'solid', 'dashed'] 29 | colors = 'k' 30 | 31 | ax = plt.axes() 32 | ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles) 33 | ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired) 34 | 35 | ax.axis('tight') 36 | 37 | 38 | if __name__ == '__main__': 39 | plot_sgd_separator() 40 | plt.show() 41 | -------------------------------------------------------------------------------- /notebooks/figures/svm_gui_frames.py: -------------------------------------------------------------------------------- 1 | """ 2 | Linear Model Example 3 | -------------------- 4 | 5 | This is an example plot from the tutorial which accompanies an explanation 6 | of the support vector machine GUI. 
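A minimal usage sketch (added as an illustration, not part of the original file; it assumes the module is imported from its own directory):

    import pylab as pl
    from svm_gui_frames import plot_linear_model, plot_rbf_model

    plot_linear_model()   # linear-kernel SVC on two separated blobs, support vectors circled
    plot_rbf_model()      # RBF-kernel SVC on a ring-shaped dataset
    pl.show()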
7 | """ 8 | 9 | import numpy as np 10 | import pylab as pl 11 | import matplotlib 12 | 13 | from sklearn import svm 14 | 15 | 16 | def linear_model(rseed=42, Npts=30): 17 | np.random.seed(rseed) 18 | 19 | 20 | data = np.random.normal(0, 10, (Npts, 2)) 21 | data[:Npts / 2] -= 15 22 | data[Npts / 2:] += 15 23 | 24 | labels = np.ones(Npts) 25 | labels[:Npts / 2] = -1 26 | 27 | return data, labels 28 | 29 | 30 | def nonlinear_model(rseed=42, Npts=30): 31 | radius = 40 * np.random.random(Npts) 32 | far_pts = radius > 20 33 | radius[far_pts] *= 1.2 34 | radius[~far_pts] *= 1.1 35 | 36 | theta = np.random.random(Npts) * np.pi * 2 37 | 38 | data = np.empty((Npts, 2)) 39 | data[:, 0] = radius * np.cos(theta) 40 | data[:, 1] = radius * np.sin(theta) 41 | 42 | labels = np.ones(Npts) 43 | labels[far_pts] = -1 44 | 45 | return data, labels 46 | 47 | 48 | def plot_linear_model(): 49 | X, y = linear_model() 50 | clf = svm.SVC(kernel='linear', 51 | gamma=0.01, coef0=0, degree=3) 52 | clf.fit(X, y) 53 | 54 | fig = pl.figure() 55 | ax = pl.subplot(111, xticks=[], yticks=[]) 56 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=pl.cm.bone) 57 | 58 | ax.scatter(clf.support_vectors_[:, 0], 59 | clf.support_vectors_[:, 1], 60 | s=80, edgecolors="k", facecolors="none") 61 | 62 | delta = 1 63 | y_min, y_max = -50, 50 64 | x_min, x_max = -50, 50 65 | x = np.arange(x_min, x_max + delta, delta) 66 | y = np.arange(y_min, y_max + delta, delta) 67 | X1, X2 = np.meshgrid(x, y) 68 | Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()]) 69 | Z = Z.reshape(X1.shape) 70 | 71 | levels = [-1.0, 0.0, 1.0] 72 | linestyles = ['dashed', 'solid', 'dashed'] 73 | colors = 'k' 74 | ax.contour(X1, X2, Z, levels, 75 | colors=colors, 76 | linestyles=linestyles) 77 | 78 | 79 | def plot_rbf_model(): 80 | X, y = nonlinear_model() 81 | clf = svm.SVC(kernel='rbf', 82 | gamma=0.001, coef0=0, degree=3) 83 | clf.fit(X, y) 84 | 85 | fig = pl.figure() 86 | ax = pl.subplot(111, xticks=[], yticks=[]) 87 | ax.scatter(X[:, 0], X[:, 1], c=y, cmap=pl.cm.bone, zorder=2) 88 | 89 | ax.scatter(clf.support_vectors_[:, 0], 90 | clf.support_vectors_[:, 1], 91 | s=80, edgecolors="k", facecolors="none") 92 | 93 | delta = 1 94 | y_min, y_max = -50, 50 95 | x_min, x_max = -50, 50 96 | x = np.arange(x_min, x_max + delta, delta) 97 | y = np.arange(y_min, y_max + delta, delta) 98 | X1, X2 = np.meshgrid(x, y) 99 | Z = clf.decision_function(np.c_[X1.ravel(), X2.ravel()]) 100 | Z = Z.reshape(X1.shape) 101 | 102 | levels = [-1.0, 0.0, 1.0] 103 | linestyles = ['dashed', 'solid', 'dashed'] 104 | colors = 'k' 105 | 106 | ax.contourf(X1, X2, Z, 10, 107 | cmap=matplotlib.cm.bone, 108 | origin='lower', 109 | alpha=0.85, zorder=1) 110 | ax.contour(X1, X2, Z, [0.0], 111 | colors='k', 112 | linestyles=['solid'], zorder=1) 113 | 114 | 115 | if __name__ == '__main__': 116 | plot_linear_model() 117 | plot_rbf_model() 118 | pl.show() 119 | 120 | -------------------------------------------------------------------------------- /notebooks/files/iris_setosa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakevdp/sklearn_pycon2013/c76e7cb6a62d05c3d32bfb8af23cbeab1dc75676/notebooks/files/iris_setosa.jpg -------------------------------------------------------------------------------- /notebooks/files/iris_versicolor.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jakevdp/sklearn_pycon2013/c76e7cb6a62d05c3d32bfb8af23cbeab1dc75676/notebooks/files/iris_versicolor.jpg -------------------------------------------------------------------------------- /notebooks/files/iris_virginica.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jakevdp/sklearn_pycon2013/c76e7cb6a62d05c3d32bfb8af23cbeab1dc75676/notebooks/files/iris_virginica.jpg -------------------------------------------------------------------------------- /notebooks/generate_v2.py: -------------------------------------------------------------------------------- 1 | """Simple utility script for semi-gracefully downgrading v3 notebooks to v2""" 2 | 3 | import io 4 | import os 5 | 6 | from IPython.nbformat import current 7 | 8 | def heading_to_md(cell): 9 | """turn heading cell into corresponding markdown""" 10 | cell.cell_type = "markdown" 11 | level = cell.pop('level', 1) 12 | cell.source = '#'*level + ' ' + cell.source 13 | 14 | def raw_to_md(cell): 15 | """let raw passthrough as markdown""" 16 | cell.cell_type = "markdown" 17 | 18 | def downgrade(nb): 19 | """downgrade a v3 notebook to v2""" 20 | if nb.nbformat != 3: 21 | return nb 22 | nb.nbformat = 2 23 | for ws in nb.worksheets: 24 | for cell in ws.cells: 25 | if cell.cell_type == 'heading': 26 | heading_to_md(cell) 27 | elif cell.cell_type == 'raw': 28 | raw_to_md(cell) 29 | return nb 30 | 31 | def downgrade_ipynb(fname): 32 | base, ext = os.path.splitext(fname) 33 | newname = base+'.v2'+ext 34 | print "downgrading %s -> %s" % (fname, newname) 35 | with io.open(fname, 'r', encoding='utf8') as f: 36 | nb = current.read(f, 'json') 37 | nb = downgrade(nb) 38 | with open(newname, 'w') as f: 39 | current.write(nb, f, 'json') 40 | 41 | if __name__ == '__main__': 42 | map(downgrade_ipynb, [f for f in os.listdir('.') 43 | if f.endswith('.ipynb') and 'v2' not in f]) 44 | -------------------------------------------------------------------------------- /notebooks/soln/boston_decision_tree.py: -------------------------------------------------------------------------------- 1 | clf = DecisionTreeRegressor() 2 | clf.fit(data.data, data.target) 3 | 4 | predicted = clf.predict(data.data) 5 | 6 | plt.scatter(data.target, predicted) 7 | plt.plot([0, 50], [0, 50], '--k') 8 | plt.axis('tight') 9 | plt.xlabel('True price ($1000s)') 10 | plt.ylabel('Predicted price ($1000s)') 11 | -------------------------------------------------------------------------------- /notebooks/soln/iris_kmeans.py: -------------------------------------------------------------------------------- 1 | kmeans = KMeans(n_clusters=3, random_state=rng).fit(X) 2 | 3 | plot_2D(X_pca, kmeans.labels_, ["c0", "c1", "c2"]) 4 | plt.title('K-Means labels') 5 | 6 | plot_2D(X_pca, iris.target, iris.target_names) 7 | plt.title('True labels') 8 | -------------------------------------------------------------------------------- /notebooks/soln/iris_rpca.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import RandomizedPCA 2 | 3 | X_rpca = RandomizedPCA(n_components=2).fit_transform(X) 4 | 5 | plot_PCA_2D(X_rpca, iris.target, iris.target_names) 6 | plt.title('Randomized PCA') 7 | 8 | plot_PCA_2D(X_pca, iris.target, iris.target_names) 9 | plt.title('PCA') 10 | -------------------------------------------------------------------------------- /notebooks/soln/show_faces.py: 
-------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_olivetti_faces 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | faces = fetch_olivetti_faces() 6 | 7 | # set up the figure 8 | fig = plt.figure(figsize=(6, 6)) # figure size in inches 9 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 10 | 11 | # plot the faces: each image is 64x64 pixels 12 | for i in range(64): 13 | ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[]) 14 | ax.imshow(faces.images[i], cmap=plt.cm.bone) 15 | -------------------------------------------------------------------------------- /scripts/svm_gui.py: -------------------------------------------------------------------------------- 1 | """ 2 | ========== 3 | Libsvm GUI 4 | ========== 5 | 6 | A simple graphical frontend for Libsvm mainly intended for didactic 7 | purposes. You can create data points by point and click and visualize 8 | the decision region induced by different kernels and parameter settings. 9 | 10 | To create positive examples click the left mouse button; to create 11 | negative examples click the right button. 12 | 13 | If all examples are from the same class, it uses a one-class SVM. 14 | 15 | """ 16 | from __future__ import division 17 | 18 | # Author: Peter Prettenhoer 19 | # 20 | # License: BSD Style. 21 | 22 | import matplotlib 23 | matplotlib.use('TkAgg') 24 | 25 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 26 | from matplotlib.backends.backend_tkagg import NavigationToolbar2TkAgg 27 | from matplotlib.figure import Figure 28 | from matplotlib.contour import ContourSet 29 | 30 | import Tkinter as Tk 31 | import sys 32 | import numpy as np 33 | 34 | from sklearn import svm 35 | from sklearn.datasets import dump_svmlight_file 36 | 37 | y_min, y_max = -50, 50 38 | x_min, x_max = -50, 50 39 | 40 | 41 | class Model(object): 42 | """The Model which hold the data. It implements the 43 | observable in the observer pattern and notifies the 44 | registered observers on change event. 45 | """ 46 | 47 | def __init__(self): 48 | self.observers = [] 49 | self.surface = None 50 | self.data = [] 51 | self.cls = None 52 | self.surface_type = 0 53 | 54 | def changed(self, event): 55 | """Notify the observers. """ 56 | for observer in self.observers: 57 | observer.update(event, self) 58 | 59 | def add_observer(self, observer): 60 | """Register an observer. 
""" 61 | self.observers.append(observer) 62 | 63 | def set_surface(self, surface): 64 | self.surface = surface 65 | 66 | def dump_svmlight_file(self, file): 67 | data = np.array(self.data) 68 | X = data[:, 0:2] 69 | y = data[:, 2] 70 | dump_svmlight_file(X, y, file) 71 | 72 | 73 | class Controller(object): 74 | def __init__(self, model): 75 | self.model = model 76 | self.kernel = Tk.IntVar() 77 | self.surface_type = Tk.IntVar() 78 | # Whether or not a model has been fitted 79 | self.fitted = False 80 | 81 | def fit(self): 82 | print "fit the model" 83 | train = np.array(self.model.data) 84 | X = train[:, 0:2] 85 | y = train[:, 2] 86 | 87 | C = float(self.complexity.get()) 88 | gamma = float(self.gamma.get()) 89 | coef0 = float(self.coef0.get()) 90 | degree = int(self.degree.get()) 91 | kernel_map = {0: "linear", 1: "rbf", 2: "poly"} 92 | if len(np.unique(y)) == 1: 93 | clf = svm.OneClassSVM(kernel=kernel_map[self.kernel.get()], 94 | gamma=gamma, coef0=coef0, degree=degree) 95 | clf.fit(X) 96 | else: 97 | clf = svm.SVC(kernel=kernel_map[self.kernel.get()], C=C, 98 | gamma=gamma, coef0=coef0, degree=degree) 99 | clf.fit(X, y) 100 | if hasattr(clf, 'score'): 101 | print "Accuracy:", clf.score(X, y) * 100 102 | X1, X2, Z = self.decision_surface(clf) 103 | self.model.clf = clf 104 | self.model.set_surface((X1, X2, Z)) 105 | self.model.surface_type = self.surface_type.get() 106 | self.fitted = True 107 | self.model.changed("surface") 108 | 109 | def decision_surface(self, cls): 110 | delta = 1 111 | x = np.arange(x_min, x_max + delta, delta) 112 | y = np.arange(y_min, y_max + delta, delta) 113 | X1, X2 = np.meshgrid(x, y) 114 | Z = cls.decision_function(np.c_[X1.ravel(), X2.ravel()]) 115 | Z = Z.reshape(X1.shape) 116 | return X1, X2, Z 117 | 118 | def clear_data(self): 119 | self.model.data = [] 120 | self.fitted = False 121 | self.model.changed("clear") 122 | 123 | def add_example(self, x, y, label): 124 | self.model.data.append((x, y, label)) 125 | self.model.changed("example_added") 126 | 127 | # update decision surface if already fitted. 128 | self.refit() 129 | 130 | def refit(self): 131 | """Refit the model if already fitted. """ 132 | if self.fitted: 133 | self.fit() 134 | 135 | 136 | class View(object): 137 | """Test docstring. 
""" 138 | def __init__(self, root, controller): 139 | f = Figure() 140 | ax = f.add_subplot(111) 141 | ax.set_xticks([]) 142 | ax.set_yticks([]) 143 | ax.set_xlim((x_min, x_max)) 144 | ax.set_ylim((y_min, y_max)) 145 | canvas = FigureCanvasTkAgg(f, master=root) 146 | canvas.show() 147 | canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1) 148 | canvas._tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1) 149 | canvas.mpl_connect('button_press_event', self.onclick) 150 | canvas.mpl_connect('key_press_event', self.keypress) 151 | canvas.mpl_connect('key_release_event', self.keyrelease) 152 | toolbar = NavigationToolbar2TkAgg(canvas, root) 153 | toolbar.update() 154 | self.controllbar = ControllBar(root, controller) 155 | self.f = f 156 | self.ax = ax 157 | self.canvas = canvas 158 | self.controller = controller 159 | self.contours = [] 160 | self.c_labels = None 161 | self.plot_kernels() 162 | 163 | self.control_key = False 164 | 165 | def plot_kernels(self): 166 | self.ax.text(-50, -60, "Linear: $u^T v$") 167 | self.ax.text(-20, -60, "RBF: $\exp (-\gamma \| u-v \|^2)$") 168 | self.ax.text(10, -60, "Poly: $(\gamma \, u^T v + r)^d$") 169 | 170 | def keypress(self, event): 171 | if event.key == 'control': 172 | self.control_key = True 173 | 174 | def keyrelease(self, event): 175 | if event.key == 'control': 176 | self.control_key = False 177 | 178 | def onclick(self, event): 179 | if event.xdata and event.ydata: 180 | if event.button == 1: 181 | if self.control_key: 182 | self.controller.add_example(event.xdata, event.ydata, -1) 183 | else: 184 | self.controller.add_example(event.xdata, event.ydata, 1) 185 | elif event.button in (2, 3): 186 | self.controller.add_example(event.xdata, event.ydata, -1) 187 | 188 | def update_example(self, model, idx): 189 | x, y, l = model.data[idx] 190 | if l == 1: 191 | color = 'w' 192 | elif l == -1: 193 | color = 'k' 194 | self.ax.plot([x], [y], "%so" % color, scalex=0.0, scaley=0.0) 195 | 196 | def update(self, event, model): 197 | if event == "examples_loaded": 198 | for i in xrange(len(model.data)): 199 | self.update_example(model, i) 200 | 201 | if event == "example_added": 202 | self.update_example(model, -1) 203 | 204 | if event == "clear": 205 | self.ax.clear() 206 | self.ax.set_xticks([]) 207 | self.ax.set_yticks([]) 208 | self.contours = [] 209 | self.c_labels = None 210 | self.plot_kernels() 211 | 212 | if event == "surface": 213 | self.remove_surface() 214 | self.plot_support_vectors(model.clf.support_vectors_) 215 | self.plot_decision_surface(model.surface, model.surface_type) 216 | 217 | self.canvas.draw() 218 | 219 | def remove_surface(self): 220 | """Remove old decision surface.""" 221 | if len(self.contours) > 0: 222 | for contour in self.contours: 223 | if isinstance(contour, ContourSet): 224 | for lineset in contour.collections: 225 | lineset.remove() 226 | else: 227 | contour.remove() 228 | self.contours = [] 229 | 230 | def plot_support_vectors(self, support_vectors): 231 | """Plot the support vectors by placing circles over the 232 | corresponding data points and adds the circle collection 233 | to the contours list.""" 234 | cs = self.ax.scatter(support_vectors[:, 0], support_vectors[:, 1], 235 | s=80, edgecolors="k", facecolors="none") 236 | self.contours.append(cs) 237 | 238 | def plot_decision_surface(self, surface, type): 239 | X1, X2, Z = surface 240 | if type == 0: 241 | levels = [-1.0, 0.0, 1.0] 242 | linestyles = ['dashed', 'solid', 'dashed'] 243 | colors = 'k' 244 | self.contours.append(self.ax.contour(X1, X2, Z, 
levels, 245 | colors=colors, 246 | linestyles=linestyles)) 247 | elif type == 1: 248 | self.contours.append(self.ax.contourf(X1, X2, Z, 10, 249 | cmap=matplotlib.cm.bone, 250 | origin='lower', 251 | alpha=0.85)) 252 | self.contours.append(self.ax.contour(X1, X2, Z, [0.0], 253 | colors='k', 254 | linestyles=['solid'])) 255 | else: 256 | raise ValueError("surface type unknown") 257 | 258 | 259 | class ControllBar(object): 260 | def __init__(self, root, controller): 261 | fm = Tk.Frame(root) 262 | kernel_group = Tk.Frame(fm) 263 | Tk.Radiobutton(kernel_group, text="Linear", variable=controller.kernel, 264 | value=0, command=controller.refit).pack(anchor=Tk.W) 265 | Tk.Radiobutton(kernel_group, text="RBF", variable=controller.kernel, 266 | value=1, command=controller.refit).pack(anchor=Tk.W) 267 | Tk.Radiobutton(kernel_group, text="Poly", variable=controller.kernel, 268 | value=2, command=controller.refit).pack(anchor=Tk.W) 269 | kernel_group.pack(side=Tk.LEFT) 270 | 271 | valbox = Tk.Frame(fm) 272 | controller.complexity = Tk.StringVar() 273 | controller.complexity.set("1.0") 274 | c = Tk.Frame(valbox) 275 | Tk.Label(c, text="C:", anchor="e", width=7).pack(side=Tk.LEFT) 276 | Tk.Entry(c, width=6, textvariable=controller.complexity).pack( 277 | side=Tk.LEFT) 278 | c.pack() 279 | 280 | controller.gamma = Tk.StringVar() 281 | controller.gamma.set("0.01") 282 | g = Tk.Frame(valbox) 283 | Tk.Label(g, text="gamma:", anchor="e", width=7).pack(side=Tk.LEFT) 284 | Tk.Entry(g, width=6, textvariable=controller.gamma).pack(side=Tk.LEFT) 285 | g.pack() 286 | 287 | controller.degree = Tk.StringVar() 288 | controller.degree.set("3") 289 | d = Tk.Frame(valbox) 290 | Tk.Label(d, text="degree:", anchor="e", width=7).pack(side=Tk.LEFT) 291 | Tk.Entry(d, width=6, textvariable=controller.degree).pack(side=Tk.LEFT) 292 | d.pack() 293 | 294 | controller.coef0 = Tk.StringVar() 295 | controller.coef0.set("0") 296 | r = Tk.Frame(valbox) 297 | Tk.Label(r, text="coef0:", anchor="e", width=7).pack(side=Tk.LEFT) 298 | Tk.Entry(r, width=6, textvariable=controller.coef0).pack(side=Tk.LEFT) 299 | r.pack() 300 | valbox.pack(side=Tk.LEFT) 301 | 302 | cmap_group = Tk.Frame(fm) 303 | Tk.Radiobutton(cmap_group, text="Hyperplanes", 304 | variable=controller.surface_type, value=0, 305 | command=controller.refit).pack(anchor=Tk.W) 306 | Tk.Radiobutton(cmap_group, text="Surface", 307 | variable=controller.surface_type, value=1, 308 | command=controller.refit).pack(anchor=Tk.W) 309 | 310 | cmap_group.pack(side=Tk.LEFT) 311 | 312 | train_button = Tk.Button(fm, text='Fit', width=5, 313 | command=controller.fit) 314 | train_button.pack() 315 | fm.pack(side=Tk.LEFT) 316 | Tk.Button(fm, text='Clear', width=5, 317 | command=controller.clear_data).pack(side=Tk.LEFT) 318 | 319 | 320 | def get_parser(): 321 | from optparse import OptionParser 322 | op = OptionParser() 323 | op.add_option("--output", 324 | action="store", type="str", dest="output", 325 | help="Path where to dump data.") 326 | return op 327 | 328 | 329 | def run_gui(): 330 | root = Tk.Tk() 331 | model = Model() 332 | controller = Controller(model) 333 | root.wm_title("Scikit-learn Libsvm GUI") 334 | view = View(root, controller) 335 | model.add_observer(view) 336 | Tk.mainloop() 337 | 338 | 339 | def main(argv): 340 | op = get_parser() 341 | opts, args = op.parse_args(argv[1:]) 342 | 343 | run_gui() 344 | 345 | if opts.output: 346 | model.dump_svmlight_file(opts.output) 347 | 348 | if __name__ == "__main__": 349 | main(sys.argv) 350 | 
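# Usage note (added commentary, not part of the upstream example): running
# ``python svm_gui.py`` opens a Tk window; a left click adds a positive
# example, a right click (or Ctrl + left click) adds a negative one, the
# radio buttons choose the kernel and the surface type, the text boxes set
# C, gamma, degree and coef0, and the 'Fit' button trains the SVM.  The
# ``--output`` option defined in get_parser() is intended to dump the
# clicked points in svmlight format.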
--------------------------------------------------------------------------------
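As a hedged, illustrative sketch (not part of the repository): once ``download_data.py`` has cached the SDSS magnitudes, the loader in ``datasets/galaxy_mags.py`` can feed a simple regression in the spirit of ``11_photoz_regression.ipynb``. The import path, feature choice, and tree depth below are assumptions; run it from the ``notebooks`` directory.

    import numpy as np
    from sklearn.tree import DecisionTreeRegressor
    from datasets.galaxy_mags import fetch_sdss_galaxy_mags

    data = fetch_sdss_galaxy_mags()

    # use the four adjacent colors (magnitude differences) as features
    X = np.column_stack([data['u'] - data['g'],
                         data['g'] - data['r'],
                         data['r'] - data['i'],
                         data['i'] - data['z']])
    z = data['redshift']

    # fit a decision tree and predict redshifts, in the same spirit as the
    # tutorial's house-price solution (soln/boston_decision_tree.py)
    clf = DecisionTreeRegressor(max_depth=10)
    clf.fit(X, z)
    z_pred = clf.predict(X)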