├── README.md ├── Chapter 1 - Machine Learning.ipynb ├── Chapter 7 - Writing Fast Code.ipynb ├── Chapter 3 - Model Selection; Overfitting and Generalization.ipynb ├── Chapter 2.5 - Data visualization .ipynb ├── Chapter 4 - Cross-validation.ipynb ├── Chapter 0 - Hello World.ipynb ├── Chapter 5 - Model Selection And Pipelines.ipynb ├── Chapter 6 - Working With Text Data..ipynb └── Chapter 2 - Introduction to Scikit-learn.ipynb /README.md: -------------------------------------------------------------------------------- 1 | scikit-learn-interactive-tutorial 2 | ================================= 3 | 4 | IPython notebooks and data for a scikit-learn tutorial. 5 | 6 | You can view the notebooks online [on nbviewer](http://nbviewer.ipython.org/github/amueller/scikit-learn-interactive-tutorial/tree/master/) 7 | -------------------------------------------------------------------------------- /Chapter 1 - Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "What is Machine Learning?\n", 15 | "==========================\n", 16 | "* Data + Algorithm -> predictive program.\n", 17 | "* Learn from past data, predict quantities on new, unseen data." 
18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Three kinds of Learning\n", 25 | "========================\n", 26 | "* Supervised\n", 27 | "* Unsupervised\n", 28 | "* Reinforcement\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Supervised learning\n", 36 | "====================\n", 37 | "\n", 38 | "Training: Examples X_train together with labels y_train.\n", 39 | "\n", 40 | "Testing: Given X_test, predict y_test.\n", 41 | "\n", 42 | "Examples\n", 43 | "---------\n", 44 | "\n", 45 | "* Classification (spam, sentiment analysis, ...)\n", 46 | "* Regression (stocks, sales, ...)\n", 47 | "* Ranking (retrieval, search, ...)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Unsupervised Learning\n", 55 | "=====================\n", 56 | "\n", 57 | "Examples X.\n", 58 | "Learn something about X.\n", 59 | "\n", 60 | "Examples\n", 61 | "--------\n", 62 | "* Dimensionality reduction\n", 63 | "* Clustering\n", 64 | "* Manifold learning\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Reinforcement Learning\n", 72 | "========================\n", 73 | "* Learn from experience (not covered)\n", 74 | "\n", 75 | "Examples\n", 76 | "---------\n", 77 | "* A robot learning to walk.\n", 78 | "* A car learning to park itself." 
79 | ] 80 | } 81 | ], 82 | "metadata": {} 83 | } 84 | ] 85 | } -------------------------------------------------------------------------------- /Chapter 7 - Writing Fast Code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Vectorize\n", 15 | "============" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "import numpy as np\n", 23 | "a = np.random.uniform(size=10000)\n", 24 | "b = np.random.uniform(size=10000)\n", 25 | "%time c = a + b\n" 26 | ], 27 | "language": "python", 28 | "metadata": {}, 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "def add_vectors(first, second):\n", 36 | " a = np.empty(first.shape)\n", 37 | " for i in xrange(len(first)):\n", 38 | " a[i] = first[i] + second[i]\n", 39 | " return a" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "collapsed": false, 48 | "input": [ 49 | "%time c = add_vectors(a, b)" 50 | ], 51 | "language": "python", 52 | "metadata": {}, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Don't write for-loops!\n", 60 | "------------------------\n", 61 | "\n", 62 | "If you can't avoid it (writing your custom algorithms)\n", 63 | "Use Cython\n", 64 | "===========" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "collapsed": false, 70 | "input": [ 71 | "%load_ext cythonmagic" 72 | ], 73 | "language": "python", 74 | "metadata": {}, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "%%cython\n", 82 | "\n", 83 | "import numpy as np\n", 84 | "cimport numpy as np\n", 85 | "\n", 86 | "def 
add_vectors_fast(first, second):\n", 87 | " a = np.empty(first.shape)\n", 88 | " for i in xrange(len(first)):\n", 89 | " a[i] = first[i] + second[i]\n", 90 | " return a\n" 91 | ], 92 | "language": "python", 93 | "metadata": {}, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "collapsed": false, 99 | "input": [ 100 | "%time c = add_vectors_fast(a, b)" 101 | ], 102 | "language": "python", 103 | "metadata": {}, 104 | "outputs": [] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "collapsed": false, 109 | "input": [ 110 | "%%cython\n", 111 | "\n", 112 | "cimport cython\n", 113 | "cimport numpy as np\n", 114 | "\n", 115 | "#@cython.boundscheck(False)\n", 116 | "#@cython.wraparound(False)\n", 117 | "def add_vectors_fast2(double[:] first, double[:] second):\n", 118 | " cdef np.ndarray[double, ndim = 1, mode = \"c\"] a = np.ndarray(first.shape[0])\n", 119 | " #cdef int i\n", 120 | " for i in range(len(first)):\n", 121 | " a[i] = first[i] + second[i]\n", 122 | " return a\n" 123 | ], 124 | "language": "python", 125 | "metadata": {}, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "collapsed": false, 131 | "input": [ 132 | "%time c = add_vectors_fast2(a, b)" 133 | ], 134 | "language": "python", 135 | "metadata": {}, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "collapsed": false, 141 | "input": [], 142 | "language": "python", 143 | "metadata": {}, 144 | "outputs": [] 145 | } 146 | ], 147 | "metadata": {} 148 | } 149 | ] 150 | } -------------------------------------------------------------------------------- /Chapter 3 - Model Selection; Overfitting and Generalization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Choosing the right complexity for a model\n", 15 | 
"===============================================" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "\n", 26 | "from sklearn.datasets import load_iris\n", 27 | "from sklearn.cross_validation import train_test_split\n", 28 | "\n", 29 | "\n", 30 | "iris = load_iris()\n", 31 | "X = iris.data\n", 32 | "y = iris.target\n", 33 | "\n", 34 | "\n", 35 | "# dataset for decision function visualization\n", 36 | "X_2d = X[:, :2]\n", 37 | "X_2d = X_2d[y > 0]\n", 38 | "y_2d = y[y > 0]\n", 39 | "y_2d -= 1\n", 40 | "\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(X_2d, y_2d)" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "collapsed": false, 50 | "input": [ 51 | "%matplotlib inline\n", 52 | "plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)" 53 | ], 54 | "language": "python", 55 | "metadata": {}, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "collapsed": false, 61 | "input": [ 62 | "def show_decision_function(clf, ax):\n", 63 | " xx, yy = np.meshgrid(np.linspace(4.5, 8, 200), np.linspace(1.5, 4.0, 200))\n", 64 | " try:\n", 65 | " Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n", 66 | " except AttributeError:\n", 67 | " Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 0]\n", 68 | "\n", 69 | " Z = Z.reshape(xx.shape)\n", 70 | " ax.pcolormesh(xx, yy, Z, cmap=plt.cm.jet)\n", 71 | " ax.set_xlim(4.5, 8)\n", 72 | " ax.set_ylim(1.5, 4.0)\n", 73 | " ax.set_xticks(())\n", 74 | " ax.set_yticks(())\n", 75 | " ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "collapsed": false, 84 | "input": [ 85 | "from sklearn.svm import SVC\n", 86 | "\n", 87 | "training_scores = []\n", 88 | "test_scores = []\n", 89 | 
"fig, axes = plt.subplots(2, 3, figsize=(20, 10))\n", 90 | "Cs = [0.01, 0.1, 1, 10, 100, 1000]\n", 91 | "\n", 92 | "for C, ax in zip(Cs, axes.ravel()):\n", 93 | " clf = SVC(gamma=10, C=C)\n", 94 | " clf.fit(X_train, y_train)\n", 95 | " training_scores.append(clf.score(X_train, y_train))\n", 96 | " test_scores.append(clf.score(X_test, y_test))\n", 97 | " show_decision_function(clf, ax)" 98 | ], 99 | "language": "python", 100 | "metadata": {}, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "collapsed": false, 106 | "input": [ 107 | "plt.figure(figsize=(20, 10))\n", 108 | "plt.plot(training_scores, label=\"training scores\")\n", 109 | "plt.plot(test_scores, label=\"test scores\")\n", 110 | "plt.legend(loc=\"best\")\n", 111 | "plt.xticks(range(6), Cs)" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Tasks\n", 122 | "======\n", 123 | "1. Play with the ``n_neighbors`` parameter of ``KNeighborsClassifier`` on the digits dataset. Compare training set and test set performance to see how it is related to complexity." 
124 | ] 125 | } 126 | ], 127 | "metadata": {} 128 | } 129 | ] 130 | } -------------------------------------------------------------------------------- /Chapter 2.5 - Data visualization .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "from sklearn.datasets import load_digits\n", 15 | "digits = load_digits()\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "%matplotlib inline" 18 | ], 19 | "language": "python", 20 | "metadata": {}, 21 | "outputs": [] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Dimensionality reduction and manifold learning\n", 28 | "==================================================" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "from sklearn.decomposition import PCA" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "collapsed": false, 44 | "input": [ 45 | "pca = PCA(n_components=2)" 46 | ], 47 | "language": "python", 48 | "metadata": {}, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "pca.fit(digits.data)" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "collapsed": false, 64 | "input": [ 65 | "digits_pca = pca.transform(digits.data)" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "collapsed": false, 74 | "input": [ 75 | "digits_pca.shape" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "collapsed": false, 84 | "input": [ 85 | "plt.scatter(digits_pca[:, 0], 
digits_pca[:, 1], c=digits.target)" 86 | ], 87 | "language": "python", 88 | "metadata": {}, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "collapsed": false, 94 | "input": [ 95 | "plt.matshow(pca.mean_.reshape(8, 8))\n", 96 | "plt.matshow(pca.components_[0].reshape(8, 8))\n", 97 | "plt.matshow(pca.components_[1].reshape(8, 8))" 98 | ], 99 | "language": "python", 100 | "metadata": {}, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Manifold Learning\n", 108 | "==================" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "collapsed": false, 114 | "input": [ 115 | "from sklearn.manifold import SpectralEmbedding" 116 | ], 117 | "language": "python", 118 | "metadata": {}, 119 | "outputs": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "collapsed": false, 124 | "input": [ 125 | "se = SpectralEmbedding()\n", 126 | "digits_se = se.fit_transform(digits.data)" 127 | ], 128 | "language": "python", 129 | "metadata": {}, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "collapsed": false, 135 | "input": [ 136 | "plt.scatter(digits_se[:, 0], digits_se[:, 1], c=digits.target)" 137 | ], 138 | "language": "python", 139 | "metadata": {}, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "Tasks\n", 147 | "======\n", 148 | "1. Compare the projection to the principal components to projecting to two rows of the original data.\n", 149 | "2. Play with the ``n_neighbors`` parameter of Spectral Embedding. How does that change the outcome?\n", 150 | "3. Extract more components from the digits data using PCA. How can you visualize them?" 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "collapsed": false, 156 | "input": [], 157 | "language": "python", 158 | "metadata": {}, 159 | "outputs": [] 160 | } 161 | ], 162 | "metadata": {} 163 | } 164 | ] 165 | } -------------------------------------------------------------------------------- /Chapter 4 - Cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | "Cross-validation\n", 16 | "===================\n", 17 | "What is cross-validation?\n", 18 | "--------------------------\n", 19 | "* A robust way to evaluate predictive accuracy.\n", 20 | "* Gives mean and standard deviation.\n", 21 | "* Makes good use of all the data." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "collapsed": false, 27 | "input": [ 28 | "from sklearn.cross_validation import KFold\n", 29 | "n_samples = 200\n", 30 | "cv = KFold(n=n_samples, n_folds=5)" 31 | ], 32 | "language": "python", 33 | "metadata": {}, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "collapsed": false, 39 | "input": [ 40 | "%matplotlib inline\n", 41 | "import numpy as np\n", 42 | "import matplotlib.pyplot as plt\n", 43 | "\n", 44 | "for training_set, test_set in cv:\n", 45 | " plt.figure(figsize=(20,1))\n", 46 | " plt.plot(training_set, np.ones(len(training_set)), \"o\", color='blue', label=\"training set\")\n", 47 | " plt.plot(test_set, np.ones(len(test_set)), \"o\", color='red', label=\"test set\")\n", 48 | " plt.legend(loc=\"best\")\n", 49 | " plt.axis(\"off\")" 50 | ], 51 | "language": "python", 52 | "metadata": {}, 53 | "outputs": [] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Using cross-validation in scikit-learn\n", 60 | "----------------------------------------" 
61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "collapsed": false, 66 | "input": [ 67 | "from sklearn.cross_validation import cross_val_score, train_test_split" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "collapsed": false, 76 | "input": [ 77 | "from sklearn.datasets import load_digits" 78 | ], 79 | "language": "python", 80 | "metadata": {}, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "collapsed": false, 86 | "input": [ 87 | "digits = load_digits()" 88 | ], 89 | "language": "python", 90 | "metadata": {}, 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "collapsed": false, 96 | "input": [ 97 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 98 | ], 99 | "language": "python", 100 | "metadata": {}, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "collapsed": false, 106 | "input": [ 107 | "from sklearn.svm import SVC" 108 | ], 109 | "language": "python", 110 | "metadata": {}, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "collapsed": false, 116 | "input": [ 117 | "cross_val_score(SVC(C=1), X_train, y_train, cv=3)" 118 | ], 119 | "language": "python", 120 | "metadata": {}, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "collapsed": false, 126 | "input": [ 127 | "cross_val_score(SVC(C=10), X_train, y_train, cv=3, scoring=\"f1\")" 128 | ], 129 | "language": "python", 130 | "metadata": {}, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "Let's go to a binary task for a moment (even vs. odd)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "collapsed": false, 143 | "input": [ 144 | "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3)" 145 | ], 146 | "language": "python", 147 | "metadata": {}, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "collapsed": 
false, 153 | "input": [ 154 | "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"average_precision\")" 155 | ], 156 | "language": "python", 157 | "metadata": {}, 158 | "outputs": [] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "collapsed": false, 163 | "input": [ 164 | "cross_val_score(SVC(C=10), X_train, y_train % 2, cv=3, scoring=\"roc_auc\")" 165 | ], 166 | "language": "python", 167 | "metadata": {}, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "There are other ways to do cross-validation" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "collapsed": false, 180 | "input": [ 181 | "from sklearn.cross_validation import ShuffleSplit\n", 182 | "cross_val_score(SVC(C=10), X_train, y_train, cv=ShuffleSplit(len(X_train), 10, test_size=.4))" 183 | ], 184 | "language": "python", 185 | "metadata": {}, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "Tasks\n", 193 | "======\n", 194 | "1. Select a good ``gamma`` and ``C`` for SVC on ``digits`` using cross-validation.\n", 195 | "2. Validate your findings on the test set." 
196 | ] 197 | } 198 | ], 199 | "metadata": {} 200 | } 201 | ] 202 | } -------------------------------------------------------------------------------- /Chapter 0 - Hello World.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Get me at http://tinyurl.com/sklcds\n", 15 | "========================================" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "The IPython Notebook!\n", 23 | "========================\n", 24 | "Press shift + enter to run a cell.\n", 25 | "\n", 26 | "You can go back to previous cells, change them and re-run them." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "collapsed": false, 32 | "input": [ 33 | "print(\"Hello World\")" 34 | ], 35 | "language": "python", 36 | "metadata": {}, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "collapsed": false, 42 | "input": [ 43 | "X = 112" 44 | ], 45 | "language": "python", 46 | "metadata": {}, 47 | "outputs": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "collapsed": false, 52 | "input": [ 53 | "print(X)" 54 | ], 55 | "language": "python", 56 | "metadata": {}, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "collapsed": false, 62 | "input": [ 63 | "range(10)" 64 | ], 65 | "language": "python", 66 | "metadata": {}, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | " IPython notebook allows tab-completion and shows docstrings (by pressing tab [shift-tab in latest versions] after the opening parenthesis), or using ?\n", 74 | " " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "range?" 
82 | ], 83 | "language": "python", 84 | "metadata": {}, 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Cells can be arbitrarily long or short, and can define functions that will be available in other cells." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "collapsed": false, 97 | "input": [ 98 | "def fib(n):\n", 99 | " if n in [0, 1]:\n", 100 | " return n\n", 101 | " return fib(n - 1) + fib(n - 2)\n", 102 | "\n", 103 | "for x in range(5):\n", 104 | " print(fib(x))" 105 | ], 106 | "language": "python", 107 | "metadata": {}, 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "Numpy\n", 115 | "======\n", 116 | "Numpy arrays are the most common numeric data type.\n", 117 | "\n", 118 | "As in other environments, it is very beneficial to vectorize your code (array-based computing) to make use of fast C and Fortran implementations." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "collapsed": false, 124 | "input": [ 125 | "import numpy as np\n", 126 | "np.ones(10)" 127 | ], 128 | "language": "python", 129 | "metadata": {}, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "collapsed": false, 135 | "input": [ 136 | "np.ones((10, 10))" 137 | ], 138 | "language": "python", 139 | "metadata": {}, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "collapsed": false, 145 | "input": [ 146 | "np.arange(10)" 147 | ], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Numpy allows *broadcasting* over rows, leading to practical short-hand notations." 
157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "collapsed": false, 162 | "input": [ 163 | "X = np.ones((10, 10)) + np.array([3, 5, 1, 10, 6, 12, 98, 1, 0, 3])\n", 164 | "print(X)" 165 | ], 166 | "language": "python", 167 | "metadata": {}, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "Most libraries in Python use object oriented interfaces." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "collapsed": false, 180 | "input": [ 181 | "X.mean(axis=0)" 182 | ], 183 | "language": "python", 184 | "metadata": {}, 185 | "outputs": [] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Numpy has all standard array functions, linear algebra, and *fancy indexing*." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "collapsed": false, 197 | "input": [ 198 | "X[:3, 1:4]" 199 | ], 200 | "language": "python", 201 | "metadata": {}, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "collapsed": false, 207 | "input": [ 208 | "X[:, ::2]" 209 | ], 210 | "language": "python", 211 | "metadata": {}, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "collapsed": false, 217 | "input": [ 218 | "X = np.random.randint(10, size=(32, 103))\n", 219 | "X" 220 | ], 221 | "language": "python", 222 | "metadata": {}, 223 | "outputs": [] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "collapsed": false, 228 | "input": [ 229 | "X[np.random.randint(32, size=10)]" 230 | ], 231 | "language": "python", 232 | "metadata": {}, 233 | "outputs": [] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "collapsed": false, 238 | "input": [ 239 | "X[np.random.randint(32, size=10)].shape" 240 | ], 241 | "language": "python", 242 | "metadata": {}, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | 250 | "Matplotlib\n", 251 | "=============\n", 252 | "For all of your plotting 
needs!" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "Enable in-line plotting (can be done in config file)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "collapsed": false, 265 | "input": [ 266 | "%matplotlib inline\n", 267 | "import matplotlib.pyplot as plt\n", 268 | "import numpy as np" 269 | ], 270 | "language": "python", 271 | "metadata": {}, 272 | "outputs": [] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "collapsed": false, 277 | "input": [ 278 | "plt.plot(np.random.uniform(size=10))" 279 | ], 280 | "language": "python", 281 | "metadata": {}, 282 | "outputs": [] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "collapsed": false, 287 | "input": [ 288 | "plt.bar(np.arange(10), np.random.uniform(size=10))" 289 | ], 290 | "language": "python", 291 | "metadata": {}, 292 | "outputs": [] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "collapsed": false, 297 | "input": [ 298 | "plt.hist(np.random.normal(size=1000))" 299 | ], 300 | "language": "python", 301 | "metadata": {}, 302 | "outputs": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "collapsed": false, 307 | "input": [ 308 | "x, y = np.random.uniform(size=(2, 10))\n", 309 | "plt.scatter(x, y, marker=\"x\")" 310 | ], 311 | "language": "python", 312 | "metadata": {}, 313 | "outputs": [] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "collapsed": false, 318 | "input": [ 319 | "print(np.eye(5))\n", 320 | "plt.matshow(np.eye(5))" 321 | ], 322 | "language": "python", 323 | "metadata": {}, 324 | "outputs": [] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Tasks\n", 331 | "======\n", 332 | "\n", 333 | "1. Plot the function ``f(x) = x ** 2`` using lines. How can you get a smooth plot?\n", 334 | "2. Visualize a two-dimensional gaussian distribution from samples. How does your approach look with very many samples?" 
335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "collapsed": false, 340 | "input": [], 341 | "language": "python", 342 | "metadata": {}, 343 | "outputs": [] 344 | } 345 | ], 346 | "metadata": {} 347 | } 348 | ] 349 | } 350 | -------------------------------------------------------------------------------- /Chapter 5 - Model Selection And Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Model Selection\n", 15 | "=================" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Grid-Search with built-in cross-validation" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "collapsed": false, 28 | "input": [ 29 | "from sklearn.grid_search import GridSearchCV" 30 | ], 31 | "language": "python", 32 | "metadata": {}, 33 | "outputs": [] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Define parameter grid:" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "collapsed": false, 45 | "input": [ 46 | "import numpy as np\n", 47 | "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-3, 3)}\n", 48 | "print(param_grid)" 49 | ], 50 | "language": "python", 51 | "metadata": {}, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "collapsed": false, 57 | "input": [ 58 | "from sklearn.svm import SVC\n", 59 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "A GridSearchCV object behaves just like a normal classifier." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "collapsed": false, 75 | "input": [ 76 | "from sklearn.datasets import load_digits\n", 77 | "from sklearn.cross_validation import train_test_split\n", 78 | "digits = load_digits()\n", 79 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 80 | ], 81 | "language": "python", 82 | "metadata": {}, 83 | "outputs": [] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "collapsed": false, 88 | "input": [ 89 | "grid_search.fit(X_train, y_train)" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "collapsed": false, 98 | "input": [ 99 | "# We extract just the scores\n", 100 | "%matplotlib inline\n", 101 | "import matplotlib.pyplot as plt\n", 102 | "\n", 103 | "scores = [x[1] for x in grid_search.grid_scores_]\n", 104 | "scores = np.array(scores).reshape(6, 6)\n", 105 | "\n", 106 | "plt.matshow(scores)\n", 107 | "plt.xlabel('gamma')\n", 108 | "plt.ylabel('C')\n", 109 | "plt.colorbar()\n", 110 | "plt.xticks(np.arange(6), param_grid['gamma'])\n", 111 | "plt.yticks(np.arange(6), param_grid['C'])" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "collapsed": false, 120 | "input": [ 121 | "grid_search.best_params_" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "grid_search.predict(X_test)" 132 | ], 133 | "language": "python", 134 | "metadata": {}, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [ 141 | "grid_search.score(X_test, y_test)" 142 | ], 143 | "language": "python", 144 | "metadata": {}, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Preprocessing and Pipelines\n", 152 | 
"=============================" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "collapsed": false, 158 | "input": [ 159 | "from sklearn.preprocessing import StandardScaler" 160 | ], 161 | "language": "python", 162 | "metadata": {}, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "Same interface as always." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "scaler = StandardScaler()" 177 | ], 178 | "language": "python", 179 | "metadata": {}, 180 | "outputs": [] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "collapsed": false, 185 | "input": [ 186 | "scaler.fit(X_train)" 187 | ], 188 | "language": "python", 189 | "metadata": {}, 190 | "outputs": [] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "collapsed": false, 195 | "input": [ 196 | "scaler.transform(X_train).mean(axis=0)" 197 | ], 198 | "language": "python", 199 | "metadata": {}, 200 | "outputs": [] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "collapsed": false, 205 | "input": [ 206 | "scaler.transform(X_train).std(axis=0)" 207 | ], 208 | "language": "python", 209 | "metadata": {}, 210 | "outputs": [] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "For cross-validation, we need to estimate mean and standard deviation separately for each fold.\n", 217 | "To do that, we build a pipeline." 
218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "collapsed": false, 223 | "input": [ 224 | "from sklearn.pipeline import Pipeline" 225 | ], 226 | "language": "python", 227 | "metadata": {}, 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "collapsed": false, 233 | "input": [ 234 | "pipeline = Pipeline([(\"scaler\", scaler), (\"svm\", SVC())])" 235 | ], 236 | "language": "python", 237 | "metadata": {}, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "collapsed": false, 243 | "input": [ 244 | "pipeline.fit(X_train, y_train)" 245 | ], 246 | "language": "python", 247 | "metadata": {}, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "collapsed": false, 253 | "input": [ 254 | "pipeline.predict(X_train)" 255 | ], 256 | "language": "python", 257 | "metadata": {}, 258 | "outputs": [] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Cross-validation with a pipeline\n", 265 | "---------------------------------" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "collapsed": false, 271 | "input": [ 272 | "from sklearn.cross_validation import cross_val_score\n", 273 | "cross_val_score(pipeline, X_train, y_train)" 274 | ], 275 | "language": "python", 276 | "metadata": {}, 277 | "outputs": [] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "So, yeah, don't forget the preprocessing." 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "collapsed": false, 289 | "input": [ 290 | "param_grid_pipeline = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma' : 10. 
** np.arange(-3, 3)}\n", 291 | "\n", 292 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid_pipeline, verbose=3)" 293 | ], 294 | "language": "python", 295 | "metadata": {}, 296 | "outputs": [] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "collapsed": false, 301 | "input": [ 302 | "grid_pipeline.fit(X_train, y_train)" 303 | ], 304 | "language": "python", 305 | "metadata": {}, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "collapsed": false, 311 | "input": [ 312 | "# We extract just the scores\n", 313 | "scores = [x[1] for x in grid_pipeline.grid_scores_]\n", 314 | "scores = np.array(scores).reshape(6, 6)\n", 315 | "\n", 316 | "plt.matshow(scores)\n", 317 | "plt.xlabel('gamma')\n", 318 | "plt.ylabel('C')\n", 319 | "plt.colorbar()\n", 320 | "plt.xticks(np.arange(6), param_grid_pipeline['svm__gamma'])  # tick labels come from the grid this search actually used\n", 321 | "plt.yticks(np.arange(6), param_grid_pipeline['svm__C'])" 322 | ], 323 | "language": "python", 324 | "metadata": {}, 325 | "outputs": [] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "collapsed": false, 330 | "input": [ 331 | "grid_pipeline.score(X_test, y_test)" 332 | ], 333 | "language": "python", 334 | "metadata": {}, 335 | "outputs": [] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Randomized Searching\n", 342 | "======================" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "collapsed": false, 348 | "input": [ 349 | "from sklearn.grid_search import RandomizedSearchCV" 350 | ], 351 | "language": "python", 352 | "metadata": {}, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "collapsed": false, 358 | "input": [ 359 | "from scipy.stats import expon" 360 | ], 361 | "language": "python", 362 | "metadata": {}, 363 | "outputs": [] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "collapsed": false, 368 | "input": [ 369 | "plt.hist(expon.rvs(size=1000))" 370 | ], 371 | "language": "python", 372 | "metadata": {}, 373 | "outputs": [] 374 | }, 
375 | { 376 | "cell_type": "code", 377 | "collapsed": false, 378 | "input": [ 379 | "params = {'C': expon(), 'gamma': expon()}\n", 380 | "rs = RandomizedSearchCV(SVC(), param_distributions=params, n_iter=50, verbose=3)" 381 | ], 382 | "language": "python", 383 | "metadata": {}, 384 | "outputs": [] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "collapsed": false, 389 | "input": [ 390 | "rs.fit(X_train, y_train)" 391 | ], 392 | "language": "python", 393 | "metadata": {}, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "collapsed": false, 399 | "input": [ 400 | "rs.best_params_" 401 | ], 402 | "language": "python", 403 | "metadata": {}, 404 | "outputs": [] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "collapsed": false, 409 | "input": [ 410 | "rs.best_score_" 411 | ], 412 | "language": "python", 413 | "metadata": {}, 414 | "outputs": [] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "collapsed": false, 419 | "input": [ 420 | "scores, Cs, gammas = zip(*[(score.mean_validation_score, score.parameters['C'], score.parameters['gamma']) for score in rs.grid_scores_])" 421 | ], 422 | "language": "python", 423 | "metadata": {}, 424 | "outputs": [] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "collapsed": false, 429 | "input": [ 430 | "plt.scatter(Cs, gammas, s=40, c=scores)\n", 431 | "plt.xlabel(\"C\")\n", 432 | "plt.ylabel(\"gamma\")" 433 | ], 434 | "language": "python", 435 | "metadata": {}, 436 | "outputs": [] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "Tasks\n", 443 | "=====\n", 444 | "1. Do grid-search over a pipeline consisting of the KBest feature selection and an rbf SVM on iris." 
445 | ] 446 | } 447 | ], 448 | "metadata": {} 449 | } 450 | ] 451 | } -------------------------------------------------------------------------------- /Chapter 6 - Working With Text Data..ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import pandas as pd\n", 15 | "data = pd.read_csv(\"train.csv\")" 16 | ], 17 | "language": "python", 18 | "metadata": {}, 19 | "outputs": [] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "collapsed": false, 24 | "input": [ 25 | "len(data)" 26 | ], 27 | "language": "python", 28 | "metadata": {}, 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "data.columns" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "collapsed": false, 44 | "input": [ 45 | "data.Insult.value_counts()" 46 | ], 47 | "language": "python", 48 | "metadata": {}, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [], 55 | "language": "python", 56 | "metadata": {}, 57 | "outputs": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "collapsed": false, 62 | "input": [ 63 | "import numpy as np\n", 64 | "y_train = np.array(data.Insult)" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "collapsed": false, 73 | "input": [ 74 | "y_train" 75 | ], 76 | "language": "python", 77 | "metadata": {}, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "collapsed": false, 83 | "input": [ 84 | "text_train = data.Comment.tolist()" 85 | ], 86 | "language": "python", 87 | "metadata": {}, 88 | "outputs": [] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "collapsed": false, 93 | "input": [ 
94 | "data_test = pd.read_csv(\"test_with_solutions.csv\")" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "collapsed": false, 103 | "input": [ 104 | "data_test" 105 | ], 106 | "language": "python", 107 | "metadata": {}, 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "collapsed": false, 113 | "input": [ 114 | "text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)" 115 | ], 116 | "language": "python", 117 | "metadata": {}, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "collapsed": false, 123 | "input": [ 124 | "from sklearn.feature_extraction.text import CountVectorizer" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "collapsed": false, 133 | "input": [ 134 | "cv = CountVectorizer()\n", 135 | "cv.fit(text_train)" 136 | ], 137 | "language": "python", 138 | "metadata": {}, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "collapsed": false, 144 | "input": [ 145 | "len(cv.vocabulary_)" 146 | ], 147 | "language": "python", 148 | "metadata": {}, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "collapsed": true, 154 | "input": [ 155 | "cv.vocabulary_" 156 | ], 157 | "language": "python", 158 | "metadata": {}, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "X_train = cv.transform(text_train)" 166 | ], 167 | "language": "python", 168 | "metadata": {}, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "collapsed": false, 174 | "input": [ 175 | "X_train.shape" 176 | ], 177 | "language": "python", 178 | "metadata": {}, 179 | "outputs": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "collapsed": false, 184 | "input": [ 185 | "text_train[6]" 186 | ], 187 | "language": "python", 188 | "metadata": {}, 189 | "outputs": [] 190 
| }, 191 | { 192 | "cell_type": "code", 193 | "collapsed": false, 194 | "input": [ 195 | "X_train[6, :].nonzero()" 196 | ], 197 | "language": "python", 198 | "metadata": {}, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "collapsed": false, 204 | "input": [ 205 | "X_train[6]" 206 | ], 207 | "language": "python", 208 | "metadata": {}, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "collapsed": false, 214 | "input": [ 215 | "X_test = cv.transform(text_test)" 216 | ], 217 | "language": "python", 218 | "metadata": {}, 219 | "outputs": [] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "collapsed": false, 224 | "input": [ 225 | "from sklearn.svm import LinearSVC\n", 226 | "svm = LinearSVC(C=.01)" 227 | ], 228 | "language": "python", 229 | "metadata": {}, 230 | "outputs": [] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "collapsed": false, 235 | "input": [ 236 | "svm.fit(X_train, y_train)" 237 | ], 238 | "language": "python", 239 | "metadata": {}, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "collapsed": false, 245 | "input": [ 246 | "svm.score(X_train, y_train)" 247 | ], 248 | "language": "python", 249 | "metadata": {}, 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "collapsed": false, 255 | "input": [ 256 | "svm.score(X_test, y_test)" 257 | ], 258 | "language": "python", 259 | "metadata": {}, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "collapsed": false, 265 | "input": [ 266 | "y_test_pred = svm.predict(X_test)" 267 | ], 268 | "language": "python", 269 | "metadata": {}, 270 | "outputs": [] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "collapsed": false, 275 | "input": [ 276 | "from sklearn.metrics import classification_report" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "collapsed": false, 285 | "input": [ 286 | "print(classification_report(y_test, 
y_test_pred))" 287 | ], 288 | "language": "python", 289 | "metadata": {}, 290 | "outputs": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "collapsed": false, 295 | "input": [ 296 | "coef = svm.coef_.ravel()\n", 297 | "positive_coefficients = np.argsort(coef)[-25:]\n", 298 | "negative_coefficients = np.argsort(coef)[:25]\n", 299 | "interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n" 300 | ], 301 | "language": "python", 302 | "metadata": {}, 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "collapsed": false, 308 | "input": [ 309 | "%matplotlib inline\n", 310 | "import matplotlib.pyplot as plt\n", 311 | "\n", 312 | "plt.figure(figsize=(15, 5))\n", 313 | "plt.bar(np.arange(50), coef[interesting_coefficients], color=[\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]])\n", 314 | "feature_names = np.array(cv.get_feature_names())\n", 315 | "plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");" 316 | ], 317 | "language": "python", 318 | "metadata": {}, 319 | "outputs": [] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "collapsed": false, 324 | "input": [ 325 | "from sklearn.pipeline import Pipeline" 326 | ], 327 | "language": "python", 328 | "metadata": {}, 329 | "outputs": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "collapsed": false, 334 | "input": [ 335 | "pipeline = Pipeline([('vectorizer', cv), ('classifier', svm)])" 336 | ], 337 | "language": "python", 338 | "metadata": {}, 339 | "outputs": [] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "collapsed": false, 344 | "input": [ 345 | "pipeline.fit(text_train, y_train)" 346 | ], 347 | "language": "python", 348 | "metadata": {}, 349 | "outputs": [] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "collapsed": false, 354 | "input": [ 355 | "pipeline.score(text_train, y_train)" 356 | ], 357 | "language": "python", 358 | "metadata": {}, 359 | "outputs": [] 360 | }, 361 | { 362 | 
"cell_type": "code", 363 | "collapsed": false, 364 | "input": [ 365 | "pipeline.score(text_test, y_test)" 366 | ], 367 | "language": "python", 368 | "metadata": {}, 369 | "outputs": [] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "collapsed": false, 374 | "input": [ 375 | "from sklearn.grid_search import GridSearchCV" 376 | ], 377 | "language": "python", 378 | "metadata": {}, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "collapsed": false, 384 | "input": [ 385 | "param_grid = {'classifier__C': 10. ** np.arange(-3, 3)}\n", 386 | "grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)" 387 | ], 388 | "language": "python", 389 | "metadata": {}, 390 | "outputs": [] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "collapsed": false, 395 | "input": [ 396 | "grid_search.fit(text_train, y_train)" 397 | ], 398 | "language": "python", 399 | "metadata": {}, 400 | "outputs": [] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "collapsed": false, 405 | "input": [ 406 | "grid_search.best_score_" 407 | ], 408 | "language": "python", 409 | "metadata": {}, 410 | "outputs": [] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "collapsed": false, 415 | "input": [ 416 | "grid_search.best_params_" 417 | ], 418 | "language": "python", 419 | "metadata": {}, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "collapsed": false, 425 | "input": [ 426 | "param_grid = {'classifier__C': 10. 
** np.arange(-3, 3), \"vectorizer__ngram_range\": [(1, 1), (1, 2), (1, 3), (2, 3), (2, 2)]}\n", 427 | "grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)" 428 | ], 429 | "language": "python", 430 | "metadata": {}, 431 | "outputs": [] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "collapsed": false, 436 | "input": [ 437 | "grid_search.fit(text_train, y_train)" 438 | ], 439 | "language": "python", 440 | "metadata": {}, 441 | "outputs": [] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "collapsed": false, 446 | "input": [ 447 | "grid_search.best_params_" 448 | ], 449 | "language": "python", 450 | "metadata": {}, 451 | "outputs": [] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "collapsed": false, 456 | "input": [ 457 | "grid_search.best_score_" 458 | ], 459 | "language": "python", 460 | "metadata": {}, 461 | "outputs": [] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "Tasks\n", 468 | "======\n", 469 | "1. Remove the above visualization code for the coefficients and try to recreate it.\n", 470 | "2. Can you think of any other useful features for this task?" 471 | ] 472 | } 473 | ], 474 | "metadata": {} 475 | } 476 | ] 477 | } -------------------------------------------------------------------------------- /Chapter 2 - Introduction to Scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Scikit-learn\n", 15 | "=============\n", 16 | "Machine learning for the masses!" 
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "What?\n", 24 | "------\n", 25 | "**Algorithms**\n", 26 | "\n", 27 | "- Classification\n", 28 | "- Regression\n", 29 | "- Dimensionality reduction\n", 30 | "- Manifold learning\n", 31 | "- Feature selection\n", 32 | "- Semisupervised learning\n", 33 | "- Clustering\n", 34 | "\n", 35 | "**Tools**\n", 36 | "\n", 37 | "- Preprocessing\n", 38 | "- Pipelining\n", 39 | "- Model evaluation\n", 40 | "- Model selection\n", 41 | "\n", 42 | "**Features**\n", 43 | "\n", 44 | "- Sparse data\n", 45 | "- Dense data\n", 46 | "- Multi-core\n", 47 | "- Out-of-core\n", 48 | "- Cloud tools available" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Get some data to play with" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "collapsed": false, 61 | "input": [ 62 | "from sklearn.datasets import load_digits\n", 63 | "digits = load_digits()\n", 64 | "digits.keys()" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "collapsed": false, 73 | "input": [ 74 | "digits.images.shape" 75 | ], 76 | "language": "python", 77 | "metadata": {}, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "collapsed": false, 83 | "input": [ 84 | "digits.data.shape" 85 | ], 86 | "language": "python", 87 | "metadata": {}, 88 | "outputs": [] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "collapsed": false, 93 | "input": [ 94 | "digits.target.shape" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "print(digits.images[0])\n" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | 
"outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "collapsed": false, 120 | "input": [ 121 | "print(digits.target[0])" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "import matplotlib.pyplot as plt\n", 132 | "%matplotlib inline\n", 133 | "\n", 134 | "plt.matshow(digits.images[0], cmap=plt.cm.Greys)" 135 | ], 136 | "language": "python", 137 | "metadata": {}, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "Split the data to get going" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "collapsed": false, 150 | "input": [ 151 | "from sklearn.cross_validation import train_test_split\n", 152 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 153 | ], 154 | "language": "python", 155 | "metadata": {}, 156 | "outputs": [] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Really Simple API\n", 163 | "-------------------\n", 164 | "1) Import your model class" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "collapsed": false, 170 | "input": [ 171 | "from sklearn.svm import LinearSVC" 172 | ], 173 | "language": "python", 174 | "metadata": {}, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "2) Instantiate an object and set the parameters" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "collapsed": false, 187 | "input": [ 188 | "svm = LinearSVC(C=0.1)" 189 | ], 190 | "language": "python", 191 | "metadata": {}, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "3) Fit the model" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "collapsed": false, 204 | "input": [ 205 | "svm.fit(X_train, y_train)" 206 | ], 207 | "language": "python", 
208 | "metadata": {}, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "4) Apply / evaluate" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "collapsed": false, 221 | "input": [ 222 | "print(svm.predict(X_train))\n", 223 | "print(y_train)" 224 | ], 225 | "language": "python", 226 | "metadata": {}, 227 | "outputs": [] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "collapsed": false, 232 | "input": [ 233 | "svm.score(X_train, y_train)" 234 | ], 235 | "language": "python", 236 | "metadata": {}, 237 | "outputs": [] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "collapsed": false, 242 | "input": [ 243 | "svm.score(X_test, y_test)" 244 | ], 245 | "language": "python", 246 | "metadata": {}, 247 | "outputs": [] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "And again\n", 254 | "---------" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "collapsed": false, 260 | "input": [ 261 | "from sklearn.ensemble import RandomForestClassifier" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "collapsed": false, 270 | "input": [ 271 | "rf = RandomForestClassifier(n_estimators=50)" 272 | ], 273 | "language": "python", 274 | "metadata": {}, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "collapsed": false, 280 | "input": [ 281 | "rf.fit(X_train, y_train)" 282 | ], 283 | "language": "python", 284 | "metadata": {}, 285 | "outputs": [] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "collapsed": false, 290 | "input": [ 291 | "rf.score(X_train, y_train)" 292 | ], 293 | "language": "python", 294 | "metadata": {}, 295 | "outputs": [] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "collapsed": false, 300 | "input": [ 301 | "rf.score(X_test, y_test)" 302 | ], 303 | "language": "python", 304 | "metadata": {}, 305 | "outputs": [] 306 | }, 307 | { 308 | 
"cell_type": "code", 309 | "collapsed": false, 310 | "input": [ 311 | "#%load from github" 312 | ], 313 | "language": "python", 314 | "metadata": {}, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "collapsed": false, 320 | "input": [ 321 | "#!/usr/bin/python\n", 322 | "\n", 323 | "\"\"\"\n", 324 | "=====================\n", 325 | "Classifier comparison\n", 326 | "=====================\n", 327 | "\n", 328 | "A comparison of several classifiers in scikit-learn on synthetic datasets.\n", 329 | "The point of this example is to illustrate the nature of decision boundaries\n", 330 | "of different classifiers.\n", 331 | "This should be taken with a grain of salt, as the intuition conveyed by\n", 332 | "these examples does not necessarily carry over to real datasets.\n", 333 | "\n", 334 | "Particularly in high-dimensional spaces, data can more easily be separated\n", 335 | "linearly and the simplicity of classifiers such as naive Bayes and linear SVMs\n", 336 | "might lead to better generalization than is achieved by other classifiers.\n", 337 | "\n", 338 | "The plots show training points in solid colors and testing points\n", 339 | "semi-transparent. 
The lower right shows the classification accuracy on the test\n", 340 | "set.\n", 341 | "\"\"\"\n", 342 | "print(__doc__)\n", 343 | "\n", 344 | "\n", 345 | "# Code source: Ga\u00ebl Varoquaux\n", 346 | "# Andreas M\u00fcller\n", 347 | "# Modified for documentation by Jaques Grobler\n", 348 | "# License: BSD 3 clause\n", 349 | "\n", 350 | "import numpy as np\n", 351 | "import pylab as pl\n", 352 | "from matplotlib.colors import ListedColormap\n", 353 | "from sklearn.cross_validation import train_test_split\n", 354 | "from sklearn.preprocessing import StandardScaler\n", 355 | "from sklearn.datasets import make_moons, make_circles, make_classification\n", 356 | "from sklearn.neighbors import KNeighborsClassifier\n", 357 | "from sklearn.svm import SVC\n", 358 | "from sklearn.tree import DecisionTreeClassifier\n", 359 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", 360 | "from sklearn.naive_bayes import GaussianNB\n", 361 | "from sklearn.lda import LDA\n", 362 | "from sklearn.qda import QDA\n", 363 | "\n", 364 | "h = .02 # step size in the mesh\n", 365 | "\n", 366 | "names = [\"Nearest Neighbors\", \"Linear SVM\", \"RBF SVM\", \"Decision Tree\",\n", 367 | " \"Random Forest\", \"AdaBoost\", \"Naive Bayes\", \"LDA\", \"QDA\"]\n", 368 | "classifiers = [\n", 369 | " KNeighborsClassifier(3),\n", 370 | " SVC(kernel=\"linear\", C=0.025),\n", 371 | " SVC(gamma=2, C=1),\n", 372 | " DecisionTreeClassifier(max_depth=5),\n", 373 | " RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n", 374 | " AdaBoostClassifier(),\n", 375 | " GaussianNB(),\n", 376 | " LDA(),\n", 377 | " QDA()]\n", 378 | "\n", 379 | "X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,\n", 380 | " random_state=1, n_clusters_per_class=1)\n", 381 | "rng = np.random.RandomState(2)\n", 382 | "X += 2 * rng.uniform(size=X.shape)\n", 383 | "linearly_separable = (X, y)\n", 384 | "\n", 385 | "datasets = [make_moons(noise=0.3, random_state=0),\n", 386 | " 
make_circles(noise=0.2, factor=0.5, random_state=1),\n", 387 | " linearly_separable\n", 388 | " ]\n", 389 | "\n", 390 | "figure = pl.figure(figsize=(27, 9))\n", 391 | "i = 1\n", 392 | "# iterate over datasets\n", 393 | "for ds in datasets:\n", 394 | " # preprocess dataset, split into training and test part\n", 395 | " X, y = ds\n", 396 | " X = StandardScaler().fit_transform(X)\n", 397 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)\n", 398 | "\n", 399 | " x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n", 400 | " y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n", 401 | " xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n", 402 | " np.arange(y_min, y_max, h))\n", 403 | "\n", 404 | " # just plot the dataset first\n", 405 | " cm = pl.cm.RdBu\n", 406 | " cm_bright = ListedColormap(['#FF0000', '#0000FF'])\n", 407 | " ax = pl.subplot(len(datasets), len(classifiers) + 1, i)\n", 408 | " # Plot the training points\n", 409 | " ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)\n", 410 | " # and testing points\n", 411 | " ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)\n", 412 | " ax.set_xlim(xx.min(), xx.max())\n", 413 | " ax.set_ylim(yy.min(), yy.max())\n", 414 | " ax.set_xticks(())\n", 415 | " ax.set_yticks(())\n", 416 | " i += 1\n", 417 | "\n", 418 | " # iterate over classifiers\n", 419 | " for name, clf in zip(names, classifiers):\n", 420 | " ax = pl.subplot(len(datasets), len(classifiers) + 1, i)\n", 421 | " clf.fit(X_train, y_train)\n", 422 | " score = clf.score(X_test, y_test)\n", 423 | "\n", 424 | " # Plot the decision boundary. 
For that, we will assign a color to each\n", 425 | " # point in the mesh [x_min, x_max]x[y_min, y_max].\n", 426 | " if hasattr(clf, \"decision_function\"):\n", 427 | " Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n", 428 | " else:\n", 429 | " Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n", 430 | "\n", 431 | " # Put the result into a color plot\n", 432 | " Z = Z.reshape(xx.shape)\n", 433 | " ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)\n", 434 | "\n", 435 | " # Plot also the training points\n", 436 | " ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)\n", 437 | " # and testing points\n", 438 | " ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,\n", 439 | " alpha=0.6)\n", 440 | "\n", 441 | " ax.set_xlim(xx.min(), xx.max())\n", 442 | " ax.set_ylim(yy.min(), yy.max())\n", 443 | " ax.set_xticks(())\n", 444 | " ax.set_yticks(())\n", 445 | " ax.set_title(name)\n", 446 | " ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),\n", 447 | " size=15, horizontalalignment='right')\n", 448 | " i += 1\n", 449 | "\n", 450 | "figure.subplots_adjust(left=.02, right=.98)\n", 451 | "pl.show()\n" 452 | ], 453 | "language": "python", 454 | "metadata": {}, 455 | "outputs": [] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "Tasks\n", 462 | "======\n", 463 | "1. Train a KNeighbors classifier on the digits dataset and compute the test accuracy.\n", 464 | "2. Visualize some of the mistakes." 465 | ] 466 | } 467 | ], 468 | "metadata": {} 469 | } 470 | ] 471 | } --------------------------------------------------------------------------------