├── .gitignore ├── 01 - Intro to Machine Learning.ipynb ├── 02 - First Steps.ipynb ├── 03 - Unsupervised Transformers.ipynb ├── 04 - API Summary.ipynb ├── 05 - Cross-validation.ipynb ├── 06 - Model Complexity.ipynb ├── 07 - Grid Searches for Hyper Parameters.ipynb ├── 08 - Preprocessing and Pipelines.ipynb ├── 09.1 - Linear models.ipynb ├── 09.2 - Support Vector Machines.ipynb ├── 09.3 - Trees and Forests.ipynb ├── 10 - Working With Text Data.ipynb ├── 11 - Out Of Core Learning.ipynb ├── LICENSE ├── README.md ├── data ├── test_with_solutions.csv └── train.csv ├── figures ├── bag_of_words.svg ├── cluster_comparison.png ├── cross_validation.svg ├── data_representation.svg ├── feature_union.svg ├── grid_search_cross_validation.svg ├── hashing_vectorizer.svg ├── overfitting_underfitting_cartoon.svg ├── pipeline.svg ├── pipeline_cross_validation.svg ├── randomized_search.png ├── supervised_workflow.svg ├── train_test_split.svg ├── train_test_split_matrix.svg ├── train_validation_test2.svg └── unsupervised_workflow.svg ├── plots ├── __init__.py ├── plot_2d_separator.py ├── plot_interactive_forest.py ├── plot_interactive_tree.py ├── plot_kneighbors_regularization.py ├── plot_linear_svc_regularization.py └── plot_rbf_svm_parameters.py └── solutions ├── cross_validation_iris.py ├── digits_tsne.py ├── forests.py ├── grid_search_forest.py ├── grid_search_k_neighbors.py ├── linear_models.py ├── load_iris.py ├── pipeline_iris.py ├── svms.py ├── text_pipeline.py ├── train_iris.py └── validation_curve.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | .ipynb_checkpoints/ 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # 
PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | -------------------------------------------------------------------------------- /01 - Intro to Machine Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# What is machine learning ?" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Supervised learning\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "source": [ 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Data Representations" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Dataset Split" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "% matplotlib nbagg\n", 68 | "import matplotlib.pyplot as plt\n", 69 | "import numpy as np" 70 | ] 71 | }, 72 | { 73 | 
"cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from sklearn.datasets import load_digits\n", 81 | "digits = load_digits()\n", 82 | "digits.keys()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "digits.images.shape" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "print(digits.images[0])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "plt.matshow(digits.images[0], cmap=plt.cm.Greys)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "digits.data.shape" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "digits.target.shape" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "digits.target" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Splitting the data:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from 
sklearn.cross_validation import train_test_split\n", 174 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "# Exercises\n", 182 | "\n", 183 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n", 184 | "The function returns a dictionary-like object that has the same attributes as ``digits``.\n", 185 | "\n", 186 | "What is the number of classes, features and data points in this dataset?\n", 187 | "Use a scatterplot to visualize the dataset.\n", 188 | "\n", 189 | "You can look at ``DESCR`` attribute to learn more about the dataset." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "# %load solutions/load_iris.py" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 2", 216 | "language": "python", 217 | "name": "python2" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 2 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython2", 229 | "version": "2.7.9" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 0 234 | } 235 | -------------------------------------------------------------------------------- /02 - First Steps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | 
"%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Get some data to play with" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "digits = load_digits()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.cross_validation import train_test_split\n", 44 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 45 | " digits.target)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Really Simple API\n", 53 | "-------------------\n", 54 | "0) Import your model class" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "from sklearn.svm import LinearSVC" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "1) Instantiate an object and set the parameters" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "svm = LinearSVC()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "2) Fit the model" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "svm.fit(X_train, y_train)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "3) Apply / evaluate" 109 | ] 110 | }, 111 | { 112 | "cell_type": 
"code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "print(svm.predict(X_train))\n", 120 | "print(y_train)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "svm.score(X_train, y_train)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "np.sum(svm.predict(X_train) == y_train) / float(len(X_train))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "svm.score(X_test, y_test)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "And again\n", 161 | "---------" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "from sklearn.ensemble import RandomForestClassifier" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "rf = RandomForestClassifier(n_estimators=50)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "rf.fit(X_train, y_train)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "rf.score(X_test, y_test)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | 
"collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "%load https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/examples/classification/plot_classifier_comparison.py" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "# Exercises\n", 224 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n", 225 | "\n", 226 | "Split it into training and test set using ``train_test_split``.\n", 227 | "Then train and evaluate a classifier of your choice.\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# %load solutions/train_iris.py" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 2", 245 | "language": "python", 246 | "name": "python2" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 2 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython2", 258 | "version": "2.7.9" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 0 263 | } 264 | -------------------------------------------------------------------------------- /03 - Unsupervised Transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": 
false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "from sklearn.cross_validation import train_test_split\n", 33 | "import numpy as np\n", 34 | "np.set_printoptions(suppress=True)\n", 35 | "\n", 36 | "digits = load_digits()\n", 37 | "X, y = digits.data, digits.target\n", 38 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Removing mean and scaling variance\n", 46 | "===================================" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.preprocessing import StandardScaler" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "1) Instantiate the model" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "scaler = StandardScaler()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "2) Fit using only the data." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "scaler.fit(X_train)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "3) `transform` the data (not `predict`)." 
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "X_train_scaled = scaler.transform(X_train)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_train.shape" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "X_train_scaled.shape" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "The transformed version of the data has the mean removed:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "X_train_scaled.mean(axis=0)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "X_train_scaled.std(axis=0)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "X_test_transformed = scaler.transform(X_test)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "Principal Component Analysis\n", 181 | "=============================" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "0) Import the model" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "from sklearn.decomposition import PCA" 200 | ] 201 | }, 202 | { 203 | 
"cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "1) Instantiate the model" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "pca = PCA(n_components=2)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "2) Fit to training data" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "pca.fit(X)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "3) Transform to lower-dimensional representation" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "print(X.shape)\n", 254 | "X_pca = pca.transform(X)\n", 255 | "X_pca.shape" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Visualize\n", 263 | "----------" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": false 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "source": [ 283 | "Manifold Learning\n", 284 | "==================" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "from sklearn.manifold import Isomap\n", 296 | "isomap = Isomap()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [], 306 
| "source": [ 307 | "X_isomap = isomap.fit_transform(X)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "plt.figure()\n", 319 | "plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "source": [ 328 | "# Exercises\n", 329 | "Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).\n" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "# %load solutions/digits_tsne.py" 341 | ] 342 | } 343 | ], 344 | "metadata": { 345 | "kernelspec": { 346 | "display_name": "Python 2", 347 | "language": "python", 348 | "name": "python2" 349 | }, 350 | "language_info": { 351 | "codemirror_mode": { 352 | "name": "ipython", 353 | "version": 2 354 | }, 355 | "file_extension": ".py", 356 | "mimetype": "text/x-python", 357 | "name": "python", 358 | "nbconvert_exporter": "python", 359 | "pygments_lexer": "ipython2", 360 | "version": "2.7.9" 361 | } 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 0 365 | } 366 | -------------------------------------------------------------------------------- /04 - API Summary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A recap on Scikit-learn's estimator interface\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "``X`` : data, 2d numpy array or scipy sparse matrix of shape (n_samples, n_features)\n", 15 | "\n", 16 | "``y`` : targets, 1d numpy array of shape (n_samples,)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | 
"metadata": {}, 22 | "source": [ 23 | "## Methods" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "\n", 31 | "\n", 32 | "\n", 33 | "\n", 34 | "\n", 35 | "\n", 36 | "\n", 37 | "
``model.fit(X_train, [y_train])``

| ``model.predict(X_test)`` | ``model.transform(X_test)`` |
| --- | --- |
| Classification | Preprocessing |
| Regression | Dimensionality Reduction |
| Clustering | Feature Extraction |
|  | Feature selection |
" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Efficient alternatives, methods for models that don't generalize\n", 45 | "``model.fit_predict(X)`` (clustering)\n", 46 | "\n", 47 | "``model.fit_transform(X)`` (manifold learning)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Additional methods\n", 55 | "__Model evaluation__ : ``score(X, [y])``\n", 56 | "\n", 57 | "__Uncertainties from Classifiers__: ``decision_function(X)`` and ``predict_proba(X)``." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Attributes\n", 65 | "__Classifiers__: ``classes_``\n", 66 | "\n", 67 | "__Clustering__: ``labels_``\n", 68 | "\n", 69 | "__Manifold Learning__: ``embedding_``\n", 70 | "\n", 71 | "__Linear models__: ``coef_``\n", 72 | "\n", 73 | "__Linear Decompositions__: ``components_``" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 2", 80 | "language": "python", 81 | "name": "python2" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 2 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython2", 93 | "version": "2.7.9" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 0 98 | } 99 | -------------------------------------------------------------------------------- /05 - Cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | 
"Cross-Validation\n", 21 | "----------------------------------------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.datasets import load_iris" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "iris = load_iris()\n", 51 | "X = iris.data\n", 52 | "y = iris.target" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "from sklearn.cross_validation import cross_val_score\n", 64 | "from sklearn.svm import LinearSVC" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "cross_val_score(LinearSVC(), X, y, cv=5)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Let's go to a binary task for a moment" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "y % 2" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "cross_val_score(LinearSVC(), X, y % 2)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | 
"collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.metrics.scorer import SCORERS\n", 149 | "print(SCORERS.keys())" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Implementing your own scoring metric:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def my_accuracy_scoring(est, X, y):\n", 168 | " return np.mean(est.predict(X) == y)\n", 169 | "\n", 170 | "cross_val_score(LinearSVC(), X, y, scoring=my_accuracy_scoring)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "def my_super_scoring(est, X, y):\n", 182 | " return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.grid_search import GridSearchCV\n", 194 | "\n", 195 | "y = iris.target\n", 196 | "grid = GridSearchCV(LinearSVC(C=.01, dual=False),\n", 197 | " param_grid={'penalty' : ['l1', 'l2']},\n", 198 | " scoring=my_super_scoring)\n", 199 | "grid.fit(X, y)\n", 200 | "print(grid.best_params_)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | 
"source": [ 207 | "There are other ways to do cross-validation" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "from sklearn.cross_validation import ShuffleSplit\n", 219 | "\n", 220 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n", 221 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit\n", 233 | "\n", 234 | "def plot_cv(cv, n_samples):\n", 235 | " masks = []\n", 236 | " for train, test in cv:\n", 237 | " mask = np.zeros(n_samples, dtype=bool)\n", 238 | " mask[test] = 1\n", 239 | " masks.append(mask)\n", 240 | " plt.matshow(masks)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "plot_cv(StratifiedKFold(y, n_folds=5), len(y))" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), \n", 274 | " len(iris.target))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "source": [ 283 | "# Exercises\n", 284 | "Use KFold cross validation and StratifiedKFold cross validation (3 or 5 folds) for LinearSVC on the iris dataset.\n", 285 | "Why are the 
results so different? How could you get more similar results?" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "# %load solutions/cross_validation_iris.py" 297 | ] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 2", 303 | "language": "python", 304 | "name": "python2" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 2 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython2", 316 | "version": "2.7.9" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 0 321 | } 322 | -------------------------------------------------------------------------------- /06 - Model Complexity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import numpy as np\n", 13 | "%matplotlib nbagg" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Model Complexity, Overfitting and Underfitting\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from plots import plot_kneighbors_regularization\n", 32 | "plot_kneighbors_regularization()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "![underfitting and overfitting](figures/overfitting_underfitting_cartoon.svg)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "source": [ 48 | "# 
Validation Curves" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from sklearn.datasets import load_digits\n", 60 | "from sklearn.ensemble import RandomForestClassifier\n", 61 | "from sklearn.learning_curve import validation_curve" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "digits = load_digits()\n", 73 | "X, y = digits.data, digits.target" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "model = RandomForestClassifier(n_estimators=20)\n", 85 | "param_range = range(1, 13)\n", 86 | "training_scores, validation_scores = validation_curve(model, X, y,\n", 87 | " param_name=\"max_depth\",\n", 88 | " param_range=param_range, cv=5)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "training_scores.shape" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "def plot_validation_curve(parameter_values, train_scores, validation_scores):\n", 111 | " train_scores_mean = np.mean(train_scores, axis=1)\n", 112 | " train_scores_std = np.std(train_scores, axis=1)\n", 113 | " validation_scores_mean = np.mean(validation_scores, axis=1)\n", 114 | " validation_scores_std = np.std(validation_scores, axis=1)\n", 115 | "\n", 116 | " plt.fill_between(parameter_values, train_scores_mean - train_scores_std,\n", 117 | " train_scores_mean + train_scores_std, alpha=0.1,\n", 118 | " color=\"r\")\n", 119 | " plt.fill_between(parameter_values, validation_scores_mean - 
validation_scores_std,\n", 120 | " validation_scores_mean + validation_scores_std, alpha=0.1, color=\"g\")\n", 121 | " plt.plot(parameter_values, train_scores_mean, 'o-', color=\"r\",\n", 122 | " label=\"Training score\")\n", 123 | " plt.plot(parameter_values, validation_scores_mean, 'o-', color=\"g\",\n", 124 | " label=\"Cross-validation score\")\n", 125 | " plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)\n", 126 | " plt.legend(loc=\"best\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "plt.figure()\n", 138 | "plot_validation_curve(param_range, training_scores, validation_scores)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Exercise\n", 146 | "\n", 147 | "Plot the validation curve on the digit dataset for:\n", 148 | "* a LinearSVC with a logarithmic range of regularization parameters ``C``.\n", 149 | "* KNeighborsClassifier with a linear range of neighbors ``k``.\n", 150 | "\n", 151 | "What do you expect them to look like? How do they actually look like?" 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "# %load solutions/validation_curve.py" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 2", 169 | "language": "python", 170 | "name": "python2" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 2 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython2", 182 | "version": "2.7.9" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 0 187 | } 188 | -------------------------------------------------------------------------------- /07 - Grid Searches for Hyper Parameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Grid Searches\n", 21 | "=================" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Grid-Search with built-in cross-validation" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.grid_search import GridSearchCV\n", 47 | "from sklearn.svm import SVC" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 |
"outputs": [], 57 | "source": [ 58 | "from sklearn.datasets import load_digits\n", 59 | "from sklearn.cross_validation import train_test_split\n", 60 | "digits = load_digits()\n", 61 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 62 | " digits.target)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Define parameter grid:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "import numpy as np\n", 81 | "\n", 82 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n", 83 | " 'gamma' : 10. ** np.arange(-5, 0)}\n", 84 | "\n", 85 | "np.set_printoptions(suppress=True)\n", 86 | "print(param_grid)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "A GridSearchCV object behaves just like a normal classifier." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false, 112 | "scrolled": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "grid_search.fit(X_train, y_train)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "grid_search.predict(X_test)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "grid_search.score(X_test, y_test)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "grid_search.best_params_" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# We extract just the scores\n", 161 | "\n", 162 | "scores = [x.mean_validation_score for x in grid_search.grid_scores_]\n", 163 | "scores = np.array(scores).reshape(6, 5)\n", 164 | "\n", 165 | "plt.matshow(scores)\n", 166 | "plt.xlabel('gamma')\n", 167 | "plt.ylabel('C')\n", 168 | "plt.colorbar()\n", 169 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 170 | "plt.yticks(np.arange(6), param_grid['C']);" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Nested Cross-validation in scikit-learn:" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "from sklearn.cross_validation import cross_val_score\n", 189 | "cross_val_score(GridSearchCV(SVC(), param_grid),\n", 190 | " digits.data, digits.target)" 191 | ] 192 | }, 193 | { 194 | 
"cell_type": "markdown", 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "source": [ 199 | "# Exercises\n", 200 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier.\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "# %load solutions/grid_search_k_neighbors.py" 212 | ] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 2", 218 | "language": "python", 219 | "name": "python2" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 2 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython2", 231 | "version": "2.7.10" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 0 236 | } 237 | -------------------------------------------------------------------------------- /08 - Preprocessing and Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Preprocessing and Pipelines\n", 21 | "=============================" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.datasets import load_digits\n", 40 | "from sklearn.cross_validation import train_test_split\n", 41 | "digits = load_digits()\n", 42 
| "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "When cross-validating a model that includes scaling, we need to estimate the mean and standard deviation separately for each fold.\n", 50 | "To do that, we build a pipeline." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from sklearn.pipeline import Pipeline, make_pipeline\n", 62 | "from sklearn.svm import SVC\n", 63 | "from sklearn.preprocessing import StandardScaler" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"svm\", SVC())])\n", 75 | "# or for short:\n", 76 | "make_pipeline(StandardScaler(), SVC())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "pipeline.fit(X_train, y_train)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "pipeline.predict(X_test)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Cross-validation with a pipeline\n", 113 | "---------------------------------" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "from sklearn.cross_validation import cross_val_score\n", 125 | "cross_val_score(pipeline, X_train, y_train)" 126 | ] 127 | }, 128 | { 129 |
"cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Grid Search with a pipeline\n", 133 | "===========================" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.grid_search import GridSearchCV\n", 145 | "\n", 146 | "param_grid = {'svm__C': 10. ** np.arange(-3, 3),\n", 147 | " 'svm__gamma' : 10. ** np.arange(-3, 3)}\n", 148 | "\n", 149 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "grid_pipeline.fit(X_train, y_train)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "grid_pipeline.score(X_test, y_test)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "source": [ 180 | "# Exercises\n", 181 | "Add random features to the iris dataset using ``np.random.uniform`` and ``np.hstack``.\n", 182 | "\n", 183 | "Build a pipeline using the SelectKBest univariate feature selection from the sklearn.feature_selection module and the LinearSVC on the iris dataset.\n", 184 | "\n", 185 | "Use GridSearchCV to adjust C and the number of features selected in SelectKBest." 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": false, 193 | "scrolled": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "# %load solutions/pipeline_iris.py" 198 | ] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 2", 204 | "language": "python", 205 | "name": "python2" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 2 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython2", 217 | "version": "2.7.9" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 0 222 | } 223 | -------------------------------------------------------------------------------- /09.1 - Linear models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Linear models for regression" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "\n", 28 | "```\n", 29 | "y_pred = x_test[0] * coef_[0] + ... 
+ x_test[n_features-1] * coef_[n_features-1] + intercept_\n", 30 | "```" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from sklearn.datasets import make_regression\n", 42 | "from sklearn.cross_validation import train_test_split\n", 43 | "\n", 44 | "X, y, true_coefficient = make_regression(n_samples=80, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)\n", 45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)\n", 46 | "print(X_train.shape)\n", 47 | "print(y_train.shape)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Linear Regression\n", 55 | "\n", 56 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 $$" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false, 64 | "scrolled": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.linear_model import LinearRegression\n", 69 | "linear_regression = LinearRegression().fit(X_train, y_train)\n", 70 | "print(\"R^2 on training set: %f\" % linear_regression.score(X_train, y_train))\n", 71 | "print(\"R^2 on test set: %f\" % linear_regression.score(X_test, y_test))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from sklearn.metrics import r2_score\n", 83 | "print(r2_score(np.dot(X, true_coefficient), y))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "plt.figure(figsize=(10, 5))\n", 95 | "coefficient_sorting = np.argsort(true_coefficient)[::-1]\n", 96 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\")\n", 97 | 
"plt.plot(linear_regression.coef_[coefficient_sorting], \"o\", label=\"linear regression\")\n", 98 | "\n", 99 | "plt.legend()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Ridge Regression (L2 penalty)\n", 107 | "\n", 108 | "$$ \\text{min}_{w,b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_2^2$$ " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "from sklearn.linear_model import Ridge\n", 120 | "ridge_models = {}\n", 121 | "training_scores = []\n", 122 | "test_scores = []\n", 123 | "\n", 124 | "for alpha in [100, 10, 1, .01]:\n", 125 | " ridge = Ridge(alpha=alpha).fit(X_train, y_train)\n", 126 | " training_scores.append(ridge.score(X_train, y_train))\n", 127 | " test_scores.append(ridge.score(X_test, y_test))\n", 128 | " ridge_models[alpha] = ridge\n", 129 | "\n", 130 | "plt.figure()\n", 131 | "plt.plot(training_scores, label=\"training scores\")\n", 132 | "plt.plot(test_scores, label=\"test scores\")\n", 133 | "plt.xticks(range(4), [100, 10, 1, .01])\n", 134 | "plt.legend(loc=\"best\")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "plt.figure(figsize=(10, 5))\n", 146 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n", 147 | "\n", 148 | "for i, alpha in enumerate([100, 10, 1, .01]):\n", 149 | " plt.plot(ridge_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n", 150 | " \n", 151 | "plt.legend(loc=\"best\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Lasso (L1 penalty)\n", 159 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_1$$ " 160 | ] 161 | 
}, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "from sklearn.linear_model import Lasso\n", 171 | "\n", 172 | "lasso_models = {}\n", 173 | "training_scores = []\n", 174 | "test_scores = []\n", 175 | "\n", 176 | "for alpha in [30, 10, 1, .01]:\n", 177 | " lasso = Lasso(alpha=alpha).fit(X_train, y_train)\n", 178 | " training_scores.append(lasso.score(X_train, y_train))\n", 179 | " test_scores.append(lasso.score(X_test, y_test))\n", 180 | " lasso_models[alpha] = lasso\n", 181 | "plt.figure()\n", 182 | "plt.plot(training_scores, label=\"training scores\")\n", 183 | "plt.plot(test_scores, label=\"test scores\")\n", 184 | "plt.xticks(range(4), [30, 10, 1, .01])\n", 185 | "plt.legend(loc=\"best\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "plt.figure(figsize=(10, 5))\n", 197 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n", 198 | "\n", 199 | "for i, alpha in enumerate([30, 10, 1, .01]):\n", 200 | " plt.plot(lasso_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n", 201 | " \n", 202 | "plt.legend(loc=\"best\")" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Linear models for classification" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "\n", 217 | "```\n", 218 | "y_pred = x_test[0] * coef_[0] + ... 
+ x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0\n", 219 | "```" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "The influence of C in LinearSVC" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from plots import plot_linear_svc_regularization\n", 238 | "plot_linear_svc_regularization()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "## Multi-Class linear classification" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "from sklearn.datasets import make_blobs\n", 257 | "plt.figure()\n", 258 | "X, y = make_blobs(random_state=42)\n", 259 | "plt.scatter(X[:, 0], X[:, 1], c=y)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "from sklearn.svm import LinearSVC\n", 271 | "linear_svm = LinearSVC().fit(X, y)\n", 272 | "print(linear_svm.coef_.shape)\n", 273 | "print(linear_svm.intercept_.shape)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "plt.scatter(X[:, 0], X[:, 1], c=y)\n", 285 | "line = np.linspace(-15, 15)\n", 286 | "for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):\n", 287 | " plt.plot(line, -(line * coef[0] + intercept) / coef[1])\n", 288 | "plt.ylim(-10, 15)\n", 289 | "plt.xlim(-10, 8)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "# Exercises" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 
303 | "* Use GridSearchCV to tune the parameter C of LinearSVC on the digits dataset.\n", 304 | "* Compare l1 penalty and l2 penalty by plotting the coefficients as above for the digits dataset. Classify odd vs even digits to make it a binary task." 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "# %load solutions/linear_models.py" 316 | ] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 2", 322 | "language": "python", 323 | "name": "python2" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 2 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython2", 335 | "version": "2.7.10" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 0 340 | } 341 | -------------------------------------------------------------------------------- /09.2 - Support Vector Machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Support Vector Machines" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "from sklearn.cross_validation import train_test_split\n", 33 | "\n", 34 | "digits = load_digits()\n", 35 | "X_train, X_test, y_train, y_test = train_test_split(digits.data / 16., 
digits.target % 2, random_state=2)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.svm import LinearSVC, SVC\n", 47 | "linear_svc = LinearSVC(loss=\"hinge\").fit(X_train, y_train)\n", 48 | "svc = SVC(kernel=\"linear\").fit(X_train, y_train)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "np.mean(linear_svc.predict(X_test) == svc.predict(X_test))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Kernel SVMs\n", 67 | "\n", 68 | "\n", 69 | "Predictions in a kernel-SVM are made using the formula\n", 70 | "\n", 71 | "$$\n", 72 | "\\hat{y} = \\alpha_0 + \\alpha_1 y_1 k(\\mathbf{x^{(1)}}, \\mathbf{x}) + ... + \\alpha_n y_n k(\\mathbf{x^{(n)}}, \\mathbf{x})> 0\n", 73 | "$$\n", 74 | "\n", 75 | "$$\n", 76 | "0 \\leq \\alpha_i \\leq C\n", 77 | "$$\n", 78 | "\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Radial basis function (Gaussian) kernel:\n", 86 | "$$k(\\mathbf{x}, \\mathbf{x'}) = \\exp(-\\gamma ||\\mathbf{x} - \\mathbf{x'}||^2)$$" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "from sklearn.metrics.pairwise import rbf_kernel\n", 98 | "line = np.linspace(-3, 3, 100)[:, np.newaxis]\n", 99 | "kernel_value = rbf_kernel([[0]], line, gamma=1)\n", 100 | "plt.plot(line, kernel_value.T)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "from plots import plot_svm_interactive\n", 112 | "plot_svm_interactive()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 |
"execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "svc = SVC().fit(X_train, y_train)\n", 124 | "svc.score(X_test, y_test)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "Cs = [0.001, 0.01, 0.1, 1, 10, 100]\n", 136 | "gammas = [0.001, 0.01, 0.1, 1, 10, 100]\n", 137 | "\n", 138 | "from sklearn.grid_search import GridSearchCV\n", 139 | "\n", 140 | "param_grid = {'C': Cs, 'gamma' : gammas}\n", 141 | "grid_search = GridSearchCV(SVC(), param_grid, cv=5)\n", 142 | "grid_search.fit(X_train, y_train)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "grid_search.score(X_test, y_test)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# We extract just the scores\n", 165 | "scores = [x[1] for x in grid_search.grid_scores_]\n", 166 | "scores = np.array(scores).reshape(6, 6)\n", 167 | "\n", 168 | "plt.matshow(scores)\n", 169 | "plt.xlabel('gamma')\n", 170 | "plt.ylabel('C')\n", 171 | "plt.colorbar()\n", 172 | "plt.xticks(np.arange(6), param_grid['gamma'])\n", 173 | "plt.yticks(np.arange(6), param_grid['C']);" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "source": [ 182 | "# Exercise\n", 183 | "* Scale the data using StandardScaler before applying the SVC. How does the performance of the default parameters change?\n", 184 | "* Grid-Search the parameters for the scaled data. How do they differ from the previous ones?"
185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 2", 191 | "language": "python", 192 | "name": "python2" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 2 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython2", 204 | "version": "2.7.10" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 0 209 | } 210 | -------------------------------------------------------------------------------- /09.3 - Trees and Forests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Trees and Forests" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib nbagg\n", 19 | "import numpy as np\n", 20 | "import matplotlib.pyplot as plt" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Decision Tree Classification\n", 28 | "==================\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from plots import plot_tree_interactive\n", 40 | "plot_tree_interactive()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Random Forests" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "from plots import plot_forest_interactive\n", 59 | "plot_forest_interactive()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Selecting the Optimal 
Estimator via Cross-Validation" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "from sklearn import grid_search\n", 78 | "from sklearn.datasets import load_digits\n", 79 | "from sklearn.cross_validation import train_test_split\n", 80 | "from sklearn.ensemble import RandomForestClassifier\n", 81 | "\n", 82 | "digits = load_digits()\n", 83 | "X, y = digits.data, digits.target\n", 84 | "\n", 85 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", 86 | "\n", 87 | "rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)\n", 88 | "parameters = {'max_features':['sqrt', 'log2'],\n", 89 | " 'max_depth':[5, 7, 9]}\n", 90 | "\n", 91 | "clf_grid = grid_search.GridSearchCV(rf, parameters)\n", 92 | "clf_grid.fit(X_train, y_train)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": false 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "clf_grid.score(X_train, y_train)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "clf_grid.score(X_test, y_test)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "source": [ 123 | "# Exercises\n", 124 | "* Plot the validation curve for the maximum depth of a decision tree on the digits dataset.\n", 125 | "* Plot the validation curve for max_features of a random forest on the digits dataset." 
126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "# %load solutions/forests.py" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 2", 143 | "language": "python", 144 | "name": "python2" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 2 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython2", 156 | "version": "2.7.10" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 0 161 | } 162 | -------------------------------------------------------------------------------- /10 - Working With Text Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Working with Text Data" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "import pandas as pd\n", 39 | "import os\n", 40 | "\n", 41 | "data = pd.read_csv(os.path.join(\"data\", \"train.csv\"))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "len(data)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 
59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "data" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "y_train = np.array(data.Insult)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "y_train" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "text_train = data.Comment.tolist()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "text_train[6]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "data_test = pd.read_csv(os.path.join(\"data\", \"test_with_solutions.csv\"))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "from sklearn.feature_extraction.text import CountVectorizer" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "cv = CountVectorizer()\n", 152 | "cv.fit(text_train)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 
160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "len(cv.vocabulary_)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false, 171 | "scrolled": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "print(cv.get_feature_names()[:50])\n", 176 | "print(cv.get_feature_names()[-50:])" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "X_train = cv.transform(text_train)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "X_train" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "text_train[6]" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "X_train[6, :].nonzero()[1]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "X_test = cv.transform(text_test)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "from sklearn.svm import LinearSVC\n", 243 | "svm = LinearSVC()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "svm.fit(X_train, y_train)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": false 262 | }, 
263 | "outputs": [], 264 | "source": [ 265 | "svm.score(X_train, y_train)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "svm.score(X_test, y_test)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n", 288 | " # get coefficients with large absolute values \n", 289 | " coef = classifier.coef_.ravel()\n", 290 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n", 291 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n", 292 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n", 293 | " # plot them\n", 294 | " plt.figure(figsize=(15, 5))\n", 295 | " colors = [\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]]\n", 296 | " plt.bar(np.arange(50), coef[interesting_coefficients], color=colors)\n", 297 | " feature_names = np.array(feature_names)\n", 298 | " plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");\n" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "visualize_coefficients(svm, cv.get_feature_names())" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "# Exercises\n", 317 | "* Create a pipeline using the count vectorizer and SVM (see 08). 
Train and score using the pipeline.\n", 318 | "* Vary the n_gram_range in the count vectorizer, visualize the changed coefficients.\n", 319 | "* Grid search the C in the LinearSVC using the pipeline.\n", 320 | "* Grid search the C in the LinearSVC together with the n_gram_range (try (1,1), (1, 2), (2, 2))" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [], 330 | "source": [ 331 | "# %load solutions/text_pipeline.py\n" 332 | ] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python 2", 338 | "language": "python", 339 | "name": "python2" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 2 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": "python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython2", 351 | "version": "2.7.6" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 0 356 | } 357 | -------------------------------------------------------------------------------- /11 - Out Of Core Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# write out some toy data\n", 12 | "from sklearn.datasets import load_digits\n", 13 | "import cPickle\n", 14 | "\n", 15 | "digits = load_digits()\n", 16 | "\n", 17 | "X, y = digits.data, digits.target\n", 18 | "\n", 19 | "for i in range(10):\n", 20 | " cPickle.dump((X[i::10], y[i::10]), open(\"data/batch_%02d.pickle\" % i, \"w\"), -1)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.linear_model import SGDClassifier\n" 32 | ] 33 
| }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "sgd = SGDClassifier()\n", 43 | "\n", 44 | "for i in range(9):\n", 45 | " X_batch, y_batch = cPickle.load(open(\"data/batch_%02d.pickle\" % i))\n", 46 | " sgd.partial_fit(X_batch, y_batch, classes=range(10))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "X_test, y_test = cPickle.load(open(\"data/batch_09.pickle\"))\n", 58 | "\n", 59 | "sgd.score(X_test, y_test)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Text\n", 67 | "=====" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "import pandas as pd\n", 79 | "from sklearn.feature_extraction.text import HashingVectorizer\n", 80 | "\n", 81 | "sgd = SGDClassifier()\n", 82 | "hashing_vectorizer = HashingVectorizer()\n", 83 | "\n", 84 | "for i in range(10):\n", 85 | " data_batch = pd.read_csv(\"data/train_%d.csv\" % i)\n", 86 | " text_batch = data_batch.Comment.tolist()\n", 87 | " y_batch = data_batch.Insult.values\n", 88 | " X_batch = hashing_vectorizer.transform(text_batch)\n", 89 | " sgd.partial_fit(X_batch, y_batch, classes=range(10))" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "data_test = pd.read_csv(\"data/test_with_solutions.csv\")\n", 101 | "X_test = hashing_vectorizer.transform(data_test.Comment.tolist())\n", 102 | "y_test = data_test.Insult.values\n", 103 | "sgd.score(X_test, y_test)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Kernel Approximations\n", 111 | 
"=======================" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "from sklearn.kernel_approximation import RBFSampler\n", 123 | "\n", 124 | "sgd = SGDClassifier()\n", 125 | "kernel_approximation = RBFSampler(gamma=.001, n_components=400)\n", 126 | "\n", 127 | "for i in range(9):\n", 128 | " X_batch, y_batch = cPickle.load(open(\"data/batch_%02d.pickle\" % i))\n", 129 | " if i == 0:\n", 130 | " kernel_approximation.fit(X_batch)\n", 131 | " X_transformed = kernel_approximation.transform(X_batch)\n", 132 | " sgd.partial_fit(X_transformed, y_batch, classes=range(10))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "X_test, y_test = cPickle.load(open(\"data/batch_09.pickle\"))\n", 144 | "\n", 145 | "sgd.score(kernel_approximation.transform(X_test), y_test)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 2", 161 | "language": "python", 162 | "name": "python2" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 2 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython2", 174 | "version": "2.7.6" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 0 179 | } 180 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Andreas Mueller 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Material for Scikit-learn workshop 2 | Jupyter notebooks for an interactive scikit-learn workshop with exercises and solutions. 
3 | -------------------------------------------------------------------------------- /figures/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/sklearn_workshop/3bb698e874ea2f816855ec2bc1ae406e555bd5d0/figures/cluster_comparison.png -------------------------------------------------------------------------------- /figures/pipeline.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 20 | 28 | 34 | 35 | 43 | 49 | 50 | 58 | 64 | 65 | 73 | 79 | 80 | 88 | 94 | 95 | 103 | 109 | 110 | 118 | 124 | 125 | 133 | 139 | 140 | 148 | 154 | 155 | 163 | 169 | 170 | 178 | 184 | 185 | 193 | 199 | 200 | 208 | 214 | 215 | 223 | 229 | 230 | 238 | 244 | 245 | 253 | 259 | 260 | 268 | 274 | 275 | 283 | 289 | 290 | 298 | 304 | 305 | 313 | 319 | 320 | 328 | 334 | 335 | 343 | 349 | 350 | 351 | 373 | 375 | 376 | 378 | image/svg+xml 379 | 381 | 382 | 383 | 384 | 385 | 390 | pipe.fit(X, y) 402 | 406 | 413 | T1 424 | 425 | X 436 | y 447 | 454 | 461 | T1.fit(X, y) 472 | T2.fit(X1, y) 483 | Classifier.fit(X2, y) 494 | T1.transform(X) 505 | pipe.predict(X) 517 | X' 528 | y' 539 | Classifier.predict(X'2) 550 | 556 | 560 | 567 | T2 578 | 579 | 583 | 590 | Classifier 601 | 602 | 606 | 613 | T2 624 | 625 | 629 | 636 | T1 647 | 648 | X1 659 | 665 | y 676 | T2.transform(X1) 687 | X2 698 | y 709 | 713 | 720 | Classifier 731 | 732 | 739 | T1.transform(X) 750 | X'1 761 | 767 | T2.transform(X1) 778 | X'2 789 | 794 | 799 | 805 | 811 | 817 | pipe = make_pipeline(T1(), T2(), Classifier()) pipe = make_pipeline(T1(), T2(), Classifier()) 842 | 843 | 844 | -------------------------------------------------------------------------------- /figures/randomized_search.png: -------------------------------------------------------------------------------- 
def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None):
    """Draw the decision boundary of a fitted classifier over two features.

    The classifier is evaluated on a 100 x 100 grid spanning the range of
    the first two columns of ``X``, padded by ``eps`` on every side.

    Parameters
    ----------
    classifier : object
        Must provide ``decision_function`` or, failing that,
        ``predict_proba`` (column 1 is used).
    X : ndarray
        Data whose first two columns define the plotted region.
    fill : bool
        If true, fill the two sides of the boundary in blue / red with
        ``contourf``; otherwise draw only the boundary line in black.
    ax : axes object or None
        Target axes; ``plt.gca()`` is used when omitted.
    eps : float or None
        Padding around the data; defaults to half the standard deviation
        of ``X``.
    """
    margin = X.std() / 2. if eps is None else eps

    first, second = X[:, 0], X[:, 1]
    x_min, x_max = first.min() - margin, first.max() + margin
    y_min, y_max = second.min() - margin, second.max() + margin

    grid_x, grid_y = np.meshgrid(np.linspace(x_min, x_max, 100),
                                 np.linspace(y_min, y_max, 100))
    grid_points = np.column_stack([grid_x.ravel(), grid_y.ravel()])

    try:
        # signed scores: the boundary is the zero level set
        scores = classifier.decision_function(grid_points)
        levels = [0]
        fill_levels = [scores.min(), 0, scores.max()]
    except AttributeError:
        # no decision_function: fall back to the class-1 probability,
        # whose boundary sits at 0.5
        scores = classifier.predict_proba(grid_points)[:, 1]
        levels = [.5]
        fill_levels = [0, .5, 1]

    target = plt.gca() if ax is None else ax
    surface = scores.reshape(grid_x.shape)
    if fill:
        target.contourf(grid_x, grid_y, surface,
                        levels=fill_levels, colors=['blue', 'red'])
    else:
        target.contour(grid_x, grid_y, surface, levels=levels,
                       colors="black")

    target.set_xlim(x_min, x_max)
    target.set_ylim(y_min, y_max)
    target.set_xticks(())
    target.set_yticks(())
.5, X[:, 1].max() + .5 18 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 19 | 20 | if max_depth != 0: 21 | forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth, 22 | random_state=1).fit(X, y) 23 | Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 24 | Z = Z.reshape(xx.shape) 25 | ax.contourf(xx, yy, Z, alpha=.4) 26 | ax.set_title("max_depth = %d" % max_depth) 27 | else: 28 | ax.set_title("data set") 29 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) 30 | ax.set_xlim(x_min, x_max) 31 | ax.set_ylim(y_min, y_max) 32 | ax.set_xticks(()) 33 | ax.set_yticks(()) 34 | 35 | 36 | def plot_forest_interactive(): 37 | from IPython.html.widgets import interactive, IntSlider 38 | slider = IntSlider(min=0, max=8, step=1, value=0) 39 | return interactive(plot_forest, max_depth=slider) 40 | -------------------------------------------------------------------------------- /plots/plot_interactive_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from sklearn.externals.six import StringIO # doctest: +SKIP 8 | from sklearn.tree import export_graphviz 9 | from scipy.misc import imread 10 | from scipy import ndimage 11 | import os 12 | 13 | GRAPHVIS_PATH = r"C:\Program Files (x86)\Graphviz2.38\bin" 14 | if GRAPHVIS_PATH not in os.environ['PATH']: 15 | os.environ['PATH'] += ";" + GRAPHVIS_PATH 16 | 17 | import re 18 | 19 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 20 | 21 | 22 | def tree_image(tree, fout=None): 23 | try: 24 | import pydot 25 | import a_reliable_dot_rendering 26 | except ImportError: 27 | return None 28 | dot_data = StringIO() 29 | export_graphviz(tree, out_file=dot_data) 30 | data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue()) 31 | data = 
def plot_tree(max_depth=1):
    """Plot a decision tree's partition of the toy data next to the tree.

    Works on the module-level blobs ``X`` / ``y``.  The left axes show the
    predicted class-1 probability surface, the leaf boundaries and the
    training points.  The right axes show the rendered tree when
    ``tree_image`` returns an image and are hidden otherwise.

    Parameters
    ----------
    max_depth : int
        Maximum depth of the fitted tree.  ``0`` skips fitting and shows
        only the raw data set.
    """
    fig, ax = plt.subplots(1, 2, figsize=(15, 7))
    h = 0.02  # step of the evaluation grid

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    if max_depth != 0:
        # build the grid points once; both the probabilities and the leaf
        # assignment are evaluated on them
        grid_points = np.c_[xx.ravel(), yy.ravel()]
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
        Z = tree.predict_proba(grid_points)[:, 1]
        Z = Z.reshape(xx.shape)
        # leaf index of every grid point; a non-zero Laplacian marks places
        # where neighbouring grid points fall into different leaves
        faces = tree.tree_.apply(grid_points.astype(np.float32))
        faces = faces.reshape(xx.shape)
        border = ndimage.laplace(faces) != 0
        ax[0].contourf(xx, yy, Z, alpha=.4)
        ax[0].scatter(xx[border], yy[border], marker='.', s=1)
        ax[0].set_title("max_depth = %d" % max_depth)
        img = tree_image(tree)
        if img is not None:
            # show the rendered tree (previously referenced the undefined
            # name ``i``, which raised NameError on this path)
            ax[1].imshow(img)
            ax[1].axis("off")
        else:
            ax[1].set_visible(False)
    else:
        ax[0].set_title("data set")
        ax[1].set_visible(False)
    ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax[0].set_xlim(x_min, x_max)
    ax[0].set_ylim(y_min, y_max)
    ax[0].set_xticks(())
    ax[0].set_yticks(())
def make_dataset(n_samples=100):
    """Generate a noisy 1-D regression toy problem.

    Inputs are ``n_samples`` evenly spaced points on [-3, 3]; targets are
    ``sin(4 * x) + x`` plus standard-normal noise drawn from a generator
    seeded with 42, so repeated calls return identical data.

    Returns
    -------
    x, y : ndarray
        Inputs and noisy targets, both of length ``n_samples``.
    """
    inputs = np.linspace(-3, 3, n_samples)
    noise = np.random.RandomState(42).normal(size=len(inputs))
    targets = np.sin(4 * inputs) + inputs + noise
    return inputs, targets
def make_handcrafted_dataset():
    """Return the small two-class toy set used by the RBF-SVM plots.

    Starts from 30 ``make_blobs`` points, relabels samples 7 and 27 as
    class 0 and drops samples 0, 1, 5 and 26.

    Returns
    -------
    X, y : ndarray
        The remaining features and labels.
    """
    # a carefully hand-designed dataset lol
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    y[np.array([7, 27])] = 0
    # builtin ``bool`` instead of the ``np.bool`` alias, which current
    # NumPy releases no longer provide
    mask = np.ones(len(X), dtype=bool)
    mask[np.array([0, 1, 5, 26])] = 0
    X, y = X[mask], y[mask]
    return X, y


def plot_rbf_svm_parameters():
    """Show how ``C`` and ``gamma`` change an RBF-kernel SVM's boundary.

    Creates two figures on the handcrafted dataset: one panel per value of
    ``C`` (default ``gamma``), then one panel per value of ``gamma``
    (``C=1``).  Each panel shows the data and the decision boundary.
    """
    X, y = make_handcrafted_dataset()
    point_colors = np.array(['red', 'blue'])[y]

    # one panel per C value, so every listed value is drawn (a fixed
    # 3-panel figure silently dropped the last entry)
    Cs = [1e0, 5, 10, 100]
    fig, axes = plt.subplots(1, len(Cs), figsize=(15, 3))
    for ax, C in zip(axes, Cs):
        ax.scatter(X[:, 0], X[:, 1], s=150, c=point_colors)

        svm = SVC(kernel='rbf', C=C).fit(X, y)
        plot_2d_separator(svm, X, ax=ax, eps=.5)
        ax.set_title("C = %f" % C)

    gammas = [0.1, .5, 1, 10]
    fig, axes = plt.subplots(1, len(gammas), figsize=(15, 3))
    for ax, gamma in zip(axes, gammas):
        ax.scatter(X[:, 0], X[:, 1], s=150, c=point_colors)
        svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y)
        plot_2d_separator(svm, X, ax=ax, eps=.5)
        ax.set_title("gamma = %f" % gamma)
def plot_validation_curve(parameter_values, train_scores, validation_scores):
    """Plot mean training and cross-validation scores with +/- 1 std bands.

    Parameters
    ----------
    parameter_values : sequence
        Values used for the x axis.
    train_scores, validation_scores : array-like, 2-D
        Score tables; mean and standard deviation are taken along axis 1,
        so each row must correspond to one entry of ``parameter_values``.
    """
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    valid_mean, valid_std = (np.mean(validation_scores, axis=1),
                             np.std(validation_scores, axis=1))

    series = [
        (train_mean, train_std, "r", "Training score"),
        (valid_mean, valid_std, "g", "Cross-validation score"),
    ]

    # shaded one-standard-deviation bands first, then the mean curves
    for mean, std, color, _ in series:
        plt.fill_between(parameter_values, mean - std, mean + std,
                         alpha=0.1, color=color)
    for mean, _, color, label in series:
        plt.plot(parameter_values, mean, 'o-', color=color, label=label)

    # y range spans from the worst validation mean to the best training mean
    plt.ylim(valid_mean.min() - .1, train_mean.max() + .1)
    plt.legend(loc="best")
# ------------------------------------------------------------------------------
# /solutions/grid_search_forest.py:
# ------------------------------------------------------------------------------
# Grid-search max_depth / max_features of a random forest and show the mean
# validation score of every grid point as a matrix plot.
# NOTE(review): GridSearchCV, np, plt, X_train, X_test, y_train and y_test are
# not defined in this script; presumably they are provided by the notebook the
# solution is loaded into -- confirm.
from sklearn.ensemble import RandomForestClassifier

param_grid = {'max_depth': [1, 3, 5, 7, 10], 'max_features': [5, 8, 10, 20]}

grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("best parameters: %s" % grid.best_params_)
print("Training set accuracy: %s" % grid.score(X_train, y_train))
print("Test set accuracy: %s" % grid.score(X_test, y_test))

scores = [x.mean_validation_score for x in grid.grid_scores_]
# Take the matrix shape from the grid itself instead of hard-coding (5, 4), so
# editing param_grid cannot silently break the reshape. Rows follow max_depth
# and columns follow max_features, matching the axis labels below.
scores = np.array(scores).reshape(len(param_grid['max_depth']),
                                  len(param_grid['max_features']))
plt.matshow(scores)
plt.xlabel("max_features")
plt.ylabel("max_depth")

# ------------------------------------------------------------------------------
# /solutions/grid_search_k_neighbors.py:
# ------------------------------------------------------------------------------
# Grid-search the number of neighbors of a k-nearest-neighbors classifier and
# report the best setting together with training and test accuracy.
# NOTE(review): GridSearchCV and the X_train / X_test / y_train / y_test split
# are not defined here; presumably they come from the calling notebook.
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': [1, 3, 5, 7, 10]}

grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("best parameters: %s" % grid.best_params_)
print("Training set accuracy: %s" % grid.score(X_train, y_train))
print("Test set accuracy: %s" % grid.score(X_test, y_test))

# ------------------------------------------------------------------------------
# /solutions/linear_models.py:
# ------------------------------------------------------------------------------
# Tune C of a LinearSVC on a two-class version of the digits data, then
# compare l1 and l2 penalties over a range of C values.
from pprint import pprint

# np and plt are used below; import them here so the script does not depend on
# names that happen to exist in the caller's namespace.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.svm import LinearSVC

digits = load_digits()
# `digits.target % 2` turns the labels into a binary target (0 or 1).
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target % 2)

grid = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-6, 2, 9)}, cv=5)
grid.fit(X_train, y_train)
pprint(grid.grid_scores_)
pprint(grid.score(X_test, y_test))


Cs = [10, 1, .01, 0.001, 0.0001]
for penalty in ['l1', 'l2']:
    # Fitted model and scores per C, reset for every penalty.
    svm_models = {}
    training_scores = []
    test_scores = []
    for C in Cs:
        svm = LinearSVC(C=C, penalty=penalty, dual=False).fit(X_train, y_train)
        training_scores.append(svm.score(X_train, y_train))
        test_scores.append(svm.score(X_test, y_test))
        svm_models[C] = svm

    # Training vs. test accuracy, one x position per entry of Cs.
    plt.figure()
    plt.plot(training_scores, label="training scores")
    plt.plot(test_scores, label="test scores")
    # One tick per C value; a fixed range(4) gave 4 ticks for the 5 labels.
    plt.xticks(range(len(Cs)), Cs)
    plt.legend(loc="best")

    # Coefficients of every fitted model, one marker series per C.
    plt.figure(figsize=(10, 5))
    for C in Cs:
        # %g keeps small values readable; %.2f showed 0.001 and 0.0001 as 0.00.
        plt.plot(svm_models[C].coef_.ravel(), "o", label="C = %g" % C)

    plt.legend(loc="best")

# ------------------------------------------------------------------------------
# /solutions/load_iris.py:
# ------------------------------------------------------------------------------
# Load the iris data, print its dimensions, split it, and scatter-plot the
# training samples for feature columns (0, 1) and (2, 3), colored by class.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

print("Dataset size: %d  number of features: %d  number of classes: %d"
      % (X.shape[0], X.shape[1], len(np.unique(y))))

X_train, X_test, y_train, y_test = train_test_split(X, y)

plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
plt.figure()
plt.scatter(X_train[:, 2], X_train[:, 3], c=y_train)

# ------------------------------------------------------------------------------
# /solutions/pipeline_iris.py:
# ------------------------------------------------------------------------------
# Append random columns to the iris features and grid-search a
# SelectKBest -> LinearSVC pipeline over the number of kept features and C.
# np, train_test_split and GridSearchCV are used below; import them here so
# the script does not depend on the caller's namespace.
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

rng = np.random.RandomState(42)
iris = load_iris()
# Original features followed by 5 columns of uniform random values.
X = np.hstack([iris.data, rng.uniform(size=(len(iris.data), 5))])
X_train, X_test, y_train, y_test = train_test_split(X, iris.target, random_state=2)

selection_pipe = make_pipeline(SelectKBest(), LinearSVC())
# Step names are the lower-cased class names generated by make_pipeline.
param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3),
              'selectkbest__k': [1, 2, 3, 4, 5, 7]}
grid = GridSearchCV(selection_pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best parameters: %s" % grid.best_params_)
print("Test set performance: %s" % grid.score(X_test, y_test))

# ------------------------------------------------------------------------------
# /solutions/svms.py:
# ------------------------------------------------------------------------------
# Compare a default SVC with and without feature scaling, then refit an
# existing grid search on the scaled data and plot its scores over C / gamma.
# NOTE(review): SVC, grid_search, param_grid, np, plt and the train/test split
# are not defined here; presumably they come from the calling notebook.
print("default score without scaling: %f" % SVC().fit(X_train, y_train).score(X_test, y_test))

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training data only and reuse it for the test data.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("default score with scaling: %f" % SVC().fit(X_train_scaled, y_train).score(X_test_scaled, y_test))

grid_search.fit(X_train_scaled, y_train)

# We extract just the scores
scores = [x.mean_validation_score for x in grid_search.grid_scores_]
# Size the matrix and the ticks from param_grid instead of a fixed 6;
# rows follow C and columns follow gamma, matching the axis labels below.
n_C = len(param_grid['C'])
n_gamma = len(param_grid['gamma'])
scores = np.array(scores).reshape(n_C, n_gamma)

plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(n_gamma), param_grid['gamma'])
plt.yticks(np.arange(n_C), param_grid['C'])

# ------------------------------------------------------------------------------
# /solutions/text_pipeline.py:
# ------------------------------------------------------------------------------
# Chain a vectorizer and a classifier into a pipeline, score it, then
# grid-search the classifier's C and finally C together with the n-gram range.
# NOTE(review): cv, svm, np, text_train, text_test, y_train and y_test are not
# defined here; presumably they come from the calling notebook.
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

pipeline = Pipeline([('vectorizer', cv), ('classifier', svm)])
pipeline.fit(text_train, y_train)
print("Pipeline test score: %f" % pipeline.score(text_test, y_test))

# Search over the classifier's C only.
param_grid = {'classifier__C': 10. ** np.arange(-3, 3)}

grid_search = GridSearchCV(pipeline, param_grid=param_grid)
grid_search.fit(text_train, y_train)
print("best parameters : %s" % grid_search.best_params_)
print("Grid-searched test score: %f" % grid_search.score(text_test, y_test))


# Search over C and the vectorizer's n-gram range jointly, using 3 jobs.
param_grid = {'classifier__C': 10. ** np.arange(-3, 3),
              "vectorizer__ngram_range": [(1, 1), (1, 2), (2, 2)]}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)
grid_search.fit(text_train, y_train)

print("best parameters with n-gram search: %s" % grid_search.best_params_)
print("test set score with n-gram search: %s" % grid_search.score(text_test, y_test))

# ------------------------------------------------------------------------------
# /solutions/train_iris.py:
# ------------------------------------------------------------------------------
# Fit a 3-nearest-neighbors classifier on a train split of the iris data and
# print its score on the held-out split.
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split

iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

print("test set score of knn: %f" % knn.score(X_test, y_test))

# ------------------------------------------------------------------------------
# /solutions/validation_curve.py:
# ------------------------------------------------------------------------------
# Compute and plot validation curves for LinearSVC over C and for
# KNeighborsClassifier over n_neighbors.
# NOTE(review): X, y and plot_validation_curve are not defined here;
# presumably they come from the calling notebook.
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.learning_curve import validation_curve


cs = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
training_scores, test_scores = validation_curve(LinearSVC(), X, y,
                                                param_name="C", param_range=cs)
plt.figure()
# Plot against the position of each C value; the range is taken from cs
# rather than a fixed 7 so the two cannot drift apart.
plot_validation_curve(range(len(cs)), training_scores, test_scores)


ks = range(1, 10)
training_scores, test_scores = validation_curve(KNeighborsClassifier(), X, y,
                                                param_name="n_neighbors", param_range=ks)
plt.figure()
plot_validation_curve(ks, training_scores, test_scores)
# ------------------------------------------------------------------------------