├── .gitignore ├── API Summary.ipynb ├── Cross-validation.ipynb ├── First Steps.ipynb ├── Grid Searches for Hyper Parameters.ipynb ├── Intro to Machine Learning and data representations.ipynb ├── LICENSE ├── Linear models.ipynb ├── Model Complexity.ipynb ├── Preprocessing and Pipelines.ipynb ├── Stochastic Gradient Descent.ipynb ├── Support Vector Machines.ipynb ├── Unsupervised Transformers.ipynb ├── Using built-in and custom score functions.ipynb ├── figures ├── bag_of_words.svg ├── cluster_comparison.png ├── cross_validation.svg ├── data_representation.svg ├── feature_union.svg ├── grid_search_cross_validation.svg ├── hashing_vectorizer.svg ├── overfitting_underfitting_cartoon.svg ├── pipeline.svg ├── pipeline_cross_validation.svg ├── randomized_search.png ├── supervised_workflow.svg ├── train_test_split.svg ├── train_test_split_matrix.svg ├── train_validation_test2.svg └── unsupervised_workflow.svg ├── outline.rst ├── plots ├── __init__.py ├── plot_2d_separator.py ├── plot_interactive_forest.py ├── plot_interactive_tree.py ├── plot_kneighbors_regularization.py ├── plot_linear_svc_regularization.py └── plot_rbf_svm_parameters.py └── solutions ├── cross_validation_iris.py ├── digits_unsupervised.py ├── forests.py ├── grid_search_forest.py ├── grid_search_k_neighbors.py ├── linear_models.py ├── load_iris.py ├── pipeline_iris.py ├── svms.py ├── train_iris.py └── validation_curve.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | notebooks/.ipynb_checkpoints/ 3 | notebooks/datasets 4 | notebooks/joblib/ 5 | .ipynb_checkpoints 6 | -------------------------------------------------------------------------------- /API Summary.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A recap on Scikit-learn's estimator interface\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "``X`` : data, 2d numpy array or scipy sparse matrix of shape (n_samples, n_features)\n", 15 | "\n", 16 | "``y`` : targets, 1d numpy array of shape (n_samples,)" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Methods" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "\n", 31 | "\n", 32 | "\n", 33 | "\n", 34 | "\n", 35 | "\n", 36 | "\n", 37 | "
<table>
<tr><th colspan=2>``model.fit(X_train, [y_train])``</th></tr>
<tr><th>``model.predict(X_test)``</th><th>``model.transform(X_test)``</th></tr>
<tr><td>Classification</td><td>Preprocessing</td></tr>
<tr><td>Regression</td><td>Dimensionality Reduction</td></tr>
<tr><td>Clustering</td><td>Feature Extraction</td></tr>
<tr><td></td><td>Feature selection</td></tr>
</table>
" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Efficient alternatives, methods for models that don't generalize\n", 45 | "``model.fit_predict(X)`` (clustering)\n", 46 | "\n", 47 | "``model.fit_transform(X)`` (manifold learning)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Additional methods\n", 55 | "__Model evaluation__ : ``score(X, [y])``\n", 56 | "\n", 57 | "__Uncertainties from Classifiers__: ``decision_function(X)`` and ``predict_proba(X)``." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Attributes\n", 65 | "__Classifiers__: ``classes_``\n", 66 | "\n", 67 | "__Clustering__: ``labels_``\n", 68 | "\n", 69 | "__Manifold Learning__: ``embedding_``\n", 70 | "\n", 71 | "__Linear models__: ``coef_``\n", 72 | "\n", 73 | "__Linear Decompositions__: ``components_``" 74 | ] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 2", 80 | "language": "python", 81 | "name": "python2" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 2 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython2", 93 | "version": "2.7.10" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 0 98 | } 99 | -------------------------------------------------------------------------------- /Cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Cross-Validation\n", 21 | "----------------------------------------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.datasets import load_iris" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "iris = load_iris()\n", 51 | "X = iris.data\n", 52 | "y = iris.target" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "from sklearn.cross_validation import cross_val_score\n", 64 | "from sklearn.svm import LinearSVC" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "cross_val_score(LinearSVC(), X, y, cv=5)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Let's go to a binary task for a moment" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | 
"metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "y % 2" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "cross_val_score(LinearSVC(), X, y % 2)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "from sklearn.metrics.scorer import SCORERS\n", 149 | "print(SCORERS.keys())" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Implementing your own scoring metric:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def my_accuracy_scoring(est, X, y):\n", 168 | " return np.mean(est.predict(X) == y)\n", 169 | "\n", 170 | "cross_val_score(LinearSVC(), X, y, scoring=my_accuracy_scoring)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "def my_super_scoring(est, X, y):\n", 182 | " return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.grid_search import GridSearchCV\n", 194 | "\n", 195 | "y = iris.target\n", 196 | "grid = GridSearchCV(LinearSVC(C=.01, dual=False),\n", 197 | " param_grid={'penalty' : ['l1', 'l2']},\n", 198 | " scoring=my_super_scoring)\n", 199 | "grid.fit(X, y)\n", 200 | "print(grid.best_params_)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "There are other ways to do cross-valiation" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "from sklearn.cross_validation import ShuffleSplit\n", 219 | "\n", 220 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n", 221 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit\n", 233 | "\n", 234 | "def plot_cv(cv, n_samples):\n", 235 | " masks = []\n", 236 | " for train, test in cv:\n", 237 | " mask = np.zeros(n_samples, dtype=bool)\n", 238 | " mask[test] = 1\n", 239 | " masks.append(mask)\n", 240 | " plt.matshow(masks)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 
251 | "plot_cv(StratifiedKFold(y, n_folds=5), len(y))" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), \n", 274 | " len(iris.target))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "source": [ 283 | "# Exercises\n", 284 | "Use KFold cross validation and StratifiedKFold cross validation (3 or 5 folds) for LinearSVC on the iris dataset.\n", 285 | "Why are the results so different? How could you get more similar results?" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "# %load solutions/cross_validation_iris.py" 297 | ] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 2", 303 | "language": "python", 304 | "name": "python2" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 2 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython2", 316 | "version": "2.7.9" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 0 321 | } 322 | -------------------------------------------------------------------------------- /First Steps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Get some data to play with" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "digits = load_digits()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.cross_validation import train_test_split\n", 44 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 45 | " digits.target)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "X_train.shape" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Really Simple API\n", 64 | "-------------------\n", 65 | "0) Import your model class" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.svm import LinearSVC" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "1) Instantiate an object and set the 
parameters" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "svm = LinearSVC(C=0.1)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "2) Fit the model" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "svm.fit(X_train, y_train)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "3) Apply / evaluate" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false, 127 | "scrolled": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "print(svm.predict(X_test))\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "svm.score(X_train, y_train)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "svm.score(X_test, y_test)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "And again\n", 161 | "---------" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "from sklearn.ensemble import RandomForestClassifier" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "rf = RandomForestClassifier(n_estimators=50)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "rf.fit(X_train, y_train)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "rf.score(X_test, y_test)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "%load https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/examples/classification/plot_classifier_comparison.py" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "# Exercises\n", 224 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n", 225 | "\n", 226 | "Split it into training and test set using ``train_test_split``.\n", 227 | "Then train an evaluate a classifier of your choice.\n" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# %load solutions/train_iris.py" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 2", 245 | "language": "python", 246 | "name": "python2" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 2 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": 
"text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython2", 258 | "version": "2.7.10" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 0 263 | } 264 | -------------------------------------------------------------------------------- /Grid Searches for Hyper Parameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Grid Searches\n", 21 | "=================" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Grid-Search with build-in cross validation" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.grid_search import GridSearchCV\n", 47 | "from sklearn.svm import SVC" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "from sklearn.datasets import load_digits\n", 59 | "from sklearn.cross_validation import train_test_split\n", 60 | "digits = load_digits()\n", 61 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 62 | " digits.target, random_state=0)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Define parameter grid:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "import numpy as np\n", 81 | "\n", 82 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n", 83 | " 'gamma' : 10. ** np.arange(-5, 0)}\n", 84 | "\n", 85 | "np.set_printoptions(suppress=True)\n", 86 | "print(param_grid)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "A GridSearchCV object behaves just like a normal classifier." 
105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false, 112 | "scrolled": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "grid_search.fit(X_train, y_train)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false, 124 | "scrolled": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "grid_search.predict(X_test)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "grid_search.score(X_test, y_test)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "grid_search.best_params_" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "# We extract just the scores\n", 162 | "\n", 163 | "scores = [x.mean_validation_score for x in grid_search.grid_scores_]\n", 164 | "scores = np.array(scores).reshape(6, 5)\n", 165 | "\n", 166 | "plt.matshow(scores)\n", 167 | "plt.xlabel('gamma')\n", 168 | "plt.ylabel('C')\n", 169 | "plt.colorbar()\n", 170 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 171 | "plt.yticks(np.arange(6), param_grid['C']);" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Nested Cross-validation in scikit-learn:" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "source": [ 196 | "# Exercises\n", 197 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier.\n", 198 | "Visualize ``grid_search.grid_scores_``." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "from sklearn.neighbors import KNeighborsClassifier" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "# %load solutions/grid_search_k_neighbors.py" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 2", 227 | "language": "python", 228 | "name": "python2" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 2 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython2", 240 | "version": "2.7.10" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 0 245 | } 246 | -------------------------------------------------------------------------------- /Intro to Machine Learning and data representations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# What is machine learning ?" 
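, "\n", "Loosely speaking: programs that learn their behaviour from example data rather than from hand-written rules."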
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Supervised learning\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "source": [ 23 | "" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "# Data Representations" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Dataset Split" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "% matplotlib nbagg\n", 68 | "import matplotlib.pyplot as plt\n", 69 | "import numpy as np" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from sklearn.datasets import load_digits\n", 81 | "digits = load_digits()\n", 82 | "digits.keys()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "digits.images.shape" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "print(digits.images[0])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "plt.matshow(digits.images[0], cmap=plt.cm.Greys)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "digits.data.shape" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "digits.target.shape" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "digits.target" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Splitting the data:" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from sklearn.cross_validation import train_test_split\n", 174 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "# Exercises\n", 182 | "\n", 183 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n", 184 | "The function returns a dictionary-like object that has the same attributes as ``digits``.\n", 185 | "\n", 186 | "What is the number of classes, 
features and data points in this dataset?\n", 187 | "Use a scatterplot to visualize the dataset.\n", 188 | "\n", 189 | "You can look at ``DESCR`` attribute to learn more about the dataset." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "# %load solutions/load_iris.py" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 2", 216 | "language": "python", 217 | "name": "python2" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 2 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython2", 229 | "version": "2.7.9" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 0 234 | } 235 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Andreas Mueller 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /Linear models.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Linear models for regression" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "\n", 28 | "```\n", 29 | "y_pred = x_test[0] * coef_[0] + ... 
+ x_test[n_features-1] * coef_[n_features-1] + intercept_\n", 30 | "```" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from sklearn.datasets import make_regression\n", 42 | "from sklearn.cross_validation import train_test_split\n", 43 | "\n", 44 | "X, y, true_coefficient = make_regression(n_samples=80, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)\n", 45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)\n", 46 | "print(X_train.shape)\n", 47 | "print(y_train.shape)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Linear Regression\n", 55 | "\n", 56 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 $$" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false, 64 | "scrolled": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from sklearn.linear_model import LinearRegression\n", 69 | "linear_regression = LinearRegression().fit(X_train, y_train)\n", 70 | "print(\"R^2 on training set: %f\" % linear_regression.score(X_train, y_train))\n", 71 | "print(\"R^2 on test set: %f\" % linear_regression.score(X_test, y_test))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from sklearn.metrics import r2_score\n", 83 | "print(r2_score(np.dot(X, true_coefficient), y))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "plt.figure(figsize=(10, 5))\n", 95 | "coefficient_sorting = np.argsort(true_coefficient)[::-1]\n", 96 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\")\n", 97 | "plt.plot(linear_regression.coef_[coefficient_sorting], \"o\", label=\"linear regression\")\n", 98 | "\n", 99 | "plt.legend()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Ridge Regression (L2 penalty)\n", 107 | "\n", 108 | "$$ \\text{min}_{w,b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_2^2$$ " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "from sklearn.linear_model import Ridge\n", 120 | "ridge_models = {}\n", 121 | "training_scores = []\n", 122 | "test_scores = []\n", 123 | "\n", 124 | "for alpha in [100, 10, 1, .01]:\n", 125 | " ridge = Ridge(alpha=alpha).fit(X_train, y_train)\n", 126 | " training_scores.append(ridge.score(X_train, y_train))\n", 127 | " test_scores.append(ridge.score(X_test, y_test))\n", 128 | " ridge_models[alpha] = ridge\n", 129 | "\n", 130 | "plt.figure()\n", 131 | "plt.plot(training_scores, label=\"training scores\")\n", 132 | "plt.plot(test_scores, label=\"test scores\")\n", 133 | "plt.xticks(range(4), [100, 10, 1, .01])\n", 134 | "plt.legend(loc=\"best\")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "plt.figure(figsize=(10, 5))\n", 146 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n", 147 | "\n", 148 | "for i, alpha in enumerate([100, 10, 1, .01]):\n", 149 
| " plt.plot(ridge_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n", 150 | " \n", 151 | "plt.legend(loc=\"best\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Lasso (L1 penalty)\n", 159 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_1$$ " 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "from sklearn.linear_model import Lasso\n", 171 | "\n", 172 | "lasso_models = {}\n", 173 | "training_scores = []\n", 174 | "test_scores = []\n", 175 | "\n", 176 | "for alpha in [30, 10, 1, .01]:\n", 177 | " lasso = Lasso(alpha=alpha).fit(X_train, y_train)\n", 178 | " training_scores.append(lasso.score(X_train, y_train))\n", 179 | " test_scores.append(lasso.score(X_test, y_test))\n", 180 | " lasso_models[alpha] = lasso\n", 181 | "plt.figure()\n", 182 | "plt.plot(training_scores, label=\"training scores\")\n", 183 | "plt.plot(test_scores, label=\"test scores\")\n", 184 | "plt.xticks(range(4), [30, 10, 1, .01])\n", 185 | "plt.legend(loc=\"best\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [], 195 | "source": [ 196 | "plt.figure(figsize=(10, 5))\n", 197 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n", 198 | "\n", 199 | "for i, alpha in enumerate([30, 10, 1, .01]):\n", 200 | " plt.plot(lasso_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n", 201 | " \n", 202 | "plt.legend(loc=\"best\")" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Linear models for classification" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "\n", 217 | "```\n", 218 | "y_pred = x_test[0] * coef_[0] + ... 
+ x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0\n", 219 | "```" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "The influence of C in LinearSVC" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from plots import plot_linear_svc_regularization\n", 238 | "plot_linear_svc_regularization()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "## Multi-Class linear classification" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "from sklearn.datasets import make_blobs\n", 257 | "plt.figure()\n", 258 | "X, y = make_blobs(random_state=42)\n", 259 | "plt.scatter(X[:, 0], X[:, 1], c=y)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "from sklearn.svm import LinearSVC\n", 271 | "linear_svm = LinearSVC().fit(X, y)\n", 272 | "print(linear_svm.coef_.shape)\n", 273 | "print(linear_svm.intercept_.shape)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "plt.figure()\n", 285 | "plt.scatter(X[:, 0], X[:, 1], c=y)\n", 286 | "line = np.linspace(-15, 15)\n", 287 | "for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):\n", 288 | " plt.plot(line, -(line * coef[0] + intercept) / coef[1])\n", 289 | "plt.ylim(-10, 15)\n", 290 | "plt.xlim(-10, 8)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Exercises" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "* Compare Logistic regression with l1 penalty and l2 penalty by plotting the coefficients as above for the digits dataset. Classify odd vs even digits to make it a binary task." 
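, "\n", "One way to get started (a sketch only; the reference solution is in ``solutions/linear_models.py``):\n", "\n", "```python\n", "from sklearn.datasets import load_digits\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "digits = load_digits()\n", "X, y = digits.data, digits.target % 2  # odd vs. even digits\n", "lr_l1 = LogisticRegression(penalty='l1', C=0.1).fit(X, y)\n", "lr_l2 = LogisticRegression(penalty='l2', C=0.1).fit(X, y)\n", "# l1 drives many coefficients to exactly zero, l2 only shrinks them\n", "```"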
305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": true 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "y % 2" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": false 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "# %load solutions/linear_models.py" 327 | ] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "Python 2", 333 | "language": "python", 334 | "name": "python2" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 2 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython2", 346 | "version": "2.7.10" 347 | } 348 | }, 349 | "nbformat": 4, 350 | "nbformat_minor": 0 351 | } 352 | -------------------------------------------------------------------------------- /Model Complexity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import numpy as np\n", 13 | "%matplotlib nbagg" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Model Complexity, Overfitting and Underfitting\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from plots import plot_kneighbors_regularization\n", 32 | "plot_kneighbors_regularization()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "![underfitting and overfitting](figures/overfitting_underfitting_cartoon.svg)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "source": [ 48 | "# Validation Curves" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from sklearn.datasets import load_digits\n", 60 | "from sklearn.ensemble import RandomForestClassifier\n", 61 | "from sklearn.learning_curve import validation_curve" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "digits = load_digits()\n", 73 | "X, y = digits.data, digits.target" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "model = RandomForestClassifier(n_estimators=20)\n", 85 | "param_range = range(1, 13)\n", 86 | "training_scores, validation_scores = validation_curve(model, X, y,\n", 87 | " param_name=\"max_depth\",\n", 88 | " param_range=param_range, cv=5)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "training_scores.shape" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "def plot_validation_curve(parameter_values, train_scores, 
validation_scores):\n", 111 | " train_scores_mean = np.mean(train_scores, axis=1)\n", 112 | " train_scores_std = np.std(train_scores, axis=1)\n", 113 | " validation_scores_mean = np.mean(validation_scores, axis=1)\n", 114 | " validation_scores_std = np.std(validation_scores, axis=1)\n", 115 | "\n", 116 | " plt.fill_between(parameter_values, train_scores_mean - train_scores_std,\n", 117 | " train_scores_mean + train_scores_std, alpha=0.1,\n", 118 | " color=\"r\")\n", 119 | " plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,\n", 120 | " validation_scores_mean + validation_scores_std, alpha=0.1, color=\"g\")\n", 121 | " plt.plot(parameter_values, train_scores_mean, 'o-', color=\"r\",\n", 122 | " label=\"Training score\")\n", 123 | " plt.plot(parameter_values, validation_scores_mean, 'o-', color=\"g\",\n", 124 | " label=\"Cross-validation score\")\n", 125 | " plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)\n", 126 | " plt.legend(loc=\"best\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "plt.figure()\n", 138 | "plot_validation_curve(param_range, training_scores, validation_scores)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Exercise\n", 146 | "\n", 147 | "Plot the validation curve on the digits dataset for:\n", 148 | "* a LinearSVC with a logarithmic range of regularization parameters ``C``.\n", 149 | "* KNeighborsClassifier with a linear range of neighbors ``k``.\n", 150 | "\n", 151 | "What do you expect them to look like? How do they actually look?" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "# %load solutions/validation_curve.py" 163 | ] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 2", 169 | "language": "python", 170 | "name": "python2" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 2 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython2", 182 | "version": "2.7.9" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 0 187 | } 188 | -------------------------------------------------------------------------------- /Preprocessing and Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Preprocessing and Pipelines\n", 21 | "=============================" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "from sklearn.datasets import load_digits\n", 40 | "from sklearn.cross_validation import train_test_split\n", 41 | "digits = load_digits()\n", 42 | "X_train, X_test,
y_train, y_test = train_test_split(digits.data, digits.target)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "When cross-validating a pipeline that includes scaling, we need to estimate the mean and standard deviation separately for each training fold.\n", 50 | "To do that, we build a pipeline." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "from sklearn.pipeline import Pipeline, make_pipeline\n", 62 | "from sklearn.svm import SVC\n", 63 | "from sklearn.preprocessing import StandardScaler" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"svm\", SVC())])\n", 75 | "# or for short:\n", 76 | "make_pipeline(StandardScaler(), SVC())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "pipeline.fit(X_train, y_train)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "pipeline.predict(X_test)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Cross-validation with a pipeline\n", 113 | "---------------------------------" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "from sklearn.cross_validation import cross_val_score\n", 125 | "cross_val_score(pipeline, X_train, y_train)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Grid Search with a pipeline\n", 133 | "===========================" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "from sklearn.grid_search import GridSearchCV\n", 145 | "\n", 146 | "param_grid = {'svm__C': 10. ** np.arange(-3, 3),\n", 147 | " 'svm__gamma' : 10. ** np.arange(-3, 3)}\n", 148 | "\n", 149 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "grid_pipeline.fit(X_train, y_train)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "grid_pipeline.score(X_test, y_test)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "source": [ 180 | "# Exercises\n", 181 | "Add random features to the iris dataset using ``np.random.uniform`` and ``np.hstack``.\n", 182 | "\n", 183 | "Build a pipeline using the SelectKBest univariate feature selection from the sklearn.feature_selection module and the LinearSVC on the iris dataset.\n", 184 | "\n", 185 | "Use GridSearchCV to adjust C and the number of features selected in SelectKBest."
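, "\n", "A sketch of the building blocks (one possible approach, not the reference solution in ``solutions/pipeline_iris.py``; ``make_pipeline`` names the steps after the lowercased class names):\n", "\n", "```python\n", "from sklearn.datasets import load_iris\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.svm import LinearSVC\n", "\n", "iris = load_iris()\n", "X_noisy = np.hstack([iris.data, np.random.uniform(size=(len(iris.data), 10))])\n", "pipe = make_pipeline(SelectKBest(), LinearSVC())\n", "param_grid = {'selectkbest__k': [2, 4, 6, 10],\n", "              'linearsvc__C': 10. ** np.arange(-3, 3)}\n", "grid = GridSearchCV(pipe, param_grid=param_grid)\n", "grid.fit(X_noisy, iris.target)\n", "```"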
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": { 192 | "collapsed": false, 193 | "scrolled": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "# %load solutions/pipeline_iris.py" 198 | ] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Python 2", 204 | "language": "python", 205 | "name": "python2" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 2 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython2", 217 | "version": "2.7.9" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 0 222 | } 223 | -------------------------------------------------------------------------------- /Stochastic Gradient Descent.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn.datasets import load_digits\n", 12 | "from sklearn.linear_model import SGDClassifier\n", 13 | "from sklearn.preprocessing import StandardScaler\n", 14 | "from sklearn.cross_validation import train_test_split\n", 15 | "\n", 16 | "digits = load_digits()\n", 17 | "\n", 18 | "\n", 19 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)\n", 20 | "scaler = StandardScaler()\n", 21 | "X_train_scaled = scaler.fit_transform(X_train)\n", 22 | "X_test_scaled = scaler.transform(X_test)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "sgd = SGDClassifier(n_iter=5, loss=\"hinge\", penalty=\"l2\")\n", 34 | "sgd.fit(X_train_scaled, y_train)\n", 35 | "print(sgd.score(X_test_scaled, y_test))" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "sgd = SGDClassifier(shuffle=False)\n", 47 | "sgd.partial_fit(X_train_scaled, y_train, classes=range(10))\n", 48 | "print(sgd.score(X_test_scaled, y_test))\n", 49 | "sgd.partial_fit(X_train_scaled, y_train)\n", 50 | "print(sgd.score(X_test_scaled, y_test))\n", 51 | "sgd.partial_fit(X_train_scaled, y_train)\n", 52 | "print(sgd.score(X_test_scaled, y_test))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "source": [ 61 | "# Exercise\n", 62 | "Record the training and test loss for 10 iterations using constant learning rate and \"invscaling\" learning rate.\n", 63 | "Plot the resulting convergence curves. Try different learning rates." 
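, "\n", "A rough sketch for one learning-rate setting (assuming the scaled digits data from above; the plotting is left out):\n", "\n", "```python\n", "sgd = SGDClassifier(learning_rate='constant', eta0=0.01)\n", "train_scores, test_scores = [], []\n", "for i in range(10):\n", "    sgd.partial_fit(X_train_scaled, y_train, classes=range(10))\n", "    train_scores.append(sgd.score(X_train_scaled, y_train))\n", "    test_scores.append(sgd.score(X_test_scaled, y_test))\n", "```"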
64 | ] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 2", 70 | "language": "python", 71 | "name": "python2" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 2 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython2", 83 | "version": "2.7.10" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 0 88 | } 89 | -------------------------------------------------------------------------------- /Support Vector Machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Support Vector Machines" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "from sklearn.cross_validation import train_test_split\n", 33 | "\n", 34 | "digits = load_digits()\n", 35 | "X_train, X_test, y_train, y_test = train_test_split(digits.data / 16., digits.target % 2, random_state=2)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.svm import LinearSVC, SVC\n", 47 | "linear_svc = LinearSVC(loss=\"hinge\").fit(X_train, y_train)\n", 48 | "svc = SVC(kernel=\"linear\").fit(X_train, y_train)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "np.mean(linear_svc.predict(X_test) == svc.predict(X_test))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Kernel SVMs\n", 67 | "\n", 68 | "\n", 69 | "Predictions in a kernel-SVM are made using the formula\n", 70 | "\n", 71 | "$$\n", 72 | "\\hat{y} = \\alpha_0 + \\alpha_1 y_1 k(\\mathbf{x^{(1)}}, \\mathbf{x}) + ... 
+ \\alpha_n y_n k(\\mathbf{x^{(n)}}, \\mathbf{x}) > 0\n", 73 | "$$\n", 74 | "\n", 75 | "$$\n", 76 | "0 \\leq \\alpha_i \\leq C\n", 77 | "$$\n", 78 | "\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Radial basis function (Gaussian) kernel:\n", 86 | "$$k(\\mathbf{x}, \\mathbf{x'}) = \\exp(-\\gamma ||\\mathbf{x} - \\mathbf{x'}||^2)$$" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "from sklearn.metrics.pairwise import rbf_kernel\n", 98 | "line = np.linspace(-3, 3, 100)[:, np.newaxis]\n", 99 | "kernel_value = rbf_kernel([[0]], line, gamma=1)\n", 100 | "plt.plot(line, kernel_value.T)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "from plots import plot_svm_interactive\n", 112 | "plot_svm_interactive()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "svc = SVC().fit(X_train, y_train)\n", 124 | "svc.score(X_test, y_test)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "Cs = [0.001, 0.01, 0.1, 1, 10, 100]\n", 136 | "gammas = [0.001, 0.01, 0.1, 1, 10, 100]\n", 137 | "\n", 138 | "from sklearn.grid_search import GridSearchCV\n", 139 | "\n", 140 | "param_grid = {'C': Cs, 'gamma' : gammas}\n", 141 | "grid_search = GridSearchCV(SVC(), param_grid, cv=5)\n", 142 | "grid_search.fit(X_train, y_train)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "grid_search.score(X_test, y_test)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "# We extract just the scores\n", 165 | "scores = [x[1] for x in grid_search.grid_scores_]\n", 166 | "scores = np.array(scores).reshape(6, 6)\n", 167 | "\n", 168 | "plt.matshow(scores)\n", 169 | "plt.xlabel('gamma')\n", 170 | "plt.ylabel('C')\n", 171 | "plt.colorbar()\n", 172 | "plt.xticks(np.arange(6), param_grid['gamma'])\n", 173 | "plt.yticks(np.arange(6), param_grid['C']);" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "source": [ 182 | "# Exercise\n", 183 | "* Scale the data using StandardScaler before applying the SVC. How does the performance of the default parameters change?\n", 184 | "* Grid-Search the parameters for the scaled data. How do they differ from the previous ones?"
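, "\n", "A sketch for the first part (assuming ``X_train``/``X_test`` from above):\n", "\n", "```python\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler().fit(X_train)\n", "svc_scaled = SVC().fit(scaler.transform(X_train), y_train)\n", "print(svc_scaled.score(scaler.transform(X_test), y_test))\n", "```"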
185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 2", 191 | "language": "python", 192 | "name": "python2" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 2 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython2", 204 | "version": "2.7.10" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 0 209 | } 210 | -------------------------------------------------------------------------------- /Unsupervised Transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib nbagg\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.datasets import load_digits\n", 32 | "from sklearn.cross_validation import train_test_split\n", 33 | "import numpy as np\n", 34 | "np.set_printoptions(suppress=True)\n", 35 | "\n", 36 | "digits = load_digits()\n", 37 | "X, y = digits.data, digits.target\n", 38 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Removing mean and scaling variance\n", 46 | "===================================" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from sklearn.preprocessing import StandardScaler" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "1) Instantiate the model" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "scaler = StandardScaler()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "2) Fit using only the data." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "scaler.fit(X_train)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "3) `transform` the data (not `predict`)." 
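, "\n", "``transform`` applies the mean and scale estimated from the training data; the same fitted scaler must also be used to transform the test data."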
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "X_train_scaled = scaler.transform(X_train)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "X_train.shape" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "X_train_scaled.shape" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "The transformed version of the data has the mean removed:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "X_train_scaled.mean(axis=0)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "X_train_scaled.std(axis=0)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "X_test_transformed = scaler.transform(X_test)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "Principal Component Analysis\n", 181 | "=============================" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "0) Import the model" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "collapsed": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "from sklearn.decomposition import PCA" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "1) Instantiate the model" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "pca = PCA(n_components=2)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "2) Fit to training data" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "pca.fit(X)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "3) Transform to lower-dimensional representation" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "print(X.shape)\n", 254 | "X_pca = pca.transform(X)\n", 255 | "X_pca.shape" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Visualize\n", 263 | "----------" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": false 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "plt.figure()\n", 275 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": false 
283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "pca.components_.shape" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "plt.matshow(pca.components_[0].reshape(8, 8), cmap=\"gray\")\n", 298 | "plt.colorbar()\n", 299 | "plt.matshow(pca.components_[1].reshape(8, 8), cmap=\"gray\")\n", 300 | "plt.colorbar()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "source": [ 309 | "Manifold Learning\n", 310 | "==================" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "from sklearn.manifold import Isomap\n", 322 | "isomap = Isomap()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "X_isomap = isomap.fit_transform(X)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": false 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "source": [ 353 | "# Exercises\n", 354 | "* Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).\n", 355 | "* Extract non-negative components from the digits dataset using NMF. Visualize the resulting components. The interface of NMF is identical to the PCA one. What qualitative difference can you find compared to PCA?" 
356 |    ]
 357 |   },
 358 |   {
 359 |    "cell_type": "code",
 360 |    "execution_count": null,
 361 |    "metadata": {
 362 |     "collapsed": false
 363 |    },
 364 |    "outputs": [],
 365 |    "source": [
 366 |     "# %load solutions/digits_unsupervised.py\n",
 367 |     "from sklearn.manifold import TSNE\n",
 368 |     "from sklearn.decomposition import NMF\n",
 369 |     "\n",
 370 |     "# Compute TSNE embedding\n",
 371 |     "tsne = TSNE()\n",
 372 |     "X_tsne = tsne.fit_transform(X)\n",
 373 |     "\n",
 374 |     "# Visualize TSNE results\n",
 375 |     "plt.figure()\n",
 376 |     "plt.title(\"All classes\")\n",
 377 |     "plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)\n",
 378 |     "\n",
 379 |     "# build an NMF factorization of the digits dataset\n",
 380 |     "nmf = NMF(n_components=16).fit(X)\n",
 381 |     "\n",
 382 |     "# visualize the components\n",
 383 |     "fig, axes = plt.subplots(4, 4)\n",
 384 |     "for ax, component in zip(axes.ravel(), nmf.components_):\n",
 385 |     "    ax.imshow(component.reshape(8, 8), cmap=\"gray\", interpolation=\"nearest\")\n"
 386 |    ]
 387 |   },
 388 |   {
 389 |    "cell_type": "code",
 390 |    "execution_count": null,
 391 |    "metadata": {
 392 |     "collapsed": true
 393 |    },
 394 |    "outputs": [],
 395 |    "source": []
 396 |   }
 397 |  ],
 398 |  "metadata": {
 399 |   "kernelspec": {
 400 |    "display_name": "Python 2",
 401 |    "language": "python",
 402 |    "name": "python2"
 403 |   },
 404 |   "language_info": {
 405 |    "codemirror_mode": {
 406 |     "name": "ipython",
 407 |     "version": 2
 408 |    },
 409 |    "file_extension": ".py",
 410 |    "mimetype": "text/x-python",
 411 |    "name": "python",
 412 |    "nbconvert_exporter": "python",
 413 |    "pygments_lexer": "ipython2",
 414 |    "version": "2.7.10"
 415 |   }
 416 |  },
 417 |  "nbformat": 4,
 418 |  "nbformat_minor": 0
 419 | }
 420 | 
-------------------------------------------------------------------------------- /Using built-in and custom score functions.ipynb: --------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {
 7 |     "collapsed": true
 8 |    },
 9 |    "outputs": [],
 10 |    "source": [
 11 |     "%matplotlib inline\n",
 12 |     "import matplotlib.pyplot as plt\n",
 13 |     "import numpy as np\n",
 14 |     "np.set_printoptions(precision=2)"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Built-In and custom scoring functions\n",
 22 |     "======================================="
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "### Using built-in scoring functions"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {
 36 |     "collapsed": false
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "from sklearn.datasets import make_classification\n",
 41 |     "from sklearn.cross_validation import train_test_split\n",
 42 |     "\n",
 43 |     "X, y = make_classification(random_state=0)\n",
 44 |     "\n",
 45 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {
 52 |     "collapsed": false
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "from sklearn.linear_model import LogisticRegression\n",
 57 |     "\n",
 58 |     "lr = LogisticRegression()\n",
 59 |     "lr.fit(X_train, y_train)"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "metadata": {
 66 |     "collapsed": false
 67 |    },
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "print(lr.score(X_test, y_test))"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |
"metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "pred = lr.predict(X_test)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "from sklearn.metrics import confusion_matrix\n", 93 | "print(confusion_matrix(y_test, pred))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Binary confusion matrix:\n", 101 | "\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "
<tr><td>True Positive (TP)</td><td>False Negative (FN)</td></tr>
<tr><td>False Positive (FP)</td><td>True Negative (TN)</td></tr>
\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "\n", 113 | "$$ \\text{precision} = \\frac{TP}{FP + TP} $$\n", 114 | "\n", 115 | "$$ \\text{recall} = \\frac{TP}{FN + TP} $$\n", 116 | "\n", 117 | "$$ \\text{accuracy} = \\frac{TP + TN}{FP + FN + TP + TN} $$\n", 118 | "\n", 119 | "$$ f_1 = 2 \\frac{\\text{precision} \\cdot \\text{recall}}{\\text{precision} + \\text{recall}} $$\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from sklearn.metrics import classification_report\n", 131 | "print(classification_report(y_test, pred))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from sklearn.metrics import precision_score, f1_score\n", 143 | "print(\"precision: %f f1_score: %f\" % (precision_score(y_test, pred), f1_score(y_test, pred)))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [], 153 | "source": [ 154 | "from sklearn.metrics import roc_auc_score, average_precision_score, log_loss\n", 155 | "\n", 156 | "probs = lr.predict_proba(X_test)[:, 1]\n", 157 | "\n", 158 | "print(\"area under the roc_curve: %f\" % roc_auc_score(y_test, probs))\n", 159 | "print(\"average precision: %f\" % average_precision_score(y_test, probs))\n", 160 | "print(\"log loss: %f\" % log_loss(y_test, probs))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Scorers for cross-validation and grid-search" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "from sklearn.metrics.scorer import SCORERS\n", 179 | "print(SCORERS.keys())" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "from sklearn.cross_validation import cross_val_score\n", 191 | "\n", 192 | "cross_val_score(LogisticRegression(), X, y)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "print(\"Accuracy scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"accuracy\"))\n", 204 | "print(\"F1 scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"f1\"))\n", 205 | "print(\"AUC scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"roc_auc\"))\n", 206 | "print(\"Log loss scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"log_loss\"))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "from sklearn.grid_search import GridSearchCV\n", 218 | "\n", 219 | "param_grid = {'C': np.logspace(start=-3, stop=3, num=10)}\n", 220 | "grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring=\"log_loss\")\n", 221 | "grid_search.fit(X, y)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | 
"source": [ 232 | "grid_search.grid_scores_" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "grid_search.best_params_" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Defining your own scoring callable" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### From scratch" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": false 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "def my_accuracy_scoring(est, X, y):\n", 269 | " return np.mean(est.predict(X) == y)\n", 270 | "\n", 271 | "print(cross_val_score(LogisticRegression(), X, y))\n", 272 | "print(cross_val_score(LogisticRegression(), X, y, scoring=my_accuracy_scoring))" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": {}, 278 | "source": [ 279 | "### From a score function" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": { 286 | "collapsed": false 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "from sklearn.metrics import fbeta_score\n", 291 | "fbeta_score(y_test, pred, beta=10)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": false 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "from sklearn.metrics.scorer import make_scorer\n", 303 | "my_fbeta_scorer = make_scorer(fbeta_score, beta=10)\n", 304 | "\n", 305 | "print(cross_val_score(LogisticRegression(), X, y, scoring=my_fbeta_scorer))" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Accessing the estimator" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "def my_sparse_scoring(est, X, y):\n", 324 | " return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "from sklearn.grid_search import GridSearchCV\n", 336 | "from sklearn.svm import LinearSVC\n", 337 | "\n", 338 | "grid = GridSearchCV(LinearSVC(C=.01, dual=False),\n", 339 | " param_grid={'penalty' : ['l1', 'l2']},\n", 340 | " scoring=my_sparse_scoring)\n", 341 | "grid.fit(X, y)\n", 342 | "print(grid.best_params_)" 343 | ] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 2", 349 | "language": "python", 350 | "name": "python2" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 2 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython2", 362 | "version": "2.7.10" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 0 367 | } 368 | -------------------------------------------------------------------------------- /figures/cluster_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/nyu_ml_lectures/3c5858870bd7177e1850fdd4c721af0115e6a258/figures/cluster_comparison.png 
-------------------------------------------------------------------------------- /figures/pipeline.svg: --------------------------------------------------------------------------------
[SVG figure; only its text labels survive extraction. It diagrams pipe = make_pipeline(T1(), T2(), Classifier()): pipe.fit(X, y) runs T1.fit(X, y), T1.transform(X) -> X1, T2.fit(X1, y), T2.transform(X1) -> X2, Classifier.fit(X2, y); pipe.predict(X') runs T1.transform(X') -> X'1, T2.transform(X'1) -> X'2, Classifier.predict(X'2) -> y'.]
-------------------------------------------------------------------------------- /figures/randomized_search.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/nyu_ml_lectures/3c5858870bd7177e1850fdd4c721af0115e6a258/figures/randomized_search.png
-------------------------------------------------------------------------------- /figures/train_test_split.svg: --------------------------------------------------------------------------------
[SVG figure; only its text labels survive extraction: "All Data" is split into "Training data" and "Test data".]
-------------------------------------------------------------------------------- /figures/train_validation_test2.svg: --------------------------------------------------------------------------------
[SVG figure; only its text labels survive extraction: "All Data" is split into "Training", "Validation", and "Test".]
-------------------------------------------------------------------------------- /outline.rst: --------------------------------------------------------------------------------
 1 | copy cross-validation and intro to sklearn and api back in.
 2 | 
 3 | start off with grid-search for nearest neighbors
 4 | then go into linear models.
 5 | do exercises for classifiers, plotting coefficients, etc.
 6 | then SGD (partial fit?)
 7 | 
 8 | then do grid-search over linear models for good measure
 9 | do learning curves maybe?
10 | 11 | Do scaling and pipelines 12 | -------------------------------------------------------------------------------- /plots/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_2d_separator import plot_2d_separator 2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \ 3 | plot_regression_datasets, make_dataset 4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization 5 | from .plot_interactive_tree import plot_tree_interactive 6 | from .plot_interactive_forest import plot_forest_interactive 7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters 8 | from .plot_rbf_svm_parameters import plot_svm_interactive 9 | 10 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization', 11 | 'plot_linear_svc_regularization', 'plot_tree_interactive', 12 | 'plot_regression_datasets', 'make_dataset', 13 | "plot_forest_interactive", "plot_rbf_svm_parameters", 14 | "plot_svm_interactive"] 15 | -------------------------------------------------------------------------------- /plots/plot_2d_separator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None): 6 | if eps is None: 7 | eps = X.std() / 2. 8 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps 9 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps 10 | xx = np.linspace(x_min, x_max, 100) 11 | yy = np.linspace(y_min, y_max, 100) 12 | 13 | X1, X2 = np.meshgrid(xx, yy) 14 | X_grid = np.c_[X1.ravel(), X2.ravel()] 15 | try: 16 | decision_values = classifier.decision_function(X_grid) 17 | levels = [0] 18 | fill_levels = [decision_values.min(), 0, decision_values.max()] 19 | except AttributeError: 20 | # no decision_function 21 | decision_values = classifier.predict_proba(X_grid)[:, 1] 22 | levels = [.5] 23 | fill_levels = [0, .5, 1] 24 | 25 | if ax is None: 26 | ax = plt.gca() 27 | if fill: 28 | ax.contourf(X1, X2, decision_values.reshape(X1.shape), 29 | levels=fill_levels, colors=['blue', 'red']) 30 | else: 31 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels, 32 | colors="black") 33 | ax.set_xlim(x_min, x_max) 34 | ax.set_ylim(y_min, y_max) 35 | ax.set_xticks(()) 36 | ax.set_yticks(()) 37 | 38 | 39 | if __name__ == '__main__': 40 | from sklearn.datasets import make_blobs 41 | from sklearn.linear_model import LogisticRegression 42 | X, y = make_blobs(centers=2, random_state=42) 43 | clf = LogisticRegression().fit(X, y) 44 | plot_2d_separator(clf, X, fill=True) 45 | plt.scatter(X[:, 0], X[:, 1], c=y) 46 | plt.show() 47 | -------------------------------------------------------------------------------- /plots/plot_interactive_forest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import make_blobs 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | 8 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50) 9 | 10 | 11 | def plot_forest(max_depth=1): 12 | plt.figure() 13 | ax = plt.gca() 14 | h = 0.02 15 | 16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 18 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 19 | 20 | if max_depth != 0: 21 | forest = RandomForestClassifier(n_estimators=20, 
max_depth=max_depth,
 22 |                                         random_state=1).fit(X, y)
 23 |         Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
 24 |         Z = Z.reshape(xx.shape)
 25 |         ax.contourf(xx, yy, Z, alpha=.4)
 26 |         ax.set_title("max_depth = %d" % max_depth)
 27 |     else:
 28 |         ax.set_title("data set")
 29 |     ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
 30 |     ax.set_xlim(x_min, x_max)
 31 |     ax.set_ylim(y_min, y_max)
 32 |     ax.set_xticks(())
 33 |     ax.set_yticks(())
 34 | 
 35 | 
 36 | def plot_forest_interactive():
 37 |     from IPython.html.widgets import interactive, IntSlider
 38 |     slider = IntSlider(min=0, max=8, step=1, value=0)
 39 |     return interactive(plot_forest, max_depth=slider)
 40 | 
-------------------------------------------------------------------------------- /plots/plot_interactive_tree.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | from sklearn.datasets import make_blobs
 5 | from sklearn.tree import DecisionTreeClassifier
 6 | 
 7 | from sklearn.externals.six import StringIO  # doctest: +SKIP
 8 | from sklearn.tree import export_graphviz
 9 | from scipy.misc import imread
 10 | from scipy import ndimage
 11 | import os
 12 | 
 13 | GRAPHVIS_PATH = r"C:\Program Files (x86)\Graphviz2.38\bin"
 14 | if GRAPHVIS_PATH not in os.environ['PATH']:
 15 |     os.environ['PATH'] += ";" + GRAPHVIS_PATH
 16 | 
 17 | import re
 18 | 
 19 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
 20 | 
 21 | 
 22 | def tree_image(tree, fout=None):
 23 |     try:
 24 |         import pydot
 25 |         import a_reliable_dot_rendering  # if this import fails, rendering is skipped below
 26 |     except ImportError:
 27 |         return None
 28 |     dot_data = StringIO()
 29 |     export_graphviz(tree, out_file=dot_data)
 30 |     data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue())
 31 |     data = re.sub(r"samples = [0-9]+\\n", "", data)
 32 |     data = re.sub(r"\\nsamples = [0-9]+", "", data)
 33 | 
 34 |     graph = pydot.graph_from_dot_data(data)
 35 |     if fout is None:
 36 |         fout = "tmp.png"
 37 |     graph.write_png(fout)
 38 |     return imread(fout)
 39 | 
 40 | 
 41 | def plot_tree(max_depth=1):
 42 |     fig, ax = plt.subplots(1, 2, figsize=(15, 7))
 43 |     h = 0.02
 44 | 
 45 |     x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
 46 |     y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
 47 |     xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
 48 | 
 49 |     if max_depth != 0:
 50 |         tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
 51 |         Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
 52 |         Z = Z.reshape(xx.shape)
 53 |         faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
 54 |         faces = faces.reshape(xx.shape)
 55 |         border = ndimage.laplace(faces) != 0
 56 |         ax[0].contourf(xx, yy, Z, alpha=.4)
 57 |         ax[0].scatter(xx[border], yy[border], marker='.', s=1)
 58 |         ax[0].set_title("max_depth = %d" % max_depth)
 59 |         img = tree_image(tree)
 60 |         if img is not None:
 61 |             ax[1].imshow(img)
 62 |             ax[1].axis("off")
 63 |         else:
 64 |             ax[1].set_visible(False)
 65 |     else:
 66 |         ax[0].set_title("data set")
 67 |         ax[1].set_visible(False)
 68 |     ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
 69 |     ax[0].set_xlim(x_min, x_max)
 70 |     ax[0].set_ylim(y_min, y_max)
 71 |     ax[0].set_xticks(())
 72 |     ax[0].set_yticks(())
 73 | 
 74 | 
 75 | def plot_tree_interactive():
 76 |     from IPython.html.widgets import interactive, IntSlider
 77 |     slider = IntSlider(min=0, max=8, step=1, value=0)
 78 |     return interactive(plot_tree, max_depth=slider)
 79 |
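 80 | 
 81 | 
 82 | if __name__ == "__main__":
 83 |     # Hypothetical smoke test, not in the original module: it mirrors the
 84 |     # __main__ blocks of the sibling plot_* files and renders the decision
 85 |     # surface for one fixed depth, without the IPython widget machinery.
 86 |     plot_tree(max_depth=3)
 87 |     plt.show()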
-------------------------------------------------------------------------------- /plots/plot_kneighbors_regularization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.neighbors import KNeighborsRegressor 5 | 6 | 7 | def make_dataset(n_samples=100): 8 | rnd = np.random.RandomState(42) 9 | x = np.linspace(-3, 3, n_samples) 10 | y_no_noise = np.sin(4 * x) + x 11 | y = y_no_noise + rnd.normal(size=len(x)) 12 | return x, y 13 | 14 | 15 | def plot_regression_datasets(): 16 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 17 | for n_samples, ax in zip([10, 100, 1000], axes): 18 | x, y = make_dataset(n_samples) 19 | ax.plot(x, y, 'o', alpha=.6) 20 | 21 | 22 | def plot_kneighbors_regularization(): 23 | rnd = np.random.RandomState(42) 24 | x = np.linspace(-3, 3, 100) 25 | y_no_noise = np.sin(4 * x) + x 26 | y = y_no_noise + rnd.normal(size=len(x)) 27 | X = x[:, np.newaxis] 28 | fig, axes = plt.subplots(1, 3, figsize=(15, 5)) 29 | 30 | x_test = np.linspace(-3, 3, 1000) 31 | 32 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()): 33 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors) 34 | kneighbor_regression.fit(X, y) 35 | ax.plot(x, y_no_noise, label="true function") 36 | ax.plot(x, y, "o", label="data") 37 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]), 38 | label="prediction") 39 | ax.legend(loc="best") 40 | ax.set_title("n_neighbors = %d" % n_neighbors) 41 | 42 | if __name__ == "__main__": 43 | plot_kneighbors_regularization() 44 | plt.show() 45 | -------------------------------------------------------------------------------- /plots/plot_linear_svc_regularization.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | 6 | 7 | def plot_linear_svc_regularization(): 8 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 9 | fig, axes = plt.subplots(1, 3, figsize=(12, 4)) 10 | 11 | # a carefully hand-designed dataset lol 12 | y[7] = 0 13 | y[27] = 0 14 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 15 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 16 | 17 | for ax, C in zip(axes, [1e-2, 1, 1e2]): 18 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y]) 19 | 20 | svm = SVC(kernel='linear', C=C, tol=0.00001).fit(X, y) 21 | w = svm.coef_[0] 22 | a = -w[0] / w[1] 23 | xx = np.linspace(6, 13) 24 | yy = a * xx - (svm.intercept_[0]) / w[1] 25 | ax.plot(xx, yy, label="C = %.e" % C, c='k') 26 | ax.set_xlim(x_min, x_max) 27 | ax.set_ylim(y_min, y_max) 28 | ax.set_xticks(()) 29 | ax.set_yticks(()) 30 | ax.set_title("C = %f" % C) 31 | 32 | if __name__ == "__main__": 33 | plot_linear_svc_regularization() 34 | plt.show() 35 | -------------------------------------------------------------------------------- /plots/plot_rbf_svm_parameters.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | from sklearn.datasets import make_blobs 5 | from sklearn.externals.joblib import Memory 6 | from .plot_2d_separator import plot_2d_separator 7 | 8 | def make_handcrafted_dataset(): 9 | # a carefully hand-designed dataset lol 10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30) 11 | y[np.array([7, 27])] = 0 12 | mask = 
np.ones(len(X), dtype=np.bool)
 13 |     mask[np.array([0, 1, 5, 26])] = 0
 14 |     X, y = X[mask], y[mask]
 15 |     return X, y
 16 | 
 17 | 
 18 | def plot_rbf_svm_parameters():
 19 |     X, y = make_handcrafted_dataset()
 20 | 
 21 |     fig, axes = plt.subplots(1, 4, figsize=(15, 3))
 22 |     for ax, C in zip(axes, [1e0, 5, 10, 100]):
 23 |         ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
 24 | 
 25 |         svm = SVC(kernel='rbf', C=C).fit(X, y)
 26 |         plot_2d_separator(svm, X, ax=ax, eps=.5)
 27 |         ax.set_title("C = %f" % C)
 28 | 
 29 |     fig, axes = plt.subplots(1, 4, figsize=(15, 3))
 30 |     for ax, gamma in zip(axes, [0.1, .5, 1, 10]):
 31 |         ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
 32 |         svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y)
 33 |         plot_2d_separator(svm, X, ax=ax, eps=.5)
 34 |         ax.set_title("gamma = %f" % gamma)
 35 | 
 36 | 
 37 | def plot_svm(log_C, log_gamma):
 38 |     X, y = make_handcrafted_dataset()
 39 |     C = 10. ** log_C
 40 |     gamma = 10. ** log_gamma
 41 |     svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y)
 42 |     ax = plt.gca()
 43 |     plot_2d_separator(svm, X, ax=ax, eps=.5)
 44 |     # plot data
 45 |     ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
 46 |     # plot support vectors
 47 |     sv = svm.support_vectors_
 48 |     ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3)
 49 |     ax.set_title("C = %.4f gamma = %.4f" % (C, gamma))
 50 | 
 51 | 
 52 | def plot_svm_interactive():
 53 |     from IPython.html.widgets import interactive, FloatSlider
 54 |     C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False)
 55 |     gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False)
 56 |     return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider)
 57 | 
-------------------------------------------------------------------------------- /solutions/cross_validation_iris.py: --------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | from sklearn.cross_validation import StratifiedKFold, KFold
 3 | iris = load_iris()
 4 | X, y = iris.data, iris.target
 5 | 
 6 | print(cross_val_score(LinearSVC(), X, y, cv=KFold(len(X), 3)))
 7 | print(cross_val_score(LinearSVC(), X, y, cv=StratifiedKFold(y, 3)))
 8 | 
-------------------------------------------------------------------------------- /solutions/digits_unsupervised.py: --------------------------------------------------------------------------------
 1 | from sklearn.manifold import TSNE
 2 | from sklearn.decomposition import NMF
 3 | 
 4 | # Compute TSNE embedding
 5 | tsne = TSNE()
 6 | X_tsne = tsne.fit_transform(X)
 7 | 
 8 | # Visualize TSNE results
 9 | plt.figure()
 10 | plt.title("All classes")
 11 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
 12 | 
 13 | # build an NMF factorization of the digits dataset
 14 | nmf = NMF(n_components=16).fit(X)
 15 | 
 16 | # visualize the components
 17 | fig, axes = plt.subplots(4, 4)
 18 | for ax, component in zip(axes.ravel(), nmf.components_):
 19 |     ax.imshow(component.reshape(8, 8), cmap="gray", interpolation="nearest")
 20 |     ax.set_xticks(())
 21 |     ax.set_yticks(())
 22 | 
-------------------------------------------------------------------------------- /solutions/forests.py: --------------------------------------------------------------------------------
 1 | from sklearn.tree import DecisionTreeClassifier
 2 | from sklearn.ensemble import RandomForestClassifier
 3 | from sklearn.datasets import load_digits
 4 | from sklearn.learning_curve import validation_curve
 5 | 
 6 | digits = load_digits()
 7 | 
 8 | def
plot_validation_curve(parameter_values, train_scores, validation_scores): 9 | train_scores_mean = np.mean(train_scores, axis=1) 10 | train_scores_std = np.std(train_scores, axis=1) 11 | validation_scores_mean = np.mean(validation_scores, axis=1) 12 | validation_scores_std = np.std(validation_scores, axis=1) 13 | 14 | plt.fill_between(parameter_values, train_scores_mean - train_scores_std, 15 | train_scores_mean + train_scores_std, alpha=0.1, 16 | color="r") 17 | plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std, 18 | validation_scores_mean + validation_scores_std, alpha=0.1, color="g") 19 | plt.plot(parameter_values, train_scores_mean, 'o-', color="r", 20 | label="Training score") 21 | plt.plot(parameter_values, validation_scores_mean, 'o-', color="g", 22 | label="Cross-validation score") 23 | plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1) 24 | plt.legend(loc="best") 25 | 26 | param_range = range(1, 50) 27 | training_scores, validation_scores = validation_curve(DecisionTreeClassifier(), digits.data, digits.target, 28 | param_name="max_depth", 29 | param_range=param_range, 30 | cv=5) 31 | plt.figure() 32 | plot_validation_curve(param_range, training_scores, validation_scores) 33 | 34 | param_range = range(1, 20, 1) 35 | training_scores, validation_scores = validation_curve(RandomForestClassifier(n_estimators=100), 36 | digits.data, digits.target, 37 | param_name="max_features", 38 | param_range=param_range, 39 | cv=5) 40 | plt.figure() 41 | plot_validation_curve(param_range, training_scores, validation_scores) 42 | -------------------------------------------------------------------------------- /solutions/grid_search_forest.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier 2 | 3 | param_grid = {'max_depth': [1, 3, 5, 7, 10], 'max_features': [5, 8, 10, 20]} 4 | 5 | grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid) 6 | grid.fit(X_train, y_train) 7 | print("best parameters: %s" % grid.best_params_) 8 | print("Training set accuracy: %s" % grid.score(X_train, y_train)) 9 | print("Test set accuracy: %s" % grid.score(X_test, y_test)) 10 | 11 | scores = [x.mean_validation_score for x in grid.grid_scores_] 12 | scores = np.array(scores).reshape(5, 4) 13 | plt.matshow(scores) 14 | plt.xlabel("max_features") 15 | plt.ylabel("max_depth") 16 | -------------------------------------------------------------------------------- /solutions/grid_search_k_neighbors.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | 3 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]} 4 | 5 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid) 6 | grid.fit(X_train, y_train) 7 | print("best parameters: %s" % grid.best_params_) 8 | print("Training set accuracy: %s" % grid.score(X_train, y_train)) 9 | print("Test set accuracy: %s" % grid.score(X_test, y_test)) 10 | -------------------------------------------------------------------------------- /solutions/linear_models.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | from sklearn.grid_search import GridSearchCV 4 | from sklearn.datasets import load_digits 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.svm import LinearSVC 7 | 8 | digits = load_digits() 9 | X_train, X_test, y_train, y_test = 
train_test_split(digits.data, digits.target % 2)
 10 | 
 11 | grid = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-6, 2, 9)}, cv=5)
 12 | grid.fit(X_train, y_train)
 13 | pprint(grid.grid_scores_)
 14 | pprint(grid.score(X_test, y_test))
 15 | 
 16 | 
 17 | Cs = [10, 1, .01, 0.001, 0.0001]
 18 | for penalty in ['l1', 'l2']:
 19 |     svm_models = {}
 20 |     training_scores = []
 21 |     test_scores = []
 22 |     for C in Cs:
 23 |         svm = LinearSVC(C=C, penalty=penalty, dual=False).fit(X_train, y_train)
 24 |         training_scores.append(svm.score(X_train, y_train))
 25 |         test_scores.append(svm.score(X_test, y_test))
 26 |         svm_models[C] = svm
 27 | 
 28 |     plt.figure()
 29 |     plt.plot(training_scores, label="training scores")
 30 |     plt.plot(test_scores, label="test scores")
 31 |     plt.xticks(range(len(Cs)), Cs)
 32 |     plt.legend(loc="best")
 33 | 
 34 | plt.figure(figsize=(10, 5))
 35 | for i, C in enumerate(Cs):
 36 |     plt.plot(svm_models[C].coef_.ravel(), "o", label="C = %.2f" % C)
 37 | 
 38 | plt.legend(loc="best")
 39 | 
-------------------------------------------------------------------------------- /solutions/load_iris.py: --------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | from sklearn.datasets import load_iris
 5 | from sklearn.cross_validation import train_test_split
 6 | 
 7 | iris = load_iris()
 8 | X, y = iris.data, iris.target
 9 | 
 10 | print("Dataset size: %d number of features: %d number of classes: %d"
 11 |       % (X.shape[0], X.shape[1], len(np.unique(y))))
 12 | 
 13 | X_train, X_test, y_train, y_test = train_test_split(X, y)
 14 | 
 15 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
 16 | plt.figure()
 17 | plt.scatter(X_train[:, 2], X_train[:, 3], c=y_train)
 18 | 
-------------------------------------------------------------------------------- /solutions/pipeline_iris.py: --------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | from sklearn.feature_selection import SelectKBest
 3 | from sklearn.pipeline import make_pipeline
 4 | from sklearn.svm import LinearSVC
 5 | 
 6 | rng = np.random.RandomState(42)
 7 | iris = load_iris()
 8 | X = np.hstack([iris.data, rng.uniform(size=(len(iris.data), 5))])
 9 | X_train, X_test, y_train, y_test = train_test_split(X, iris.target, random_state=2)
 10 | 
 11 | selection_pipe = make_pipeline(SelectKBest(), LinearSVC())
 12 | param_grid = {'linearsvc__C': 10.
** np.arange(-3, 3),
 13 |               'selectkbest__k': [1, 2, 3, 4, 5, 7]}
 14 | grid = GridSearchCV(selection_pipe, param_grid, cv=5)
 15 | grid.fit(X_train, y_train)
 16 | print("Best parameters: %s" % grid.best_params_)
 17 | print("Test set performance: %s" % grid.score(X_test, y_test))
 18 | 
-------------------------------------------------------------------------------- /solutions/svms.py: --------------------------------------------------------------------------------
 1 | print("default score without scaling: %f" % SVC().fit(X_train, y_train).score(X_test, y_test))
 2 | 
 3 | from sklearn.preprocessing import StandardScaler
 4 | scaler = StandardScaler()
 5 | X_train_scaled = scaler.fit_transform(X_train)
 6 | X_test_scaled = scaler.transform(X_test)
 7 | print("default score with scaling: %f" % SVC().fit(X_train_scaled, y_train).score(X_test_scaled, y_test))
 8 | 
 9 | grid_search.fit(X_train_scaled, y_train)
 10 | 
 11 | # We extract just the mean validation scores
 12 | scores = [x[1] for x in grid_search.grid_scores_]
 13 | scores = np.array(scores).reshape(6, 6)
 14 | 
 15 | plt.matshow(scores)
 16 | plt.xlabel('gamma')
 17 | plt.ylabel('C')
 18 | plt.colorbar()
 19 | plt.xticks(np.arange(6), param_grid['gamma'])
 20 | plt.yticks(np.arange(6), param_grid['C'])
 21 | 
-------------------------------------------------------------------------------- /solutions/train_iris.py: --------------------------------------------------------------------------------
 1 | from sklearn.datasets import load_iris
 2 | from sklearn.neighbors import KNeighborsClassifier
 3 | from sklearn.cross_validation import train_test_split
 4 | 
 5 | iris = load_iris()
 6 | X, y = iris.data, iris.target
 7 | 
 8 | X_train, X_test, y_train, y_test = train_test_split(X, y)
 9 | 
 10 | knn = KNeighborsClassifier(n_neighbors=3)
 11 | knn.fit(X_train, y_train)
 12 | 
 13 | print("test set score of knn: %f" % knn.score(X_test, y_test))
 14 | 
-------------------------------------------------------------------------------- /solutions/validation_curve.py: --------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | 
 3 | from sklearn.svm import LinearSVC
 4 | from sklearn.neighbors import KNeighborsClassifier
 5 | from sklearn.learning_curve import validation_curve
 6 | 
 7 | 
 8 | cs = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
 9 | training_scores, test_scores = validation_curve(LinearSVC(), X, y,
 10 |                                                 param_name="C", param_range=cs)
 11 | plt.figure()
 12 | plot_validation_curve(range(7), training_scores, test_scores)
 13 | 
 14 | 
 15 | ks = range(1, 11)  # n_neighbors must be positive
 16 | training_scores, test_scores = validation_curve(KNeighborsClassifier(), X, y,
 17 |                                                 param_name="n_neighbors", param_range=ks)
 18 | plt.figure()
 19 | plot_validation_curve(ks, training_scores, test_scores)
 20 | 
--------------------------------------------------------------------------------