├── .gitignore ├── Advanced Scoring.ipynb ├── Combining Pipelines and GridSearchCV.ipynb ├── Cross-validation.ipynb ├── Custom Estimators.ipynb ├── Grid Searches for Hyper Parameters.ipynb ├── LICENSE ├── Out Of Core Learning for Text.ipynb ├── Out Of Core Learning.ipynb ├── Preprocessing and Pipelines.ipynb ├── README.md ├── Working With Text Data.ipynb ├── advanced-sklearn-boston-nlp-2016.odp ├── advanced-sklearn-boston-nlp-2016.pdf ├── environment.yml └── solutions ├── cross_validation_iris.py ├── digits_tsne.py ├── grid_search_k_neighbors.py ├── load_iris.py ├── out_of_core.py ├── pipeline_knn.py ├── text_pipeline.py └── train_iris.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/aclImdb/* 2 | .ipynb_checkpoints/* 3 | data/batch* 4 | data/movies.txt 5 | -------------------------------------------------------------------------------- /Advanced Scoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "\n", 15 | "from sklearn.datasets import load_digits\n", 16 | "from sklearn.cross_validation import train_test_split\n", 17 | "np.set_printoptions(precision=2)\n", 18 | "\n", 19 | "digits = load_digits()\n", 20 | "X, y = digits.data, digits.target == 3\n", 21 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from sklearn.svm import SVC" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.cross_validation import cross_val_score\n", 44 | "cross_val_score(SVC(), X_train, y_train)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from sklearn.dummy import DummyClassifier\n", 56 | "cross_val_score(DummyClassifier(\"most_frequent\"), X_train, y_train)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.metrics import roc_curve, roc_auc_score\n", 77 | "\n", 78 | "for gamma in [.01, .1, 1]:\n", 79 | " plt.xlabel(\"FPR\")\n", 80 | " plt.ylabel(\"TPR\")\n", 81 | " svm = SVC(gamma=gamma).fit(X_train, y_train)\n", 82 | " decision_function = svm.decision_function(X_test)\n", 83 | " fpr, tpr, _ = roc_curve(y_test, decision_function)\n", 84 | " acc = svm.score(X_test, y_test)\n", 85 | " auc = roc_auc_score(y_test, svm.decision_function(X_test))\n", 86 | " plt.plot(fpr, tpr, label=\"acc:%.2f auc:%.2f\" % (acc, auc))\n", 87 | " print()\n", 88 | "plt.legend(loc=\"best\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "from sklearn.metrics.scorer import SCORERS" 100 | ] 101 | }, 102 | { 103 | "cell_type": 
"code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "SCORERS.keys()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# Defining your own scoring function" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "def my_accuracy(est, X, y):\n", 129 | " return np.mean(est.predict(X) == y)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "from sklearn.svm import LinearSVC\n", 141 | "print(cross_val_score(LinearSVC(random_state=0), X, y, cv=5))\n", 142 | "print(cross_val_score(LinearSVC(random_state=0), X, y, cv=5, scoring=my_accuracy))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.5.1" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 0 176 | } 177 | -------------------------------------------------------------------------------- /Combining Pipelines and GridSearchCV.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Pipelining becomes powerful with GridSearchCV\n", 8 | "-----------------------------------------------" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": false 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "from sklearn.svm import LinearSVC\n", 20 | "from sklearn.pipeline import make_pipeline\n", 21 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 22 | "from sklearn.grid_search import GridSearchCV\n", 23 | "import numpy as np" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn.datasets import load_iris\n", 35 | "from sklearn.cross_validation import train_test_split\n", 36 | "\n", 37 | "\n", 38 | "iris = load_iris()\n", 39 | "X, y = iris.data, iris.target\n", 40 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "The wrong way to do GridSearchCV with preprocessing:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "from sklearn.preprocessing import StandardScaler\n", 59 | "from sklearn.svm import SVC\n", 60 | "\n", 61 | "scaler = StandardScaler()\n", 62 | "X_preprocessed = scaler.fit_transform(X_train)\n", 63 | "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma': 10. 
** np.arange(-3, 3)}\n", 64 | "\n", 65 | "grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "The right way to do GridSearchCV with preprocessing" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "from sklearn.pipeline import make_pipeline\n", 84 | "\n", 85 | "param_grid_pipeline = {'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}\n", 86 | "\n", 87 | "scaler_pipe = make_pipeline(StandardScaler(), SVC())\n", 88 | "grid = GridSearchCV(scaler_pipe, param_grid=param_grid_pipeline, cv=5)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "grid.fit(X_train, y_train)\n", 100 | "print(grid.best_params_)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "from sklearn.pipeline import make_pipeline\n", 112 | "from sklearn.svm import SVC\n", 113 | "from sklearn.feature_selection import SelectKBest\n", 114 | "\n", 115 | "\n", 116 | "param_grid = {'selectkbest__k': [1, 2, 3, 4], 'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}\n", 117 | "\n", 118 | "scaler_pipe = make_pipeline(SelectKBest(), SVC())\n", 119 | "grid = GridSearchCV(scaler_pipe, param_grid=param_grid, cv=5)\n", 120 | "grid.fit(X_train, y_train)\n", 121 | "print(grid.best_params_)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "text_pipe = make_pipeline(TfidfVectorizer(), LinearSVC())\n", 133 | "param_grid = {'tfidifvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 'linearsvc__C': 10. 
** np.arange(-3, 3)}\n", 134 | "\n", 135 | "grid = GridSearchCV(text_pipe, param_grid=param_grid, cv=5)" 136 | ] 137 | } 138 | ], 139 | "metadata": {}, 140 | "nbformat": 4, 141 | "nbformat_minor": 0 142 | } 143 | -------------------------------------------------------------------------------- /Cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib notebook\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Cross-Validation\n", 21 | "----------------------------------------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from sklearn.datasets import load_iris" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "iris = load_iris()\n", 44 | "X = iris.data\n", 45 | "y = iris.target" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "from sklearn.cross_validation import cross_val_score\n", 57 | "from sklearn.svm import LinearSVC" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "cross_val_score(LinearSVC(), X, y, cv=5)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Let's go to a binary task for a moment" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "y % 2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "cross_val_score(LinearSVC(), X, y % 2)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.metrics.scorer import SCORERS\n", 142 | "print(SCORERS.keys())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "There are other ways to do cross-valiation" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | 
"collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.cross_validation import ShuffleSplit\n", 161 | "\n", 162 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n", 163 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit\n", 175 | "\n", 176 | "def plot_cv(cv, n_samples):\n", 177 | " masks = []\n", 178 | " for train, test in cv:\n", 179 | " mask = np.zeros(n_samples, dtype=bool)\n", 180 | " mask[test] = 1\n", 181 | " masks.append(mask)\n", 182 | " plt.figure(figsize=(10, 4))\n", 183 | " plt.subplots_adjust(left=0, bottom=0, right=1, top=1)\n", 184 | " plt.imshow(masks, interpolation='none')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "plot_cv(StratifiedKFold(y, n_folds=5), len(y))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), \n", 218 | " len(iris.target))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "source": [ 227 | "# Exercises\n", 228 | "Use KFold cross validation and StratifiedKFold cross validation (3 or 5 folds) for LinearSVC on the iris dataset.\n", 229 | "Why are the results so different? How could you get more similar results?" 
230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "# %load solutions/cross_validation_iris.py" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.1" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 0 265 | } 266 | -------------------------------------------------------------------------------- /Custom Estimators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn.utils.validation import check_X_y\n", 12 | "\n", 13 | "class MyEstimator(object):\n", 14 | " def __init__(self, my_parameter=\"stuff\"):\n", 15 | " self.my_parameter = my_parameter\n", 16 | " def fit(self, X, y):\n", 17 | " X, y = check_X_y(X, y)\n", 18 | " return self\n", 19 | " def set_params(self, **kwargs):\n", 20 | " for key, value in kwargs.items():\n", 21 | " if key == \"my_parameter\":\n", 22 | " self.my_parameter = value\n", 23 | " else:\n", 24 | " raise ValueError(\"Unknown parameter %s\" % key)\n", 25 | " return self\n", 26 | " def get_params(self, deep=None):\n", 27 | " return {'my_parameter': self.my_parameter}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "est = MyEstimator(my_parameter=\"bla\")\n", 39 | "print(est) " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "from sklearn.utils.estimator_checks import check_estimator\n", 51 | "check_estimator(MyEstimator)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "from sklearn.utils.validation import check_X_y, check_array\n", 63 | "\n", 64 | "class MyBrokenEstimator(object):\n", 65 | " def __init__(self, my_parameter=\"stuff\"):\n", 66 | " self.my_parameter = my_parameter + \" more stuff\"\n", 67 | " def fit(self, X, y):\n", 68 | " X, y = check_X_y(X, y)\n", 69 | " return self\n", 70 | " def set_params(self, **kwargs):\n", 71 | " for key, value in kwargs.items():\n", 72 | " if key == \"my_parameter\":\n", 73 | " self.my_parameter = value\n", 74 | " else:\n", 75 | " raise ValueError(\"Unknown parameter %s\" % key)\n", 76 | " return self\n", 77 | " def get_params(self, deep=None):\n", 78 | " return {'my_parameter': self.my_parameter}" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "check_estimator(MyBrokenEstimator)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.base import BaseEstimator\n", 101 | "\n", 
102 | "class MyInheritingEstimator(BaseEstimator):\n", 103 | " def __init__(self, my_parameter=\"stuff\"):\n", 104 | " self.my_parameter = my_parameter\n", 105 | " def fit(self, X, y):\n", 106 | " X, y = check_X_y(X, y)\n", 107 | " return self" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "est = MyInheritingEstimator(my_parameter=\"bla\")\n", 119 | "print(est) " 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "check_estimator(MyInheritingEstimator)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.base import TransformerMixin\n", 142 | "class MyTransformer(BaseEstimator, TransformerMixin):\n", 143 | " def __init__(self, my_parameter=\"stuff\"):\n", 144 | " self.my_parameter = my_parameter\n", 145 | " def fit(self, X, y):\n", 146 | " X, y = check_X_y(X, y)\n", 147 | " self.n_features_ = X.shape[1]\n", 148 | " return self\n", 149 | " def transform(self, X):\n", 150 | " X = check_array(X)\n", 151 | " if X.shape[1] != self.n_features_:\n", 152 | " raise ValueError(\"lol wat\")\n", 153 | " return X - 2" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "check_estimator(MyTransformer)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "import numpy as np\n", 176 | "from sklearn.base import ClassifierMixin\n", 177 | "\n", 178 | "class MyBrokenClassifier(BaseEstimator, ClassifierMixin):\n", 179 | " def __init__(self, my_parameter=\"stuff\"):\n", 180 | " self.my_parameter = my_parameter\n", 181 | " def fit(self, X, y):\n", 182 | " X, y = check_X_y(X, y)\n", 183 | " return self\n", 184 | " def predict(self, X):\n", 185 | " X = check_array(X)\n", 186 | " return np.array([1, 2])" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "check_estimator(MyBrokenClassifier)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.metrics import euclidean_distances\n", 209 | "from sklearn.utils.multiclass import unique_labels\n", 210 | "from sklearn.utils.validation import check_is_fitted\n", 211 | "\n", 212 | "class MyClassifier(BaseEstimator, ClassifierMixin):\n", 213 | " def __init__(self, my_parameter=\"stuff\"):\n", 214 | " self.my_parameter = my_parameter\n", 215 | " \n", 216 | " def fit(self, X, y):\n", 217 | " X, y = check_X_y(X, y)\n", 218 | " self.classes_ = unique_labels(y)\n", 219 | " self.X_ = X\n", 220 | " self.y_ = y\n", 221 | " return self\n", 222 | " \n", 223 | " def predict(self, X):\n", 224 | " check_is_fitted(self, [\"X_\", \"y_\"])\n", 225 | " X = check_array(X)\n", 226 | " closest = np.argmin(euclidean_distances(X, self.X_), axis=1)\n", 227 | " return self.y_[closest]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | 
"metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "check_estimator(MyClassifier)" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.5.1" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 0 263 | } 264 | -------------------------------------------------------------------------------- /Grid Searches for Hyper Parameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Grid Searches\n", 8 | "=================" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Grid-Search with build-in cross validation" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from sklearn.grid_search import GridSearchCV\n", 27 | "from sklearn.svm import SVC" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "from sklearn.datasets import load_digits\n", 39 | "from sklearn.cross_validation import train_test_split\n", 40 | "digits = load_digits()\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Define parameter grid:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np\n", 60 | "\n", 61 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n", 62 | " 'gamma' : 10. ** np.arange(-5, 0)}\n", 63 | " \n", 64 | "\n", 65 | "np.set_printoptions(suppress=True)\n", 66 | "print(param_grid)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3, cv=5)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "A GridSearchCV object behaves just like a normal classifier." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false, 92 | "scrolled": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "grid_search.fit(X_train, y_train)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false, 104 | "scrolled": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "grid_search.predict(X_test)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "grid_search.score(X_test, y_test)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "grid_search.best_params_" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# We extract just the scores\n", 142 | "%matplotlib notebook\n", 143 | "import matplotlib.pyplot as plt\n", 144 | "\n", 145 | "scores = [x[1] for x in grid_search.grid_scores_]\n", 146 | "scores = np.array(scores).reshape(6, 5)\n", 147 | "\n", 148 | "plt.matshow(scores)\n", 149 | "plt.xlabel('gamma')\n", 150 | "plt.ylabel('C')\n", 151 | "plt.colorbar()\n", 152 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 153 | "plt.yticks(np.arange(6), param_grid['C']);" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "source": [ 162 | "# Exercises\n", 163 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "# %load solutions/grid_search_k_neighbors.py" 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.5.1" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 0 199 | } 200 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. 
These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. 
Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 
114 | 115 | For more information, please see <http://creativecommons.org/publicdomain/zero/1.0/> 116 | 117 | -------------------------------------------------------------------------------- /Out Of Core Learning for Text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import numpy as np\n", 13 | "%matplotlib notebook" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Out of core text classification with the Hashing Vectorizer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Using the Amazon movie reviews collected by J. McAuley and J. Leskovec\n", 28 | "\n", 29 | "https://snap.stanford.edu/data/web-Movies.html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "print(\"file size: %d GB\" % (os.path.getsize(\"data/movies.txt\") / 1024 ** 3))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "with open(\"data/movies.txt\", 'r', errors='ignore') as f:\n", 53 | " print(f.read(4000))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "def review_iter(f):\n", 65 | " current_post = []\n", 66 | " for line in f:\n", 67 | " if line.startswith(\"product/productId\"):\n", 68 | " if len(current_post):\n", 69 | " score = current_post[3].replace(\"review/score:\", \"\", 1).strip()\n", 70 | " review = \"\".join(current_post[6:]).replace(\"review/text:\", \"\", 1).strip()\n", 71 | " # there are about 20 posts with linebreaks in them.\n", 72 | " # we just ignore those for simplicity\n", 73 | " try:\n", 74 | " yield int(float(score)), review\n", 75 | " except ValueError:\n", 76 | " current_post = []\n", 77 | " continue\n", 78 | " current_post = []\n", 79 | " else:\n", 80 | " current_post.append(line)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": false, 88 | "scrolled": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "n_reviews = 0\n", 93 | "with open(\"data/movies.txt\", 'r', errors='ignore') as f:\n", 94 | " for r in review_iter(f):\n", 95 | " n_reviews += 1\n", 96 | "\n", 97 | "print(\"Number of reviews: %d\" % n_reviews)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "from itertools import islice\n", 109 | "\n", 110 | "with open(\"data/movies.txt\", 'r', errors='ignore') as f:\n", 111 | " reviews = islice(review_iter(f), 10000)\n", 112 | " scores, texts = zip(*reviews)\n", 113 | "print(np.bincount(scores))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from itertools import zip_longest # use izip_longest on Python 2\n", 134 | "# from the itertools recipes\n", 135 | "def grouper(iterable, n, 
fillvalue=None):\n", 136 | " \"Collect data into fixed-length chunks or blocks\"\n", 137 | " # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx\n", 138 | " args = [iter(iterable)] * n\n", 139 | " return zip_longest(fillvalue=fillvalue, *args)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "def preprocess_batch(reviews):\n", 151 | " # score == 3 is \"neutral\", we only want \"positive\" or \"negative\"\n", 152 | " reviews_filtered = [r for r in reviews if r is not None and r[0] != 3]\n", 153 | " scores, texts = zip(*reviews_filtered)\n", 154 | " polarity = np.array(scores) > 3\n", 155 | " return polarity, texts" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false, 163 | "scrolled": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "from sklearn.feature_extraction.text import HashingVectorizer\n", 168 | "\n", 169 | "vectorizer = HashingVectorizer(decode_error=\"ignore\")\n", 170 | "\n", 171 | "with open(\"data/movies.txt\") as f:\n", 172 | " reviews = islice(review_iter(f), 10000)\n", 173 | " polarity_test, texts_test = preprocess_batch(reviews)\n", 174 | " X_test = vectorizer.transform(texts_test)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "from sklearn.linear_model import SGDClassifier\n", 186 | "\n", 187 | "sgd = SGDClassifier(random_state=0)\n", 188 | "\n", 189 | "accuracies = []\n", 190 | "with open(\"data/movies.txt\") as f:\n", 191 | " training_set = islice(review_iter(f), 10000, None)\n", 192 | " batch_iter = grouper(training_set, 10000)\n", 193 | " for batch in batch_iter:\n", 194 | " polarity, texts = preprocess_batch(batch)\n", 195 | " X = vectorizer.transform(texts)\n", 196 | " sgd.partial_fit(X, polarity, classes=[0, 1])\n", 197 | " accuracies.append(sgd.score(X_test, polarity_test))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "plt.plot(accuracies)" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.4.3" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 0 233 | } 234 | -------------------------------------------------------------------------------- /Out Of Core Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# write out some toy data\n", 12 | "from sklearn.datasets import load_digits\n", 13 | "import pickle\n", 14 | "\n", 15 | "digits = load_digits()\n", 16 | "\n", 17 | "X, y = digits.data, digits.target\n", 18 | "\n", 19 | "for i in range(10):\n", 20 | " pickle.dump((X[i::10] / 16., y[i::10]), open(\"data/batch_%02d.pickle\" % i, \"wb\"), -1)" 21 | ] 22 | }, 23 | { 24 | 
"cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.linear_model import SGDClassifier" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "sgd = SGDClassifier(random_state=1)\n", 43 | "\n", 44 | "for i in range(9):\n", 45 | " X_batch, y_batch = pickle.load(open(\"data/batch_%02d.pickle\" % i, \"rb\"))\n", 46 | " sgd.partial_fit(X_batch, y_batch, classes=range(10))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "X_test, y_test = pickle.load(open(\"data/batch_09.pickle\", \"rb\"))\n", 58 | "\n", 59 | "sgd.score(X_test, y_test)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Exercise\n", 67 | "Iterate over the dataset ten times, print the error on the hold-out batch (09) for each pass.\n", 68 | "Try changing the learning rate (and eta0) and see how that affects results." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "# %load solutions/out_of_core.py" 80 | ] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.4.3" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 0 104 | } 105 | -------------------------------------------------------------------------------- /Preprocessing and Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Preprocessing and Pipelines\n", 8 | "=============================" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": false 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "from sklearn.datasets import load_digits\n", 20 | "from sklearn.cross_validation import train_test_split\n", 21 | "digits = load_digits()\n", 22 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 23 | " digits.target)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Cross-validated pipelines including scaling, we need to estimate mean and standard deviation separately for each fold.\n", 31 | "To do that, we build a pipeline." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "from sklearn.pipeline import Pipeline, make_pipeline\n", 43 | "from sklearn.svm import SVC\n", 44 | "from sklearn.preprocessing import StandardScaler" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "standard_scaler = StandardScaler()\n", 56 | "standard_scaler.fit(X_train)\n", 57 | "X_train_scaled = standard_scaler.transform(X_train)\n", 58 | "svm = SVC().fit(X_train_scaled, y_train)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "#pipeline = Pipeline([(\"scaler\", StandardScaler()),\n", 70 | "# (\"svm\", SVC())])\n", 71 | "# short version:\n", 72 | "pipeline = make_pipeline(StandardScaler(), SVC())" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "pipeline.fit(X_train, y_train)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "pipeline.score(X_test, y_test)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": false, 102 | "scrolled": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "pipeline.predict(X_test)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Cross-validation with a pipeline\n", 114 | "---------------------------------" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from sklearn.cross_validation import cross_val_score\n", 126 | "cross_val_score(pipeline, X_train, y_train)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Grid Search with a pipeline\n", 134 | "===========================" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import numpy as np\n", 146 | "from sklearn.grid_search import GridSearchCV\n", 147 | "\n", 148 | "param_grid = {'svc__C': 10. ** np.arange(-3, 3),\n", 149 | " 'svc__gamma' : 10. 
** np.arange(-3, 3)\n", 150 | " }\n", 151 | "\n", 152 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid) " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "grid_pipeline.fit(X_train, y_train)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "grid_pipeline.score(X_test, y_test)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "source": [ 183 | "# Exercise\n", 184 | "Make a pipeline out of the StandardScaler and KNeighborsClassifier and search over the number of neighbors.\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# %load solutions/pipeline_knn.py" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.5.1" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 0 220 | } 221 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced machine learning with Scikit-learn 2 | This repository contains material and slides for the Boston NLP meetup March 23rd 2016. 3 | 4 | Slides are [here](https://github.com/amueller/advanced-sklearn-boston-nlp-2016/raw/master/advanced-sklearn-boston-nlp-2016.pdf). 5 | 6 | The following packages are required to run the notebooks: 7 | 8 | - scikit-learn >= 0.16 (some might require 0.17.1) 9 | - matplotlib >= 1.3 10 | - numpy >= 1.5 11 | - IPython >= 4.0 12 | - Jupyter Notebook >= 4.0 13 | 14 | The easiest way to install all requirements is to install the free Anaconda Python distribution: 15 | https://www.continuum.io/downloads (OS X, Windows, Linux) 16 | 17 | All material in this repository is licensed CC-0 18 | -------------------------------------------------------------------------------- /Working With Text Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib notebook\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Text Classification of Movie Reviews" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Get data from http://ai.stanford.edu/~amaas/data/sentiment/ and extract into the data folder." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "from sklearn.datasets import load_files\n", 39 | "\n", 40 | "reviews_train = load_files(\"data/aclImdb/train/\")\n", 41 | "text_train, y_train = reviews_train.data, reviews_train.target" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "print(\"Number of documents in training data: %d\" % len(text_train))\n", 53 | "print(np.bincount(y_train))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "reviews_test = load_files(\"data/aclImdb/test/\")\n", 65 | "text_test, y_test = reviews_test.data, reviews_test.target\n", 66 | "print(\"Number of documents in test data: %d\" % len(text_test))\n", 67 | "print(np.bincount(y_test))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "print(text_train[1])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "print(y_train[1])" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.feature_extraction.text import CountVectorizer\n", 101 | "cv = CountVectorizer()\n", 102 | "cv.fit(text_train)\n", 103 | "\n", 104 | "len(cv.vocabulary_)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false, 112 | "scrolled": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "print(cv.get_feature_names()[:50])\n", 117 | "print(cv.get_feature_names()[50000:50050])" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "X_train = cv.transform(text_train)\n", 129 | "X_train" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "print(text_train[19726])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "X_train[19726].nonzero()[1]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "X_test = cv.transform(text_test)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from sklearn.svm import LinearSVC\n", 174 | "\n", 175 | "svm = LinearSVC()\n", 176 | "svm.fit(X_train, y_train)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "svm.score(X_train, y_train)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 
194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "svm.score(X_test, y_test)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n", 210 | " # get coefficients with large absolute values \n", 211 | " coef = classifier.coef_.ravel()\n", 212 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n", 213 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n", 214 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n", 215 | " # plot them\n", 216 | " plt.figure(figsize=(15, 5))\n", 217 | " colors = [\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]]\n", 218 | " plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors)\n", 219 | " feature_names = np.array(feature_names)\n", 220 | " plt.subplots_adjust(bottom=0.3)\n", 221 | " plt.xticks(np.arange(1, 1 + 2 * n_top_features), feature_names[interesting_coefficients], rotation=60, ha=\"right\");\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "visualize_coefficients(svm, cv.get_feature_names())" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "svm = LinearSVC(C=0.001)\n", 244 | "svm.fit(X_train, y_train)\n", 245 | "svm.score(X_test, y_test)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "visualize_coefficients(svm, cv.get_feature_names())" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "# Start pipelines" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": false 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "from sklearn.pipeline import make_pipeline\n", 275 | "text_pipe = make_pipeline(CountVectorizer(), LinearSVC())\n", 276 | "text_pipe.fit(text_train, y_train)\n", 277 | "text_pipe.score(text_test, y_test)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false, 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "from sklearn.grid_search import GridSearchCV\n", 290 | "\n", 291 | "param_grid = {'linearsvc__C': np.logspace(-5, 0, 6)}\n", 292 | "grid = GridSearchCV(text_pipe, param_grid, cv=5)\n", 293 | "grid.fit(text_train, y_train)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "grid.best_params_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],\n", 316 | " grid.best_estimator_.named_steps['countvectorizer'].get_feature_names())" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": false 
324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "grid.score(text_test, y_test)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "# N-Grams" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": false, 342 | "scrolled": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "text_pipe = make_pipeline(CountVectorizer(), LinearSVC())\n", 347 | "from sklearn.grid_search import GridSearchCV\n", 348 | "\n", 349 | "\n", 350 | "param_grid = {'linearsvc__C': np.logspace(-3, 2, 6),\n", 351 | " \"countvectorizer__ngram_range\": [(1, 1), (1, 2)]}\n", 352 | "\n", 353 | "grid = GridSearchCV(text_pipe, param_grid, cv=5)\n", 354 | "\n", 355 | "grid.fit(text_train, y_train)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "scores = np.array([score.mean_validation_score for score in grid.grid_scores_]).reshape(2, -1)  # 2 n-gram ranges x 6 C values\n", 367 | "plt.matshow(scores)\n", 368 | "plt.ylabel(\"n-gram range\")\n", 369 | "plt.yticks(range(2), param_grid[\"countvectorizer__ngram_range\"])\n", 370 | "plt.xlabel(\"C\")\n", 371 | "plt.xticks(range(6), param_grid[\"linearsvc__C\"]);\n", 372 | "plt.colorbar()" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": false 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "grid.best_params_" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],\n", 395 | " grid.best_estimator_.named_steps['countvectorizer'].get_feature_names())" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "grid.score(text_test, y_test)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## Look at spaCy and NLTK" 414 | ] 415 | } 416 | ], 417 | "metadata": { 418 | "kernelspec": { 419 | "display_name": "Python 3", 420 | "language": "python", 421 | "name": "python3" 422 | }, 423 | "language_info": { 424 | "codemirror_mode": { 425 | "name": "ipython", 426 | "version": 3 427 | }, 428 | "file_extension": ".py", 429 | "mimetype": "text/x-python", 430 | "name": "python", 431 | "nbconvert_exporter": "python", 432 | "pygments_lexer": "ipython3", 433 | "version": "3.5.1" 434 | } 435 | }, 436 | "nbformat": 4, 437 | "nbformat_minor": 0 438 | } 439 | -------------------------------------------------------------------------------- /advanced-sklearn-boston-nlp-2016.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/advanced-sklearn-boston-nlp-2016/bd59b30774da68b4d2ddd80148feff6ed2f8b608/advanced-sklearn-boston-nlp-2016.odp -------------------------------------------------------------------------------- /advanced-sklearn-boston-nlp-2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/advanced-sklearn-boston-nlp-2016/bd59b30774da68b4d2ddd80148feff6ed2f8b608/advanced-sklearn-boston-nlp-2016.pdf --------------------------------------------------------------------------------
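A note on the pattern used in the Working With Text Data notebook above: because the vectorizer and the classifier live in one pipeline, a single grid search can tune the vectorizer's ngram_range together with the classifier's C, refitting each candidate vectorizer only on the training folds. Below is a minimal self-contained sketch of that pattern, written against the older sklearn.grid_search module pinned in environment.yml; the tiny made-up corpus, its labels, and cv=2 are illustrative assumptions standing in for text_train / y_train, not part of the repo.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV

# hypothetical toy corpus and labels, stand-ins for text_train / y_train
docs = ["good movie", "great film", "bad movie", "awful film",
        "not good at all", "not bad at all", "really great", "really awful"]
labels = [1, 1, 0, 0, 0, 1, 1, 0]

pipe = make_pipeline(CountVectorizer(), LinearSVC())
# "<stepname>__<param>" addresses a parameter of one pipeline step
param_grid = {"countvectorizer__ngram_range": [(1, 1), (1, 2)],
              "linearsvc__C": np.logspace(-3, 2, 6)}
grid = GridSearchCV(pipe, param_grid, cv=2)
grid.fit(docs, labels)
print(grid.best_params_)
print(grid.best_score_)

The "not good" / "not bad" documents are the motivation for searching over ngram_range: with unigrams alone they are indistinguishable from the documents they contradict, while bigrams can capture the negation.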
/environment.yml: -------------------------------------------------------------------------------- 1 | name: sklearn 2 | dependencies: 3 | - libgfortran=1.0=0 4 | - numpy=1.10.1=py34_0 5 | - openblas=0.2.14=3 6 | - openssl=1.0.2d=0 7 | - pip=7.1.2=py34_0 8 | - python=3.4.3=2 9 | - readline=6.2=2 10 | - scikit-learn=0.17=np110py34_1 11 | - scipy=0.16.0=np110py34_1 12 | - setuptools=18.5=py34_0 13 | - sqlite=3.8.4.1=1 14 | - tk=8.5.18=0 15 | - wheel=0.26.0=py34_1 16 | - xz=5.0.5=0 17 | - zlib=1.2.8=0 18 | - pip: 19 | - apache-libcloud==0.19.0 20 | - backports.ssl-match-hostname==3.4.0.2 21 | - futures==3.0.3 22 | - pkginfo==1.2.1 23 | - requests-toolbelt==0.4.0 24 | - twine==1.6.4 25 | - wheelhouse-uploader==0.7.4 26 | 27 | -------------------------------------------------------------------------------- /solutions/cross_validation_iris.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.svm import LinearSVC 3 | from sklearn.cross_validation import cross_val_score, StratifiedKFold, KFold 4 | 5 | iris = load_iris() 6 | X, y = iris.data, iris.target 7 | 8 | print(cross_val_score(LinearSVC(), X, y, cv=KFold(len(X), 3))) 9 | print(cross_val_score(LinearSVC(), X, y, cv=StratifiedKFold(y, 3))) 10 | -------------------------------------------------------------------------------- /solutions/digits_tsne.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.manifold import TSNE 3 | 4 | # X, y are the digits data and labels loaded earlier in the notebook 5 | tsne = TSNE() 6 | X_tsne = tsne.fit_transform(X) 7 | plt.figure() 8 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y) 9 | -------------------------------------------------------------------------------- /solutions/grid_search_k_neighbors.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | from sklearn.grid_search import GridSearchCV 3 | 4 | # X_train, X_test, y_train, y_test come from the train/test split in the notebook 5 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]} 6 | 7 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid) 8 | grid.fit(X_train, y_train) 9 | print("best parameters: %s" % grid.best_params_) 10 | print("Training set accuracy: %s" % grid.score(X_train, y_train)) 11 | print("Test set accuracy: %s" % grid.score(X_test, y_test)) 12 | -------------------------------------------------------------------------------- /solutions/load_iris.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import load_iris 5 | from sklearn.cross_validation import train_test_split 6 | 7 | iris = load_iris() 8 | X, y = iris.data, iris.target 9 | 10 | print("Dataset size: %d number of features: %d number of classes: %d" 11 | % (X.shape[0], X.shape[1], len(np.unique(y)))) 12 | 13 | X_train, X_test, y_train, y_test = train_test_split(X, y) 14 | 15 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train) 16 | plt.figure() 17 | plt.scatter(X_train[:, 2], X_train[:, 3], c=y_train) 18 | -------------------------------------------------------------------------------- /solutions/out_of_core.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from sklearn.linear_model import SGDClassifier 4 | 5 | # X_test, y_test and the pickled batches come from the Out Of Core Learning notebook 6 | sgd = SGDClassifier(learning_rate='invscaling', eta0=.5) 7 | 8 | for j in range(10): 9 |     for i in range(9): 10 |         X_batch, y_batch = pickle.load(open("data/batch_%02d.pickle" % i, "rb")) 11 |         sgd.partial_fit(X_batch, y_batch, classes=range(10)) 12 |     print(sgd.score(X_test, y_test)) 13 | --------------------------------------------------------------------------------
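The out_of_core.py solution above assumes mini-batches pickled to data/batch_*.pickle by the Out Of Core Learning notebook. The essential idea is just repeated partial_fit calls on one chunk at a time; here is a minimal sketch of the same pattern with synthetic chunks instead of pickle files (the random data, chunk sizes, and the simple thresholded labels are illustrative assumptions):

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
sgd = SGDClassifier()

# each call to partial_fit updates the model in place, so the
# full dataset never has to be in memory at once
for _ in range(10):
    X_batch = rng.normal(size=(100, 5))
    y_batch = (X_batch[:, 0] > 0).astype(int)
    # classes must list every label up front, because a single
    # batch is not guaranteed to contain all of them
    sgd.partial_fit(X_batch, y_batch, classes=[0, 1])

X_held_out = rng.normal(size=(200, 5))
y_held_out = (X_held_out[:, 0] > 0).astype(int)
print(sgd.score(X_held_out, y_held_out))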
/solutions/pipeline_knn.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.pipeline import make_pipeline 4 | from sklearn.grid_search import GridSearchCV 5 | 6 | # X_train, X_test, y_train, y_test come from the train/test split in the notebook 7 | pipe = make_pipeline(StandardScaler(), KNeighborsClassifier()) 8 | param_grid = {'kneighborsclassifier__n_neighbors': [1, 3, 5, 10]} 9 | grid = GridSearchCV(pipe, param_grid) 10 | grid.fit(X_train, y_train) 11 | print(grid.best_params_) 12 | print(grid.score(X_test, y_test)) 13 | -------------------------------------------------------------------------------- /solutions/text_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.pipeline import make_pipeline 4 | from sklearn.grid_search import GridSearchCV 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.svm import LinearSVC 7 | 8 | # text_train, y_train, text_test, y_test and visualize_coefficients 9 | # are defined in the Working With Text Data notebook 10 | pipeline = make_pipeline(CountVectorizer(), 11 |                          LinearSVC()) 12 | pipeline.fit(text_train, y_train) 13 | print("Pipeline test score: %f" 14 |       % pipeline.score(text_test, y_test)) 15 | visualize_coefficients(pipeline.named_steps['linearsvc'], 16 |                        pipeline.named_steps['countvectorizer'].get_feature_names()) 17 | 18 | param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3)} 19 | 20 | grid_search = GridSearchCV(pipeline, param_grid=param_grid) 21 | grid_search.fit(text_train, y_train) 22 | 23 | print("best parameters: %s" % grid_search.best_params_) 24 | print("Grid-searched test score: %f" 25 |       % grid_search.score(text_test, y_test)) 26 | 27 | est = grid_search.best_estimator_ 28 | visualize_coefficients(est.named_steps['linearsvc'], 29 |                        est.named_steps['countvectorizer'].get_feature_names()) 30 | 31 | param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3), 32 |               "countvectorizer__ngram_range": [(1, 1), (1, 2), (2, 2)]} 33 | grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3) 34 | grid_search.fit(text_train, y_train) 35 | 36 | print("best parameters with n-gram search: %s" % grid_search.best_params_) 37 | print("test set score with n-gram search: %s" % grid_search.score(text_test, y_test)) 38 | 39 | est = grid_search.best_estimator_ 40 | visualize_coefficients(est.named_steps['linearsvc'], 41 |                        est.named_steps['countvectorizer'].get_feature_names()) 42 | -------------------------------------------------------------------------------- /solutions/train_iris.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.neighbors import KNeighborsClassifier 3 | from sklearn.cross_validation import train_test_split 4 | 5 | iris = load_iris() 6 | X, y = iris.data, iris.target 7 | 8 | X_train, X_test, y_train, y_test = train_test_split(X, y) 9 | 10 | knn = KNeighborsClassifier(n_neighbors=3) 11 | knn.fit(X_train, y_train) 12 | 13 | print("test set score of knn: %f" % knn.score(X_test, y_test)) 14 | --------------------------------------------------------------------------------
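A note on running the solution scripts: apart from the self-contained iris scripts, they deliberately omit the data-loading boilerplate and expect names such as X_train or text_train to already exist in the notebook they accompany. Inside a notebook, IPython's %load magic pulls a solution file into the current cell for editing and running; the relative path below assumes the notebooks are started from the repository root:

%load solutions/train_iris.py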