├── .gitignore ├── Advanced Scoring.ipynb ├── Combining Pipelines and GridSearchCV.ipynb ├── Cross-validation.ipynb ├── Custom Estimators.ipynb ├── Grid Searches for Hyper Parameters.ipynb ├── LICENSE ├── Out Of Core Learning for Text.ipynb ├── Out Of Core Learning.ipynb ├── Preprocessing and Pipelines.ipynb ├── README.md ├── Working With Text Data.ipynb ├── advanced-sklearn-boston-nlp-2016.odp ├── advanced-sklearn-boston-nlp-2016.pdf ├── environment.yml └── solutions ├── cross_validation_iris.py ├── digits_tsne.py ├── grid_search_k_neighbors.py ├── load_iris.py ├── out_of_core.py ├── pipeline_knn.py ├── text_pipeline.py └── train_iris.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/aclImdb/* 2 | .ipynb_checkpoints/* 3 | data/batch* 4 | data/movies.txt 5 | -------------------------------------------------------------------------------- /Advanced Scoring.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "\n", 15 | "from sklearn.datasets import load_digits\n", 16 | "from sklearn.cross_validation import train_test_split\n", 17 | "np.set_printoptions(precision=2)\n", 18 | "\n", 19 | "digits = load_digits()\n", 20 | "X, y = digits.data, digits.target == 3\n", 21 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from sklearn.svm import SVC" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.cross_validation import cross_val_score\n", 44 | "cross_val_score(SVC(), X_train, y_train)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from sklearn.dummy import DummyClassifier\n", 56 | "cross_val_score(DummyClassifier(\"most_frequent\"), X_train, y_train)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "from sklearn.metrics import roc_curve, roc_auc_score\n", 77 | "\n", 78 | "for gamma in [.01, .1, 1]:\n", 79 | " plt.xlabel(\"FPR\")\n", 80 | " plt.ylabel(\"TPR\")\n", 81 | " svm = SVC(gamma=gamma).fit(X_train, y_train)\n", 82 | " decision_function = svm.decision_function(X_test)\n", 83 | " fpr, tpr, _ = roc_curve(y_test, decision_function)\n", 84 | " acc = svm.score(X_test, y_test)\n", 85 | " auc = roc_auc_score(y_test, svm.decision_function(X_test))\n", 86 | " plt.plot(fpr, tpr, label=\"acc:%.2f auc:%.2f\" % (acc, auc))\n", 87 | " print()\n", 88 | "plt.legend(loc=\"best\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "from sklearn.metrics.scorer import SCORERS" 100 | ] 101 | }, 102 | { 103 | "cell_type": 
"code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "SCORERS.keys()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# Defining your own scoring function" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "def my_accuracy(est, X, y):\n", 129 | " return np.mean(est.predict(X) == y)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "from sklearn.svm import LinearSVC\n", 141 | "print(cross_val_score(LinearSVC(random_state=0), X, y, cv=5))\n", 142 | "print(cross_val_score(LinearSVC(random_state=0), X, y, cv=5, scoring=my_accuracy))" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.5.1" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 0 176 | } 177 | -------------------------------------------------------------------------------- /Combining Pipelines and GridSearchCV.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Pipelining becomes powerful with GridSearchCV\n", 8 | "-----------------------------------------------" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": false 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "from sklearn.svm import LinearSVC\n", 20 | "from sklearn.pipeline import make_pipeline\n", 21 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 22 | "from sklearn.grid_search import GridSearchCV\n", 23 | "import numpy as np" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from sklearn.datasets import load_iris\n", 35 | "from sklearn.cross_validation import train_test_split\n", 36 | "\n", 37 | "\n", 38 | "iris = load_iris()\n", 39 | "X, y = iris.data, iris.target\n", 40 | "X_train, X_test, y_train, y_test = train_test_split(X, y)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "The wrong way to do GridSearchCV with preprocessing:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "from sklearn.preprocessing import StandardScaler\n", 59 | "from sklearn.svm import SVC\n", 60 | "\n", 61 | "scaler = StandardScaler()\n", 62 | "X_preprocessed = scaler.fit_transform(X_train)\n", 63 | "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma': 10. 
** np.arange(-3, 3)}\n", 64 | "\n", 65 | "grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "The right way to do GridSearchCV with preprocessing" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "from sklearn.pipeline import make_pipeline\n", 84 | "\n", 85 | "param_grid_pipeline = {'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}\n", 86 | "\n", 87 | "scaler_pipe = make_pipeline(StandardScaler(), SVC())\n", 88 | "grid = GridSearchCV(scaler_pipe, param_grid=param_grid_pipeline, cv=5)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "grid.fit(X_train, y_train)\n", 100 | "print(grid.best_params_)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "from sklearn.pipeline import make_pipeline\n", 112 | "from sklearn.svm import SVC\n", 113 | "from sklearn.feature_selection import SelectKBest\n", 114 | "\n", 115 | "\n", 116 | "param_grid = {'selectkbest__k': [1, 2, 3, 4], 'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}\n", 117 | "\n", 118 | "scaler_pipe = make_pipeline(SelectKBest(), SVC())\n", 119 | "grid = GridSearchCV(scaler_pipe, param_grid=param_grid, cv=5)\n", 120 | "grid.fit(X_train, y_train)\n", 121 | "print(grid.best_params_)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "text_pipe = make_pipeline(TfidfVectorizer(), LinearSVC())\n", 133 | "param_grid = {'tfidifvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 'linearsvc__C': 10. 
** np.arange(-3, 3)}\n", 134 | "\n", 135 | "grid = GridSearchCV(text_pipe, param_grid=param_grid, cv=5)" 136 | ] 137 | } 138 | ], 139 | "metadata": {}, 140 | "nbformat": 4, 141 | "nbformat_minor": 0 142 | } 143 | -------------------------------------------------------------------------------- /Cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib notebook\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "Cross-Validation\n", 21 | "----------------------------------------" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "from sklearn.datasets import load_iris" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "iris = load_iris()\n", 44 | "X = iris.data\n", 45 | "y = iris.target" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "from sklearn.cross_validation import cross_val_score\n", 57 | "from sklearn.svm import LinearSVC" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "cross_val_score(LinearSVC(), X, y, cv=5)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Let's go to a binary task for a moment" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "y % 2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "cross_val_score(LinearSVC(), X, y % 2)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.metrics.scorer import SCORERS\n", 142 | "print(SCORERS.keys())" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "There are other ways to do cross-valiation" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | 
"collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "from sklearn.cross_validation import ShuffleSplit\n", 161 | "\n", 162 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n", 163 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit\n", 175 | "\n", 176 | "def plot_cv(cv, n_samples):\n", 177 | " masks = []\n", 178 | " for train, test in cv:\n", 179 | " mask = np.zeros(n_samples, dtype=bool)\n", 180 | " mask[test] = 1\n", 181 | " masks.append(mask)\n", 182 | " plt.figure(figsize=(10, 4))\n", 183 | " plt.subplots_adjust(left=0, bottom=0, right=1, top=1)\n", 184 | " plt.imshow(masks, interpolation='none')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "plot_cv(StratifiedKFold(y, n_folds=5), len(y))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), \n", 218 | " len(iris.target))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "source": [ 227 | "# Exercises\n", 228 | "Use KFold cross validation and StratifiedKFold cross validation (3 or 5 folds) for LinearSVC on the iris dataset.\n", 229 | "Why are the results so different? How could you get more similar results?" 
230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "# %load solutions/cross_validation_iris.py" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.1" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 0 265 | } 266 | -------------------------------------------------------------------------------- /Custom Estimators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from sklearn.utils.validation import check_X_y\n", 12 | "\n", 13 | "class MyEstimator(object):\n", 14 | " def __init__(self, my_parameter=\"stuff\"):\n", 15 | " self.my_parameter = my_parameter\n", 16 | " def fit(self, X, y):\n", 17 | " X, y = check_X_y(X, y)\n", 18 | " return self\n", 19 | " def set_params(self, **kwargs):\n", 20 | " for key, value in kwargs.items():\n", 21 | " if key == \"my_parameter\":\n", 22 | " self.my_parameter = value\n", 23 | " else:\n", 24 | " raise ValueError(\"Unknown parameter %s\" % key)\n", 25 | " return self\n", 26 | " def get_params(self, deep=None):\n", 27 | " return {'my_parameter': self.my_parameter}" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "est = MyEstimator(my_parameter=\"bla\")\n", 39 | "print(est) " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "from sklearn.utils.estimator_checks import check_estimator\n", 51 | "check_estimator(MyEstimator)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "from sklearn.utils.validation import check_X_y, check_array\n", 63 | "\n", 64 | "class MyBrokenEstimator(object):\n", 65 | " def __init__(self, my_parameter=\"stuff\"):\n", 66 | " self.my_parameter = my_parameter + \" more stuff\"\n", 67 | " def fit(self, X, y):\n", 68 | " X, y = check_X_y(X, y)\n", 69 | " return self\n", 70 | " def set_params(self, **kwargs):\n", 71 | " for key, value in kwargs.items():\n", 72 | " if key == \"my_parameter\":\n", 73 | " self.my_parameter = value\n", 74 | " else:\n", 75 | " raise ValueError(\"Unknown parameter %s\" % key)\n", 76 | " return self\n", 77 | " def get_params(self, deep=None):\n", 78 | " return {'my_parameter': self.my_parameter}" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "check_estimator(MyBrokenEstimator)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.base import BaseEstimator\n", 101 | "\n", 
102 | "class MyInheritingEstimator(BaseEstimator):\n", 103 | " def __init__(self, my_parameter=\"stuff\"):\n", 104 | " self.my_parameter = my_parameter\n", 105 | " def fit(self, X, y):\n", 106 | " X, y = check_X_y(X, y)\n", 107 | " return self" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "est = MyInheritingEstimator(my_parameter=\"bla\")\n", 119 | "print(est) " 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "check_estimator(MyInheritingEstimator)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": true 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "from sklearn.base import TransformerMixin\n", 142 | "class MyTransformer(BaseEstimator, TransformerMixin):\n", 143 | " def __init__(self, my_parameter=\"stuff\"):\n", 144 | " self.my_parameter = my_parameter\n", 145 | " def fit(self, X, y):\n", 146 | " X, y = check_X_y(X, y)\n", 147 | " self.n_features_ = X.shape[1]\n", 148 | " return self\n", 149 | " def transform(self, X):\n", 150 | " X = check_array(X)\n", 151 | " if X.shape[1] != self.n_features_:\n", 152 | " raise ValueError(\"lol wat\")\n", 153 | " return X - 2" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "check_estimator(MyTransformer)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "import numpy as np\n", 176 | "from sklearn.base import ClassifierMixin\n", 177 | "\n", 178 | "class MyBrokenClassifier(BaseEstimator, ClassifierMixin):\n", 179 | " def __init__(self, my_parameter=\"stuff\"):\n", 180 | " self.my_parameter = my_parameter\n", 181 | " def fit(self, X, y):\n", 182 | " X, y = check_X_y(X, y)\n", 183 | " return self\n", 184 | " def predict(self, X):\n", 185 | " X = check_array(X)\n", 186 | " return np.array([1, 2])" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "check_estimator(MyBrokenClassifier)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "from sklearn.metrics import euclidean_distances\n", 209 | "from sklearn.utils.multiclass import unique_labels\n", 210 | "from sklearn.utils.validation import check_is_fitted\n", 211 | "\n", 212 | "class MyClassifier(BaseEstimator, ClassifierMixin):\n", 213 | " def __init__(self, my_parameter=\"stuff\"):\n", 214 | " self.my_parameter = my_parameter\n", 215 | " \n", 216 | " def fit(self, X, y):\n", 217 | " X, y = check_X_y(X, y)\n", 218 | " self.classes_ = unique_labels(y)\n", 219 | " self.X_ = X\n", 220 | " self.y_ = y\n", 221 | " return self\n", 222 | " \n", 223 | " def predict(self, X):\n", 224 | " check_is_fitted(self, [\"X_\", \"y_\"])\n", 225 | " X = check_array(X)\n", 226 | " closest = np.argmin(euclidean_distances(X, self.X_), axis=1)\n", 227 | " return self.y_[closest]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | 
"metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "check_estimator(MyClassifier)" 239 | ] 240 | } 241 | ], 242 | "metadata": { 243 | "kernelspec": { 244 | "display_name": "Python 3", 245 | "language": "python", 246 | "name": "python3" 247 | }, 248 | "language_info": { 249 | "codemirror_mode": { 250 | "name": "ipython", 251 | "version": 3 252 | }, 253 | "file_extension": ".py", 254 | "mimetype": "text/x-python", 255 | "name": "python", 256 | "nbconvert_exporter": "python", 257 | "pygments_lexer": "ipython3", 258 | "version": "3.5.1" 259 | } 260 | }, 261 | "nbformat": 4, 262 | "nbformat_minor": 0 263 | } 264 | -------------------------------------------------------------------------------- /Grid Searches for Hyper Parameters.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Grid Searches\n", 8 | "=================" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Grid-Search with build-in cross validation" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "from sklearn.grid_search import GridSearchCV\n", 27 | "from sklearn.svm import SVC" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "from sklearn.datasets import load_digits\n", 39 | "from sklearn.cross_validation import train_test_split\n", 40 | "digits = load_digits()\n", 41 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Define parameter grid:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import numpy as np\n", 60 | "\n", 61 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n", 62 | " 'gamma' : 10. ** np.arange(-5, 0)}\n", 63 | " \n", 64 | "\n", 65 | "np.set_printoptions(suppress=True)\n", 66 | "print(param_grid)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3, cv=5)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "A GridSearchCV object behaves just like a normal classifier." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false, 92 | "scrolled": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "grid_search.fit(X_train, y_train)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false, 104 | "scrolled": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "grid_search.predict(X_test)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "grid_search.score(X_test, y_test)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "grid_search.best_params_" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "# We extract just the scores\n", 142 | "%matplotlib notebook\n", 143 | "import matplotlib.pyplot as plt\n", 144 | "\n", 145 | "scores = [x[1] for x in grid_search.grid_scores_]\n", 146 | "scores = np.array(scores).reshape(6, 5)\n", 147 | "\n", 148 | "plt.matshow(scores)\n", 149 | "plt.xlabel('gamma')\n", 150 | "plt.ylabel('C')\n", 151 | "plt.colorbar()\n", 152 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 153 | "plt.yticks(np.arange(6), param_grid['C']);" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "source": [ 162 | "# Exercises\n", 163 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "# %load solutions/grid_search_k_neighbors.py" 175 | ] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.5.1" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 0 199 | } 200 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. 
These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. 
Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 
114 | 115 | For more information, please see <http://creativecommons.org/publicdomain/zero/1.0/> 116 | 117 | -------------------------------------------------------------------------------- /Out Of Core Learning for Text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import numpy as np\n", 13 | "%matplotlib notebook" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Out of core text classification with the Hashing Vectorizer" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Using the Amazon movie reviews collected by J. McAuley and J. Leskovec\n", 28 | "\n", 29 | "https://snap.stanford.edu/data/web-Movies.html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "print(\"file size: %d GB\" % (os.path.getsize(\"data/movies.txt\") / 1024 ** 3))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "with open(\"data/movies.txt\", 'r', errors='ignore') as f:\n", 53 | " print(f.read(4000))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "def review_iter(f):\n", 65 | " current_post = []\n", 66 | " for line in f:\n", 67 | " if line.startswith(\"product/productId\"):\n", 68 | " if len(current_post):\n", 69 | " score = current_post[3].replace(\"review/score:\", \"\", 1).strip()\n", 70 | " review = \"\".join(current_post[6:]).replace(\"review/text:\", \"\", 1).strip()\n", 71 | " # there are about 20 posts with linebreaks in them.\n", 72 | " # we just ignore those for simplicity\n", 73 | " try:\n", 74 | " yield int(float(score)), review\n", 75 | " except ValueError:\n", 76 | " current_post = []\n", 77 | " continue\n", 78 | " current_post = []\n", 79 | " else:\n", 80 | " current_post.append(line)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": false, 88 | "scrolled": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "n_reviews = 0\n", 93 | "with open(\"data/movies.txt\", 'r', errors='ignore') as f:\n", 94 | " for r in review_iter(f):\n", 95 | " n_reviews += 1\n", 96 | "\n", 97 | "print(\"Number of reviews: %d\" % n_reviews)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "from itertools import islice\n", 109 | "\n", 110 | "with open(\"data/movies.txt\", 'r', errors='ignore') as f:\n", 111 | " reviews = islice(review_iter(f), 10000)\n", 112 | " scores, texts = zip(*reviews)\n", 113 | "print(np.bincount(scores))" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "from itertools import zip_longest # use izip_longest on Python 2\n", 134 | "# from the itertools recipes\n", 135 | "def grouper(iterable, n, 
fillvalue=None):\n", 136 | " \"Collect data into fixed-length chunks or blocks\"\n", 137 | " # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx\n", 138 | " args = [iter(iterable)] * n\n", 139 | " return zip_longest(fillvalue=fillvalue, *args)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "def preprocess_batch(reviews):\n", 151 | " # score == 3 is \"neutral\", we only want \"positive\" or \"negative\"\n", 152 | " reviews_filtered = [r for r in reviews if r is not None and r[0] != 3]\n", 153 | " scores, texts = zip(*reviews_filtered)\n", 154 | " polarity = np.array(scores) > 3\n", 155 | " return polarity, texts" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false, 163 | "scrolled": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "from sklearn.feature_extraction.text import HashingVectorizer\n", 168 | "\n", 169 | "vectorizer = HashingVectorizer(decode_error=\"ignore\")\n", 170 | "\n", 171 | "with open(\"data/movies.txt\") as f:\n", 172 | " reviews = islice(review_iter(f), 10000)\n", 173 | " polarity_test, texts_test = preprocess_batch(reviews)\n", 174 | " X_test = vectorizer.transform(texts_test)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "from sklearn.linear_model import SGDClassifier\n", 186 | "\n", 187 | "sgd = SGDClassifier(random_state=0)\n", 188 | "\n", 189 | "accuracies = []\n", 190 | "with open(\"data/movies.txt\") as f:\n", 191 | " training_set = islice(review_iter(f), 10000, None)\n", 192 | " batch_iter = grouper(training_set, 10000)\n", 193 | " for batch in batch_iter:\n", 194 | " polarity, texts = preprocess_batch(batch)\n", 195 | " X = vectorizer.transform(texts)\n", 196 | " sgd.partial_fit(X, polarity, classes=[0, 1])\n", 197 | " accuracies.append(sgd.score(X_test, polarity_test))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "plt.plot(accuracies)" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.4.3" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 0 233 | } 234 | -------------------------------------------------------------------------------- /Out Of Core Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# write out some toy data\n", 12 | "from sklearn.datasets import load_digits\n", 13 | "import pickle\n", 14 | "\n", 15 | "digits = load_digits()\n", 16 | "\n", 17 | "X, y = digits.data, digits.target\n", 18 | "\n", 19 | "for i in range(10):\n", 20 | " pickle.dump((X[i::10] / 16., y[i::10]), open(\"data/batch_%02d.pickle\" % i, \"wb\"), -1)" 21 | ] 22 | }, 23 | { 24 | 
"cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from sklearn.linear_model import SGDClassifier" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "sgd = SGDClassifier(random_state=1)\n", 43 | "\n", 44 | "for i in range(9):\n", 45 | " X_batch, y_batch = pickle.load(open(\"data/batch_%02d.pickle\" % i, \"rb\"))\n", 46 | " sgd.partial_fit(X_batch, y_batch, classes=range(10))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "X_test, y_test = pickle.load(open(\"data/batch_09.pickle\", \"rb\"))\n", 58 | "\n", 59 | "sgd.score(X_test, y_test)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Exercise\n", 67 | "Iterate over the dataset ten times, print the error on the hold-out batch (09) for each pass.\n", 68 | "Try changing the learning rate (and eta0) and see how that affects results." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "# %load solutions/out_of_core.py" 80 | ] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.4.3" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 0 104 | } 105 | -------------------------------------------------------------------------------- /Preprocessing and Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Preprocessing and Pipelines\n", 8 | "=============================" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": false 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "from sklearn.datasets import load_digits\n", 20 | "from sklearn.cross_validation import train_test_split\n", 21 | "digits = load_digits()\n", 22 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n", 23 | " digits.target)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Cross-validated pipelines including scaling, we need to estimate mean and standard deviation separately for each fold.\n", 31 | "To do that, we build a pipeline." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "from sklearn.pipeline import Pipeline, make_pipeline\n", 43 | "from sklearn.svm import SVC\n", 44 | "from sklearn.preprocessing import StandardScaler" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "standard_scaler = StandardScaler()\n", 56 | "standard_scaler.fit(X_train)\n", 57 | "X_train_scaled = standard_scaler.transform(X_train)\n", 58 | "svm = SVC().fit(X_train_scaled, y_train)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "#pipeline = Pipeline([(\"scaler\", StandardScaler()),\n", 70 | "# (\"svm\", SVC())])\n", 71 | "# short version:\n", 72 | "pipeline = make_pipeline(StandardScaler(), SVC())" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "pipeline.fit(X_train, y_train)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "pipeline.score(X_test, y_test)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": false, 102 | "scrolled": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "pipeline.predict(X_test)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Cross-validation with a pipeline\n", 114 | "---------------------------------" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from sklearn.cross_validation import cross_val_score\n", 126 | "cross_val_score(pipeline, X_train, y_train)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "Grid Search with a pipeline\n", 134 | "===========================" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "import numpy as np\n", 146 | "from sklearn.grid_search import GridSearchCV\n", 147 | "\n", 148 | "param_grid = {'svc__C': 10. ** np.arange(-3, 3),\n", 149 | " 'svc__gamma' : 10. 
** np.arange(-3, 3)\n", 150 | " }\n", 151 | "\n", 152 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid) " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "grid_pipeline.fit(X_train, y_train)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "grid_pipeline.score(X_test, y_test)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "source": [ 183 | "# Exercise\n", 184 | "Make a pipeline out of the StandardScaler and KNeighborsClassifier and search over the number of neighbors.\n" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# %load solutions/pipeline_knn.py" 196 | ] 197 | } 198 | ], 199 | "metadata": { 200 | "kernelspec": { 201 | "display_name": "Python 3", 202 | "language": "python", 203 | "name": "python3" 204 | }, 205 | "language_info": { 206 | "codemirror_mode": { 207 | "name": "ipython", 208 | "version": 3 209 | }, 210 | "file_extension": ".py", 211 | "mimetype": "text/x-python", 212 | "name": "python", 213 | "nbconvert_exporter": "python", 214 | "pygments_lexer": "ipython3", 215 | "version": "3.5.1" 216 | } 217 | }, 218 | "nbformat": 4, 219 | "nbformat_minor": 0 220 | } 221 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced machine learning with Scikit-learn 2 | This repository contains material and slides for the Boston NLP meetup March 23rd 2016. 3 | 4 | Slides are [here](https://github.com/amueller/advanced-sklearn-boston-nlp-2016/raw/master/advanced-sklearn-boston-nlp-2016.pdf). 5 | 6 | The following packages are required to run the notebooks: 7 | 8 | - scikit-learn >= 0.16 (some might require 0.17.1) 9 | - matplotlib >= 1.3 10 | - numpy >= 1.5 11 | - IPython >= 4.0 12 | - Jupyter Notebook >= 4.0 13 | 14 | The easiest way to install all requirements is to install the free Anaconda Python distribution: 15 | https://www.continuum.io/downloads (OS X, Windows, Linux) 16 | 17 | All material in this repository is licensed CC-0 18 | -------------------------------------------------------------------------------- /Working With Text Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib notebook\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Text Classification of Movie Reviews" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Get data from http://ai.stanford.edu/~amaas/data/sentiment/ and extract into the data folder." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "from sklearn.datasets import load_files\n", 39 | "\n", 40 | "reviews_train = load_files(\"data/aclImdb/train/\")\n", 41 | "text_train, y_train = reviews_train.data, reviews_train.target" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "print(\"Number of documents in training data: %d\" % len(text_train))\n", 53 | "print(np.bincount(y_train))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "reviews_test = load_files(\"data/aclImdb/test/\")\n", 65 | "text_test, y_test = reviews_test.data, reviews_test.target\n", 66 | "print(\"Number of documents in test data: %d\" % len(text_test))\n", 67 | "print(np.bincount(y_test))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "print(text_train[1])" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "print(y_train[1])" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.feature_extraction.text import CountVectorizer\n", 101 | "cv = CountVectorizer()\n", 102 | "cv.fit(text_train)\n", 103 | "\n", 104 | "len(cv.vocabulary_)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": false, 112 | "scrolled": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "print(cv.get_feature_names()[:50])\n", 117 | "print(cv.get_feature_names()[50000:50050])" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "X_train = cv.transform(text_train)\n", 129 | "X_train" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "print(text_train[19726])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "X_train[19726].nonzero()[1]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "X_test = cv.transform(text_test)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from sklearn.svm import LinearSVC\n", 174 | "\n", 175 | "svm = LinearSVC()\n", 176 | "svm.fit(X_train, y_train)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "svm.score(X_train, y_train)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 
194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "svm.score(X_test, y_test)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n", 210 | " # get coefficients with large absolute values \n", 211 | " coef = classifier.coef_.ravel()\n", 212 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n", 213 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n", 214 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n", 215 | " # plot them\n", 216 | " plt.figure(figsize=(15, 5))\n", 217 | " colors = [\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]]\n", 218 | " plt.bar(np.arange(2 * n_top_features), coef[interesting_coefficients], color=colors)\n", 219 | " feature_names = np.array(feature_names)\n", 220 | " plt.subplots_adjust(bottom=0.3)\n", 221 | " plt.xticks(np.arange(1, 1 + 2 * n_top_features), feature_names[interesting_coefficients], rotation=60, ha=\"right\");\n" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "visualize_coefficients(svm, cv.get_feature_names())" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "svm = LinearSVC(C=0.001)\n", 244 | "svm.fit(X_train, y_train)\n", 245 | "svm.score(X_test, y_test)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "visualize_coefficients(svm, cv.get_feature_names())" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "# Start pipelines" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": false 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "from sklearn.pipeline import make_pipeline\n", 275 | "text_pipe = make_pipeline(CountVectorizer(), LinearSVC())\n", 276 | "text_pipe.fit(text_train, y_train)\n", 277 | "text_pipe.score(text_test, y_test)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false, 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "from sklearn.grid_search import GridSearchCV\n", 290 | "\n", 291 | "param_grid = {'linearsvc__C': np.logspace(-5, 0, 6)}\n", 292 | "grid = GridSearchCV(text_pipe, param_grid, cv=5)\n", 293 | "grid.fit(text_train, y_train)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "grid.best_params_" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "collapsed": false 312 | }, 313 | "outputs": [], 314 | "source": [ 315 | "visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],\n", 316 | " grid.best_estimator_.named_steps['countvectorizer'].get_feature_names())" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": false 
324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "grid.score(text_test, y_test)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "# N-Grams" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": { 341 | "collapsed": false, 342 | "scrolled": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "text_pipe = make_pipeline(CountVectorizer(), LinearSVC())\n", 347 | "from sklearn.grid_search import GridSearchCV\n", 348 | "\n", 349 | "\n", 350 | "param_grid = {'linearsvc__C': np.logspace(-3, 2, 6),\n", 351 | " \"countvectorizer__ngram_range\": [(1, 1), (1, 2)]}\n", 352 | "\n", 353 | "grid = GridSearchCV(text_pipe, param_grid, cv=5)\n", 354 | "\n", 355 | "grid.fit(text_train, y_train)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": false 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "scores = np.array([score.mean_validation_score for score in grid.grid_scores_]).reshape(2, -1)  # 2 n-gram ranges x 6 C values\n", 367 | "plt.matshow(scores)\n", 368 | "plt.ylabel(\"n-gram range\")\n", 369 | "plt.yticks(range(2), param_grid[\"countvectorizer__ngram_range\"])\n", 370 | "plt.xlabel(\"C\")\n", 371 | "plt.xticks(range(6), param_grid[\"linearsvc__C\"]);\n", 372 | "plt.colorbar()" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": false 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "grid.best_params_" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "visualize_coefficients(grid.best_estimator_.named_steps['linearsvc'],\n", 395 | " grid.best_estimator_.named_steps['countvectorizer'].get_feature_names())" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "grid.score(text_test, y_test)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## Look at spaCy and NLTK" 414 | ] 415 | } 416 | ], 417 | "metadata": { 418 | "kernelspec": { 419 | "display_name": "Python 3", 420 | "language": "python", 421 | "name": "python3" 422 | }, 423 | "language_info": { 424 | "codemirror_mode": { 425 | "name": "ipython", 426 | "version": 3 427 | }, 428 | "file_extension": ".py", 429 | "mimetype": "text/x-python", 430 | "name": "python", 431 | "nbconvert_exporter": "python", 432 | "pygments_lexer": "ipython3", 433 | "version": "3.5.1" 434 | } 435 | }, 436 | "nbformat": 4, 437 | "nbformat_minor": 0 438 | } 439 | -------------------------------------------------------------------------------- /advanced-sklearn-boston-nlp-2016.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/advanced-sklearn-boston-nlp-2016/bd59b30774da68b4d2ddd80148feff6ed2f8b608/advanced-sklearn-boston-nlp-2016.odp -------------------------------------------------------------------------------- /advanced-sklearn-boston-nlp-2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/advanced-sklearn-boston-nlp-2016/bd59b30774da68b4d2ddd80148feff6ed2f8b608/advanced-sklearn-boston-nlp-2016.pdf --------------------------------------------------------------------------------
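A note on the pattern used in the Working With Text Data notebook above: because the vectorizer and the classifier live in one pipeline, a single grid search can tune the vectorizer's ngram_range together with the classifier's C, refitting each candidate vectorizer only on the training folds. Below is a minimal self-contained sketch of that pattern, written against the older sklearn.grid_search module pinned in environment.yml; the tiny made-up corpus, its labels, and cv=2 are illustrative assumptions standing in for text_train / y_train, not part of the repo.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV

# hypothetical toy corpus and labels, stand-ins for text_train / y_train
docs = ["good movie", "great film", "bad movie", "awful film",
        "not good at all", "not bad at all", "really great", "really awful"]
labels = [1, 1, 0, 0, 0, 1, 1, 0]

pipe = make_pipeline(CountVectorizer(), LinearSVC())
# "<stepname>__<param>" addresses a parameter of one pipeline step
param_grid = {"countvectorizer__ngram_range": [(1, 1), (1, 2)],
              "linearsvc__C": np.logspace(-3, 2, 6)}
grid = GridSearchCV(pipe, param_grid, cv=2)
grid.fit(docs, labels)
print(grid.best_params_)
print(grid.best_score_)

The "not good" / "not bad" documents are the motivation for searching over ngram_range: with unigrams alone they are indistinguishable from the documents they contradict, while bigrams can capture the negation.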
/environment.yml: -------------------------------------------------------------------------------- 1 | name: sklearn 2 | dependencies: 3 | - libgfortran=1.0=0 4 | - numpy=1.10.1=py34_0 5 | - openblas=0.2.14=3 6 | - openssl=1.0.2d=0 7 | - pip=7.1.2=py34_0 8 | - python=3.4.3=2 9 | - readline=6.2=2 10 | - scikit-learn=0.17=np110py34_1 11 | - scipy=0.16.0=np110py34_1 12 | - setuptools=18.5=py34_0 13 | - sqlite=3.8.4.1=1 14 | - tk=8.5.18=0 15 | - wheel=0.26.0=py34_1 16 | - xz=5.0.5=0 17 | - zlib=1.2.8=0 18 | - pip: 19 | - apache-libcloud==0.19.0 20 | - backports.ssl-match-hostname==3.4.0.2 21 | - futures==3.0.3 22 | - pkginfo==1.2.1 23 | - requests-toolbelt==0.4.0 24 | - twine==1.6.4 25 | - wheelhouse-uploader==0.7.4 26 | 27 | -------------------------------------------------------------------------------- /solutions/cross_validation_iris.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.svm import LinearSVC 3 | from sklearn.cross_validation import cross_val_score, StratifiedKFold, KFold 4 | 5 | iris = load_iris() 6 | X, y = iris.data, iris.target 7 | 8 | print(cross_val_score(LinearSVC(), X, y, cv=KFold(len(X), 3))) 9 | print(cross_val_score(LinearSVC(), X, y, cv=StratifiedKFold(y, 3))) 10 | -------------------------------------------------------------------------------- /solutions/digits_tsne.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.manifold import TSNE 3 | 4 | # X, y are the digits data and labels loaded earlier in the notebook 5 | tsne = TSNE() 6 | X_tsne = tsne.fit_transform(X) 7 | plt.figure() 8 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y) 9 | -------------------------------------------------------------------------------- /solutions/grid_search_k_neighbors.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | from sklearn.grid_search import GridSearchCV 3 | 4 | # X_train, X_test, y_train, y_test come from the train/test split in the notebook 5 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]} 6 | 7 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid) 8 | grid.fit(X_train, y_train) 9 | print("best parameters: %s" % grid.best_params_) 10 | print("Training set accuracy: %s" % grid.score(X_train, y_train)) 11 | print("Test set accuracy: %s" % grid.score(X_test, y_test)) 12 | -------------------------------------------------------------------------------- /solutions/load_iris.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from sklearn.datasets import load_iris 5 | from sklearn.cross_validation import train_test_split 6 | 7 | iris = load_iris() 8 | X, y = iris.data, iris.target 9 | 10 | print("Dataset size: %d number of features: %d number of classes: %d" 11 | % (X.shape[0], X.shape[1], len(np.unique(y)))) 12 | 13 | X_train, X_test, y_train, y_test = train_test_split(X, y) 14 | 15 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train) 16 | plt.figure() 17 | plt.scatter(X_train[:, 2], X_train[:, 3], c=y_train) 18 | -------------------------------------------------------------------------------- /solutions/out_of_core.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from sklearn.linear_model import SGDClassifier 4 | 5 | # X_test, y_test and the pickled batches come from the Out Of Core Learning notebook 6 | sgd = SGDClassifier(learning_rate='invscaling', eta0=.5) 7 | 8 | for j in range(10): 9 |     for i in range(9): 10 |         X_batch, y_batch = pickle.load(open("data/batch_%02d.pickle" % i, "rb")) 11 |         sgd.partial_fit(X_batch, y_batch, classes=range(10)) 12 |     print(sgd.score(X_test, y_test)) 13 | --------------------------------------------------------------------------------
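The out_of_core.py solution above assumes mini-batches pickled to data/batch_*.pickle by the Out Of Core Learning notebook. The essential idea is just repeated partial_fit calls on one chunk at a time; here is a minimal sketch of the same pattern with synthetic chunks instead of pickle files (the random data, chunk sizes, and the simple thresholded labels are illustrative assumptions):

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
sgd = SGDClassifier()

# each call to partial_fit updates the model in place, so the
# full dataset never has to be in memory at once
for _ in range(10):
    X_batch = rng.normal(size=(100, 5))
    y_batch = (X_batch[:, 0] > 0).astype(int)
    # classes must list every label up front, because a single
    # batch is not guaranteed to contain all of them
    sgd.partial_fit(X_batch, y_batch, classes=[0, 1])

X_held_out = rng.normal(size=(200, 5))
y_held_out = (X_held_out[:, 0] > 0).astype(int)
print(sgd.score(X_held_out, y_held_out))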
/solutions/pipeline_knn.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsClassifier 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.pipeline import make_pipeline 4 | from sklearn.grid_search import GridSearchCV 5 | 6 | # X_train, X_test, y_train, y_test come from the train/test split in the notebook 7 | pipe = make_pipeline(StandardScaler(), KNeighborsClassifier()) 8 | param_grid = {'kneighborsclassifier__n_neighbors': [1, 3, 5, 10]} 9 | grid = GridSearchCV(pipe, param_grid) 10 | grid.fit(X_train, y_train) 11 | print(grid.best_params_) 12 | print(grid.score(X_test, y_test)) 13 | -------------------------------------------------------------------------------- /solutions/text_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.pipeline import make_pipeline 4 | from sklearn.grid_search import GridSearchCV 5 | from sklearn.feature_extraction.text import CountVectorizer 6 | from sklearn.svm import LinearSVC 7 | 8 | # text_train, y_train, text_test, y_test and visualize_coefficients 9 | # are defined in the Working With Text Data notebook 10 | pipeline = make_pipeline(CountVectorizer(), 11 |                          LinearSVC()) 12 | pipeline.fit(text_train, y_train) 13 | print("Pipeline test score: %f" 14 |       % pipeline.score(text_test, y_test)) 15 | visualize_coefficients(pipeline.named_steps['linearsvc'], 16 |                        pipeline.named_steps['countvectorizer'].get_feature_names()) 17 | 18 | param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3)} 19 | 20 | grid_search = GridSearchCV(pipeline, param_grid=param_grid) 21 | grid_search.fit(text_train, y_train) 22 | 23 | print("best parameters: %s" % grid_search.best_params_) 24 | print("Grid-searched test score: %f" 25 |       % grid_search.score(text_test, y_test)) 26 | 27 | est = grid_search.best_estimator_ 28 | visualize_coefficients(est.named_steps['linearsvc'], 29 |                        est.named_steps['countvectorizer'].get_feature_names()) 30 | 31 | param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3), 32 |               "countvectorizer__ngram_range": [(1, 1), (1, 2), (2, 2)]} 33 | grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3) 34 | grid_search.fit(text_train, y_train) 35 | 36 | print("best parameters with n-gram search: %s" % grid_search.best_params_) 37 | print("test set score with n-gram search: %s" % grid_search.score(text_test, y_test)) 38 | 39 | est = grid_search.best_estimator_ 40 | visualize_coefficients(est.named_steps['linearsvc'], 41 |                        est.named_steps['countvectorizer'].get_feature_names()) 42 | -------------------------------------------------------------------------------- /solutions/train_iris.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import load_iris 2 | from sklearn.neighbors import KNeighborsClassifier 3 | from sklearn.cross_validation import train_test_split 4 | 5 | iris = load_iris() 6 | X, y = iris.data, iris.target 7 | 8 | X_train, X_test, y_train, y_test = train_test_split(X, y) 9 | 10 | knn = KNeighborsClassifier(n_neighbors=3) 11 | knn.fit(X_train, y_train) 12 | 13 | print("test set score of knn: %f" % knn.score(X_test, y_test)) 14 | --------------------------------------------------------------------------------
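A note on running the solution scripts: apart from the self-contained iris scripts, they deliberately omit the data-loading boilerplate and expect names such as X_train or text_train to already exist in the notebook they accompany. Inside a notebook, IPython's %load magic pulls a solution file into the current cell for editing and running; the relative path below assumes the notebooks are started from the repository root:

%load solutions/train_iris.py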