├── .gitignore ├── Chapter 1 - Introduction to Scikit-learn.ipynb ├── Chapter 2 - Unsupervised Transformers.ipynb ├── Chapter 3 - Cross-validation.ipynb ├── Chapter 4 - Grid Searches for Hyper Parameters.ipynb ├── Chapter 5 - Preprocessing and Pipelines.ipynb ├── Chapter 6 - Working With Text Data.ipynb ├── LICENSE ├── machine-learning-with-scikit-learn-strata-2015.odp ├── machine-learning-with-scikit-learn-strata-2015.pdf ├── test_with_solutions.csv └── train.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /Chapter 1 - Introduction to Scikit-learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Get some data to play with" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "collapsed": false, 20 | "input": [ 21 | "from sklearn.datasets import load_digits\n", 22 | "digits = load_digits()\n", 23 | "digits.keys()" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "collapsed": false, 32 | "input": [ 33 | "digits.images.shape" 34 | ], 35 | "language": "python", 36 | "metadata": {}, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "collapsed": false, 42 | "input": [ 43 | "print(digits.images[0])" 44 | ], 45 | "language": "python", 46 | "metadata": {}, 47 | "outputs": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "collapsed": false, 52 | "input": [ 53 | "import matplotlib.pyplot as plt\n", 54 | "%matplotlib inline\n", 55 | "\n", 56 | "plt.matshow(digits.images[0], cmap=plt.cm.Greys)" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "digits.data.shape" 67 | ], 68 | "language": "python", 69 | "metadata": {}, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "collapsed": false, 75 | "input": [ 76 | "digits.target.shape" 77 | ], 78 | "language": "python", 79 | "metadata": {}, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "digits.target" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "**Data is always a numpy array (or sparse 
matrix) of shape (n_samples, n_features)**" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Split the data to get going" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "collapsed": false, 109 | "input": [ 110 | "from sklearn.cross_validation import train_test_split\n", 111 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 112 | ], 113 | "language": "python", 114 | "metadata": {}, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "Really Simple API\n", 122 | "-------------------\n", 123 | "0) Import your model class" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "collapsed": false, 129 | "input": [ 130 | "from sklearn.svm import LinearSVC" 131 | ], 132 | "language": "python", 133 | "metadata": {}, 134 | "outputs": [] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "1) Instantiate an object and set the parameters" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "collapsed": false, 146 | "input": [ 147 | "svm = LinearSVC(C=0.1)" 148 | ], 149 | "language": "python", 150 | "metadata": {}, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "2) Fit the model" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "collapsed": false, 163 | "input": [ 164 | "svm.fit(X_train, y_train)" 165 | ], 166 | "language": "python", 167 | "metadata": {}, 168 | "outputs": [] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "3) Apply / evaluate" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "collapsed": false, 180 | "input": [ 181 | "print(svm.predict(X_train))\n", 182 | "print(y_train)" 183 | ], 184 | "language": "python", 185 | "metadata": {}, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "collapsed": false, 191 | "input": [ 192 | "svm.score(X_train, y_train)" 193 | ], 194 | "language": "python", 195 | "metadata": {}, 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "collapsed": false, 201 | "input": [ 202 | "svm.score(X_test, y_test)" 203 | ], 204 | "language": "python", 205 | "metadata": {}, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "And again\n", 213 | "---------" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "collapsed": false, 219 | "input": [ 220 | "from sklearn.ensemble import RandomForestClassifier" 221 | ], 222 | "language": "python", 223 | "metadata": {}, 224 | "outputs": [] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "collapsed": false, 229 | "input": [ 230 | "rf = RandomForestClassifier(n_estimators=50)" 231 | ], 232 | "language": "python", 233 | "metadata": {}, 234 | "outputs": [] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "collapsed": false, 239 | "input": [ 240 | "rf.fit(X_train, y_train)" 241 | ], 242 | "language": "python", 243 | "metadata": {}, 244 | "outputs": [] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "collapsed": false, 249 | "input": [ 250 | "rf.score(X_test, y_test)" 251 | ], 252 | "language": "python", 253 | "metadata": {}, 254 | "outputs": [] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "collapsed": false, 259 | "input": [ 260 | "#%load from github" 261 | ], 262 | "language": "python", 263 | "metadata": {}, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | 
"collapsed": false, 269 | "input": [ 270 | "import numpy as np\n", 271 | "import pylab as pl\n", 272 | "from matplotlib.colors import ListedColormap\n", 273 | "from sklearn.cross_validation import train_test_split\n", 274 | "from sklearn.preprocessing import StandardScaler\n", 275 | "from sklearn.datasets import make_moons, make_circles, make_classification\n", 276 | "from sklearn.neighbors import KNeighborsClassifier\n", 277 | "from sklearn.svm import SVC\n", 278 | "from sklearn.tree import DecisionTreeClassifier\n", 279 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", 280 | "from sklearn.naive_bayes import GaussianNB\n", 281 | "from sklearn.lda import LDA\n", 282 | "from sklearn.qda import QDA\n", 283 | "\n", 284 | "h = .02 # step size in the mesh\n", 285 | "\n", 286 | "names = [\"Nearest Neighbors\", \"Linear SVM\", \"RBF SVM\", \"Decision Tree\",\n", 287 | " \"Random Forest\", \"AdaBoost\", \"Naive Bayes\", \"LDA\", \"QDA\"]\n", 288 | "classifiers = [\n", 289 | " KNeighborsClassifier(3),\n", 290 | " SVC(kernel=\"linear\", C=0.025),\n", 291 | " SVC(gamma=2, C=1),\n", 292 | " DecisionTreeClassifier(max_depth=5),\n", 293 | " RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n", 294 | " AdaBoostClassifier(),\n", 295 | " GaussianNB(),\n", 296 | " LDA(),\n", 297 | " QDA()]\n", 298 | "\n", 299 | "X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,\n", 300 | " random_state=1, n_clusters_per_class=1)\n", 301 | "rng = np.random.RandomState(2)\n", 302 | "X += 2 * rng.uniform(size=X.shape)\n", 303 | "linearly_separable = (X, y)\n", 304 | "\n", 305 | "datasets = [make_moons(noise=0.3, random_state=0),\n", 306 | " make_circles(noise=0.2, factor=0.5, random_state=1),\n", 307 | " linearly_separable\n", 308 | " ]\n", 309 | "\n", 310 | "figure = pl.figure(figsize=(27, 9))\n", 311 | "i = 1\n", 312 | "# iterate over datasets\n", 313 | "for ds in datasets:\n", 314 | " # preprocess dataset, split into training and test part\n", 315 | " X, y = ds\n", 316 | " X = StandardScaler().fit_transform(X)\n", 317 | " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)\n", 318 | "\n", 319 | " x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5\n", 320 | " y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5\n", 321 | " xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n", 322 | " np.arange(y_min, y_max, h))\n", 323 | "\n", 324 | " # just plot the dataset first\n", 325 | " cm = pl.cm.RdBu\n", 326 | " cm_bright = ListedColormap(['#FF0000', '#0000FF'])\n", 327 | " ax = pl.subplot(len(datasets), len(classifiers) + 1, i)\n", 328 | " # Plot the training points\n", 329 | " ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)\n", 330 | " # and testing points\n", 331 | " ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)\n", 332 | " ax.set_xlim(xx.min(), xx.max())\n", 333 | " ax.set_ylim(yy.min(), yy.max())\n", 334 | " ax.set_xticks(())\n", 335 | " ax.set_yticks(())\n", 336 | " i += 1\n", 337 | "\n", 338 | " # iterate over classifiers\n", 339 | " for name, clf in zip(names, classifiers):\n", 340 | " ax = pl.subplot(len(datasets), len(classifiers) + 1, i)\n", 341 | " clf.fit(X_train, y_train)\n", 342 | " score = clf.score(X_test, y_test)\n", 343 | "\n", 344 | " # Plot the decision boundary. 
For that, we will assign a color to each\n", 345 | " # point in the mesh [x_min, x_max]x[y_min, y_max].\n", 346 | " if hasattr(clf, \"decision_function\"):\n", 347 | " Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])\n", 348 | " else:\n", 349 | " Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]\n", 350 | "\n", 351 | " # Put the result into a color plot\n", 352 | " Z = Z.reshape(xx.shape)\n", 353 | " ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)\n", 354 | "\n", 355 | " # Plot also the training points\n", 356 | " ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)\n", 357 | " # and testing points\n", 358 | " ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,\n", 359 | " alpha=0.6)\n", 360 | "\n", 361 | " ax.set_xlim(xx.min(), xx.max())\n", 362 | " ax.set_ylim(yy.min(), yy.max())\n", 363 | " ax.set_xticks(())\n", 364 | " ax.set_yticks(())\n", 365 | " ax.set_title(name)\n", 366 | " ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),\n", 367 | " size=15, horizontalalignment='right')\n", 368 | " i += 1\n", 369 | "\n", 370 | "figure.subplots_adjust(left=.02, right=.98)\n" 371 | ], 372 | "language": "python", 373 | "metadata": {}, 374 | "outputs": [] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "collapsed": false, 379 | "input": [], 380 | "language": "python", 381 | "metadata": {}, 382 | "outputs": [] 383 | } 384 | ], 385 | "metadata": {} 386 | } 387 | ] 388 | } -------------------------------------------------------------------------------- /Chapter 2 - Unsupervised Transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "from sklearn.datasets import load_digits\n", 15 | "from sklearn.cross_validation import train_test_split\n", 16 | "import numpy as np\n", 17 | "np.set_printoptions(suppress=True)\n", 18 | "\n", 19 | "digits = load_digits()\n", 20 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 21 | ], 22 | "language": "python", 23 | "metadata": {}, 24 | "outputs": [] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Removing mean and scaling variance\n", 31 | "===================================" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "collapsed": false, 37 | "input": [ 38 | "from sklearn.preprocessing import StandardScaler" 39 | ], 40 | "language": "python", 41 | "metadata": {}, 42 | "outputs": [] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "1) Instantiate the model" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "scaler = StandardScaler()" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "2) Fit using only the data (no labels needed)." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "collapsed": false, 71 | "input": [ 72 | "scaler.fit(X_train)" 73 | ], 74 | "language": "python", 75 | "metadata": {}, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "3) `transform` the data (not `predict`)." 
83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "collapsed": false, 88 | "input": [ 89 | "X_train_scaled = scaler.transform(X_train)" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "collapsed": false, 98 | "input": [ 99 | "X_train.shape" 100 | ], 101 | "language": "python", 102 | "metadata": {}, 103 | "outputs": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "collapsed": false, 108 | "input": [ 109 | "X_train_scaled.shape" 110 | ], 111 | "language": "python", 112 | "metadata": {}, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "The transformed version of the data has the mean removed:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "collapsed": false, 125 | "input": [ 126 | "X_train_scaled.mean(axis=0)" 127 | ], 128 | "language": "python", 129 | "metadata": {}, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "collapsed": false, 135 | "input": [ 136 | "X_train_scaled.std(axis=0)" 137 | ], 138 | "language": "python", 139 | "metadata": {}, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "collapsed": false, 145 | "input": [ 146 | "X_test_transformed = scaler.transform(X_test)" 147 | ], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Principal Component Analysis\n", 157 | "=============================" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "0) Import the model" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "collapsed": false, 170 | "input": [ 171 | "from sklearn.decomposition import PCA" 172 | ], 173 | "language": "python", 174 | "metadata": {}, 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "1) Instantiate the model" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "collapsed": false, 187 | "input": [ 188 | "pca = PCA(n_components=2)" 189 | ], 190 | "language": "python", 191 | "metadata": {}, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "2) Fit to training data" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "collapsed": false, 204 | "input": [ 205 | "pca.fit(X_train)" 206 | ], 207 | "language": "python", 208 | "metadata": {}, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "3) Transform to lower-dimensional representation" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "collapsed": false, 221 | "input": [ 222 | "print(X_train.shape)\n", 223 | "X_pca = pca.transform(X_train)\n", 224 | "X_pca.shape" 225 | ], 226 | "language": "python", 227 | "metadata": {}, 228 | "outputs": [] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Visualize\n", 235 | "----------" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "collapsed": false, 241 | "input": [ 242 | "import matplotlib.pyplot as plt\n", 243 | "%matplotlib inline\n", 244 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_train)" 245 | ], 246 | "language": "python", 247 | "metadata": {}, 248 | "outputs": [] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "collapsed": false, 253 | "input": [], 254 | "language": "python", 255 | "metadata": {}, 256 | "outputs": [] 257 | } 258 | ], 
259 | "metadata": {} 260 | } 261 | ] 262 | } -------------------------------------------------------------------------------- /Chapter 3 - Cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Cross-Validation\n", 15 | "----------------------------------------" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "from sklearn.datasets import load_digits" 23 | ], 24 | "language": "python", 25 | "metadata": {}, 26 | "outputs": [] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "collapsed": false, 31 | "input": [ 32 | "digits = load_digits()\n", 33 | "X = digits.data\n", 34 | "y = digits.target" 35 | ], 36 | "language": "python", 37 | "metadata": {}, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "collapsed": false, 43 | "input": [ 44 | "from sklearn.cross_validation import cross_val_score\n", 45 | "from sklearn.svm import LinearSVC" 46 | ], 47 | "language": "python", 48 | "metadata": {}, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "cross_val_score(LinearSVC(), X, y)" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "collapsed": false, 64 | "input": [ 65 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "Let's go to a binary task for a moment (even vs uneven)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "collapsed": false, 81 | "input": [ 82 | "y % 2" 83 | ], 84 | "language": "python", 85 | "metadata": {}, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "collapsed": false, 91 | "input": [ 92 | "cross_val_score(LinearSVC(), X, y % 2)" 93 | ], 94 | "language": "python", 95 | "metadata": {}, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [ 102 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")" 103 | ], 104 | "language": "python", 105 | "metadata": {}, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "collapsed": false, 111 | "input": [ 112 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")" 113 | ], 114 | "language": "python", 115 | "metadata": {}, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "There are other ways to do cross-valiation" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "collapsed": false, 128 | "input": [ 129 | "from sklearn.cross_validation import ShuffleSplit\n", 130 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n", 131 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)" 132 | ], 133 | "language": "python", 134 | "metadata": {}, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [], 141 | "language": "python", 142 | "metadata": {}, 143 | "outputs": [] 144 | } 145 | ], 146 | "metadata": {} 147 | } 148 | ] 149 | } -------------------------------------------------------------------------------- /Chapter 4 - Grid Searches for Hyper Parameters.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Grid Searches\n", 15 | "=================" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Grid search with built-in cross-validation" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "collapsed": false, 28 | "input": [ 29 | "from sklearn.grid_search import GridSearchCV\n", 30 | "from sklearn.svm import SVC" 31 | ], 32 | "language": "python", 33 | "metadata": {}, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "collapsed": false, 39 | "input": [ 40 | "from sklearn.datasets import load_digits\n", 41 | "from sklearn.cross_validation import train_test_split\n", 42 | "digits = load_digits()\n", 43 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 44 | ], 45 | "language": "python", 46 | "metadata": {}, 47 | "outputs": [] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Define parameter grid:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "collapsed": false, 59 | "input": [ 60 | "import numpy as np\n", 61 | "\n", 62 | "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma' : 10. ** np.arange(-5, 0)}\n", 63 | "\n", 64 | "np.set_printoptions(suppress=True)\n", 65 | "print(param_grid)" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "collapsed": false, 74 | "input": [ 75 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)" 76 | ], 77 | "language": "python", 78 | "metadata": {}, 79 | "outputs": [] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "A GridSearchCV object behaves just like a normal classifier." 
86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "collapsed": false, 91 | "input": [ 92 | "grid_search.fit(X_train, y_train)" 93 | ], 94 | "language": "python", 95 | "metadata": {}, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [ 102 | "grid_search.predict(X_test)" 103 | ], 104 | "language": "python", 105 | "metadata": {}, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "collapsed": false, 111 | "input": [ 112 | "grid_search.score(X_test, y_test)" 113 | ], 114 | "language": "python", 115 | "metadata": {}, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "collapsed": false, 121 | "input": [ 122 | "grid_search.best_params_" 123 | ], 124 | "language": "python", 125 | "metadata": {}, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "collapsed": false, 131 | "input": [ 132 | "# We extract just the scores\n", 133 | "%matplotlib inline\n", 134 | "import matplotlib.pyplot as plt\n", 135 | "\n", 136 | "scores = [x[1] for x in grid_search.grid_scores_]\n", 137 | "scores = np.array(scores).reshape(6, 5)\n", 138 | "\n", 139 | "plt.matshow(scores)\n", 140 | "plt.xlabel('gamma')\n", 141 | "plt.ylabel('C')\n", 142 | "plt.colorbar()\n", 143 | "plt.xticks(np.arange(5), param_grid['gamma'])\n", 144 | "plt.yticks(np.arange(6), param_grid['C']);" 145 | ], 146 | "language": "python", 147 | "metadata": {}, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "collapsed": false, 153 | "input": [], 154 | "language": "python", 155 | "metadata": {}, 156 | "outputs": [] 157 | } 158 | ], 159 | "metadata": {} 160 | } 161 | ] 162 | } -------------------------------------------------------------------------------- /Chapter 5 - Preprocessing and Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Preprocessing and Pipelines\n", 15 | "=============================" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "from sklearn.datasets import load_digits\n", 23 | "from sklearn.cross_validation import train_test_split\n", 24 | "digits = load_digits()\n", 25 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)" 26 | ], 27 | "language": "python", 28 | "metadata": {}, 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "When cross-validating pipelines that include scaling, we need to estimate the mean and standard deviation separately for each fold.\n", 36 | "To do that, we build a pipeline." 
37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "collapsed": false, 42 | "input": [ 43 | "from sklearn.pipeline import Pipeline\n", 44 | "from sklearn.svm import SVC\n", 45 | "from sklearn.preprocessing import StandardScaler" 46 | ], 47 | "language": "python", 48 | "metadata": {}, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"svm\", SVC())])\n", 56 | "# in new versions: make_pipeline(StandardScaler(), SVC())" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "pipeline.fit(X_train, y_train)" 67 | ], 68 | "language": "python", 69 | "metadata": {}, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "collapsed": false, 75 | "input": [ 76 | "pipeline.predict(X_test)" 77 | ], 78 | "language": "python", 79 | "metadata": {}, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Cross-validation with a pipeline\n", 87 | "---------------------------------" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "collapsed": false, 93 | "input": [ 94 | "from sklearn.cross_validation import cross_val_score\n", 95 | "cross_val_score(pipeline, X_train, y_train)" 96 | ], 97 | "language": "python", 98 | "metadata": {}, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "Grid Search with a pipeline\n", 106 | "===========================" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "collapsed": false, 112 | "input": [ 113 | "import numpy as np\n", 114 | "from sklearn.grid_search import GridSearchCV\n", 115 | "\n", 116 | "param_grid = {'svm__C': 10. ** np.arange(-3, 3), 'svm__gamma' : 10. 
** np.arange(-3, 3)}\n", 117 | "\n", 118 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)" 119 | ], 120 | "language": "python", 121 | "metadata": {}, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "collapsed": false, 127 | "input": [ 128 | "grid_pipeline.fit(X_train, y_train)" 129 | ], 130 | "language": "python", 131 | "metadata": {}, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "collapsed": false, 137 | "input": [ 138 | "grid_pipeline.score(X_test, y_test)" 139 | ], 140 | "language": "python", 141 | "metadata": {}, 142 | "outputs": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "collapsed": false, 147 | "input": [], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [] 151 | } 152 | ], 153 | "metadata": {} 154 | } 155 | ] 156 | } -------------------------------------------------------------------------------- /Chapter 6 - Working With Text Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import pandas as pd\n", 15 | "data = pd.read_csv(\"train.csv\")" 16 | ], 17 | "language": "python", 18 | "metadata": {}, 19 | "outputs": [] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "collapsed": false, 24 | "input": [ 25 | "len(data)" 26 | ], 27 | "language": "python", 28 | "metadata": {}, 29 | "outputs": [] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "data" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "collapsed": false, 44 | "input": [ 45 | "import numpy as np\n", 46 | "y_train = np.array(data.Insult)" 47 | ], 48 | "language": "python", 49 | "metadata": {}, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "collapsed": false, 55 | "input": [ 56 | "y_train" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "text_train = data.Comment.tolist()" 67 | ], 68 | "language": "python", 69 | "metadata": {}, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "collapsed": false, 75 | "input": [ 76 | "text_train[6]" 77 | ], 78 | "language": "python", 79 | "metadata": {}, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "data_test = pd.read_csv(\"test_with_solutions.csv\")" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "collapsed": false, 95 | "input": [ 96 | "text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)" 97 | ], 98 | "language": "python", 99 | "metadata": {}, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "from sklearn.feature_extraction.text import CountVectorizer" 107 | ], 108 | "language": "python", 109 | "metadata": {}, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "collapsed": false, 115 | "input": [ 116 | "cv = CountVectorizer()\n", 117 | "cv.fit(text_train)" 118 | ], 119 | "language": "python", 120 | "metadata": {}, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "collapsed": false, 126 | "input": [ 127 | "len(cv.vocabulary_)" 128 | ], 
129 | "language": "python", 130 | "metadata": {}, 131 | "outputs": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "collapsed": true, 136 | "input": [ 137 | "cv.vocabulary_" 138 | ], 139 | "language": "python", 140 | "metadata": {}, 141 | "outputs": [] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "collapsed": false, 146 | "input": [ 147 | "X_train = cv.transform(text_train)" 148 | ], 149 | "language": "python", 150 | "metadata": {}, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "collapsed": false, 156 | "input": [ 157 | "X_train" 158 | ], 159 | "language": "python", 160 | "metadata": {}, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "collapsed": false, 166 | "input": [ 167 | "text_train[6]" 168 | ], 169 | "language": "python", 170 | "metadata": {}, 171 | "outputs": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "collapsed": false, 176 | "input": [ 177 | "X_train[6, :].nonzero()[1]" 178 | ], 179 | "language": "python", 180 | "metadata": {}, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "collapsed": false, 186 | "input": [ 187 | "X_test = cv.transform(text_test)" 188 | ], 189 | "language": "python", 190 | "metadata": {}, 191 | "outputs": [] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "collapsed": false, 196 | "input": [ 197 | "from sklearn.svm import LinearSVC\n", 198 | "svm = LinearSVC(C=.01)" 199 | ], 200 | "language": "python", 201 | "metadata": {}, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "collapsed": false, 207 | "input": [ 208 | "svm.fit(X_train, y_train)" 209 | ], 210 | "language": "python", 211 | "metadata": {}, 212 | "outputs": [] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "collapsed": false, 217 | "input": [ 218 | "svm.score(X_train, y_train)" 219 | ], 220 | "language": "python", 221 | "metadata": {}, 222 | "outputs": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "collapsed": false, 227 | "input": [ 228 | "svm.score(X_test, y_test)" 229 | ], 230 | "language": "python", 231 | "metadata": {}, 232 | "outputs": [] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "collapsed": false, 237 | "input": [ 238 | "coef = svm.coef_.ravel()\n", 239 | "positive_coefficients = np.argsort(coef)[-25:]\n", 240 | "negative_coefficients = np.argsort(coef)[:25]\n", 241 | "interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n" 242 | ], 243 | "language": "python", 244 | "metadata": {}, 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "collapsed": false, 250 | "input": [ 251 | "%matplotlib inline\n", 252 | "import matplotlib.pyplot as plt\n", 253 | "\n", 254 | "plt.figure(figsize=(15, 5))\n", 255 | "plt.bar(np.arange(50), coef[interesting_coefficients], color=[\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]])\n", 256 | "feature_names = np.array(cv.get_feature_names())\n", 257 | "plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");" 258 | ], 259 | "language": "python", 260 | "metadata": {}, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "collapsed": false, 266 | "input": [ 267 | "from sklearn.pipeline import Pipeline" 268 | ], 269 | "language": "python", 270 | "metadata": {}, 271 | "outputs": [] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "collapsed": false, 276 | "input": [ 277 | "pipeline = Pipeline([('vectorizer', cv), ('classifier', svm)])" 278 | ], 279 | "language": "python", 280 | "metadata": {}, 281 | "outputs": [] 
282 | }, 283 | { 284 | "cell_type": "code", 285 | "collapsed": false, 286 | "input": [ 287 | "pipeline.fit(text_train, y_train)" 288 | ], 289 | "language": "python", 290 | "metadata": {}, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "collapsed": false, 296 | "input": [ 297 | "pipeline.score(text_test, y_test)" 298 | ], 299 | "language": "python", 300 | "metadata": {}, 301 | "outputs": [] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "collapsed": false, 306 | "input": [ 307 | "from sklearn.grid_search import GridSearchCV" 308 | ], 309 | "language": "python", 310 | "metadata": {}, 311 | "outputs": [] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "collapsed": false, 316 | "input": [ 317 | "param_grid = {'classifier__C': 10. ** np.arange(-3, 3)}\n", 318 | "grid_search = GridSearchCV(pipeline, param_grid=param_grid)" 319 | ], 320 | "language": "python", 321 | "metadata": {}, 322 | "outputs": [] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "collapsed": false, 327 | "input": [ 328 | "grid_search.fit(text_train, y_train)" 329 | ], 330 | "language": "python", 331 | "metadata": {}, 332 | "outputs": [] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "collapsed": false, 337 | "input": [ 338 | "grid_search.score(text_test, y_test)" 339 | ], 340 | "language": "python", 341 | "metadata": {}, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "collapsed": false, 347 | "input": [ 348 | "param_grid = {'classifier__C': 10. ** np.arange(-3, 3), \"vectorizer__ngram_range\": [(1, 1), (1, 2), (1, 3), (2, 3), (2, 2)]}\n", 349 | "grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)" 350 | ], 351 | "language": "python", 352 | "metadata": {}, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "collapsed": false, 358 | "input": [ 359 | "grid_search.fit(text_train, y_train)" 360 | ], 361 | "language": "python", 362 | "metadata": {}, 363 | "outputs": [] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "collapsed": false, 368 | "input": [ 369 | "grid_search.best_params_" 370 | ], 371 | "language": "python", 372 | "metadata": {}, 373 | "outputs": [] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "collapsed": false, 378 | "input": [ 379 | "grid_search.best_score_" 380 | ], 381 | "language": "python", 382 | "metadata": {}, 383 | "outputs": [] 384 | } 385 | ], 386 | "metadata": {} 387 | } 388 | ] 389 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. 
These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. 
Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 
114 | 115 | For more information, please see 116 | http://creativecommons.org/publicdomain/zero/1.0/ 117 | -------------------------------------------------------------------------------- /machine-learning-with-scikit-learn-strata-2015.odp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/pydata-strata-2015/4c8565f5ddf23c1652db90daadbcf2e9e2655a9f/machine-learning-with-scikit-learn-strata-2015.odp -------------------------------------------------------------------------------- /machine-learning-with-scikit-learn-strata-2015.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amueller/pydata-strata-2015/4c8565f5ddf23c1652db90daadbcf2e9e2655a9f/machine-learning-with-scikit-learn-strata-2015.pdf --------------------------------------------------------------------------------
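
Note on scikit-learn versions: the notebooks above were written against a 2015-era scikit-learn, so they import from `sklearn.cross_validation`, `sklearn.grid_search`, `sklearn.lda`, and `sklearn.qda`. Those modules have since been removed (cross-validation and grid search moved to `sklearn.model_selection`; LDA and QDA moved to `sklearn.discriminant_analysis` as `LinearDiscriminantAnalysis` and `QuadraticDiscriminantAnalysis`). As a minimal sketch, assuming scikit-learn 0.20 or later, the core digits workflow from Chapters 3-5 maps to the current import locations like this:

```python
# Minimal sketch of the Chapter 3-5 digits workflow on a current scikit-learn.
# Assumption: scikit-learn >= 0.20; the original notebooks target a ~2015 release.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

# Scale inside the pipeline so each cross-validation fold re-estimates the
# mean and standard deviation on its own training portion (the Chapter 5 point).
pipeline = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])
print(cross_val_score(pipeline, X_train, y_train))

# Grid search over the pipeline's SVM hyperparameters (Chapters 4 and 5).
param_grid = {"svm__C": 10. ** np.arange(-3, 3),
              "svm__gamma": 10. ** np.arange(-3, 3)}
grid = GridSearchCV(pipeline, param_grid=param_grid)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))
```

The estimator API itself (`fit`, `predict`, `transform`, `score`) is unchanged, so apart from the imports the main rename affecting these notebooks is `GridSearchCV.grid_scores_` (used in Chapter 4's heatmap cell), which was replaced by `cv_results_`.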