├── .gitignore
├── API Summary.ipynb
├── Cross-validation.ipynb
├── First Steps.ipynb
├── Grid Searches for Hyper Parameters.ipynb
├── Intro to Machine Learning and data representations.ipynb
├── LICENSE
├── Linear models.ipynb
├── Model Complexity.ipynb
├── Preprocessing and Pipelines.ipynb
├── Stochastic Gradient Descent.ipynb
├── Support Vector Machines.ipynb
├── Unsupervised Transformers.ipynb
├── Using built-in and custom score functions.ipynb
├── figures
│   ├── bag_of_words.svg
│   ├── cluster_comparison.png
│   ├── cross_validation.svg
│   ├── data_representation.svg
│   ├── feature_union.svg
│   ├── grid_search_cross_validation.svg
│   ├── hashing_vectorizer.svg
│   ├── overfitting_underfitting_cartoon.svg
│   ├── pipeline.svg
│   ├── pipeline_cross_validation.svg
│   ├── randomized_search.png
│   ├── supervised_workflow.svg
│   ├── train_test_split.svg
│   ├── train_test_split_matrix.svg
│   ├── train_validation_test2.svg
│   └── unsupervised_workflow.svg
├── outline.rst
├── plots
│   ├── __init__.py
│   ├── plot_2d_separator.py
│   ├── plot_interactive_forest.py
│   ├── plot_interactive_tree.py
│   ├── plot_kneighbors_regularization.py
│   ├── plot_linear_svc_regularization.py
│   └── plot_rbf_svm_parameters.py
└── solutions
    ├── cross_validation_iris.py
    ├── digits_unsupervised.py
    ├── forests.py
    ├── grid_search_forest.py
    ├── grid_search_k_neighbors.py
    ├── linear_models.py
    ├── load_iris.py
    ├── pipeline_iris.py
    ├── svms.py
    ├── train_iris.py
    └── validation_curve.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | notebooks/.ipynb_checkpoints/
3 | notebooks/datasets
4 | notebooks/joblib/
5 | .ipynb_checkpoints
6 |
--------------------------------------------------------------------------------
/API Summary.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# A recap on Scikit-learn's estimator interface\n"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "``X`` : data, 2d numpy array or scipy sparse matrix of shape (n_samples, n_features)\n",
15 | "\n",
16 | "``y`` : targets, 1d numpy array of shape (n_samples,)"
17 | ]
18 | },
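19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "A minimal added sketch of this layout, using the built-in iris data (any array of shape ``(n_samples, n_features)`` works the same way):"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "from sklearn.datasets import load_iris\n",
33 | "iris = load_iris()\n",
34 | "X, y = iris.data, iris.target\n",
35 | "print(X.shape)  # (n_samples, n_features)\n",
36 | "print(y.shape)  # (n_samples,)"
37 | ]
38 | },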
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Methods"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "``model.fit(X_train, [y_train])``\n",
31 | "\n",
32 | "``model.predict(X_test)`` | ``model.transform(X_test)``\n",
33 | "--- | ---\n",
34 | "Classification | Preprocessing\n",
35 | "Regression | Dimensionality Reduction\n",
36 | "Clustering | Feature Extraction\n",
37 | " | Feature selection"
38 | ]
39 | },
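40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "Concretely (a small added illustration reusing ``X`` and ``y`` from above): a classifier pairs ``fit`` with ``predict``, while a preprocessor pairs ``fit`` with ``transform``."
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "from sklearn.linear_model import LogisticRegression\n",
54 | "from sklearn.preprocessing import StandardScaler\n",
55 | "\n",
56 | "clf = LogisticRegression().fit(X, y)\n",
57 | "print(clf.predict(X[:5]))  # predicted labels\n",
58 | "\n",
59 | "scaler = StandardScaler().fit(X)\n",
60 | "print(scaler.transform(X[:5]))  # rescaled features"
61 | ]
62 | },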
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Efficient alternatives, methods for models that don't generalize\n",
45 | "``model.fit_predict(X)`` (clustering)\n",
46 | "\n",
47 | "``model.fit_transform(X)`` (manifold learning)"
48 | ]
49 | },
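50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "For example (an added sketch; ``KMeans`` and ``PCA`` stand in here for clustering and decomposition in general):"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "from sklearn.cluster import KMeans\n",
64 | "from sklearn.decomposition import PCA\n",
65 | "\n",
66 | "print(KMeans(n_clusters=3).fit_predict(X)[:10])  # cluster assignments\n",
67 | "print(PCA(n_components=2).fit_transform(X).shape)  # embedded data"
68 | ]
69 | },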
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Additional methods\n",
55 | "__Model evaluation__ : ``score(X, [y])``\n",
56 | "\n",
57 | "__Uncertainties from Classifiers__: ``decision_function(X)`` and ``predict_proba(X)``."
58 | ]
59 | },
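60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "A quick added look, using ``clf`` from above (``LogisticRegression`` happens to provide both uncertainty methods; not every classifier does):"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "print(clf.score(X, y))  # mean accuracy\n",
74 | "print(clf.predict_proba(X[:2]))  # class probabilities\n",
75 | "print(clf.decision_function(X[:2]))  # signed distances\n",
76 | "print(clf.classes_)  # one of the fitted attributes listed below"
77 | ]
78 | },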
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## Attributes\n",
65 | "__Classifiers__: ``classes_``\n",
66 | "\n",
67 | "__Clustering__: ``labels_``\n",
68 | "\n",
69 | "__Manifold Learning__: ``embedding_``\n",
70 | "\n",
71 | "__Linear models__: ``coef_``\n",
72 | "\n",
73 | "__Linear Decompositions__: ``components_``"
74 | ]
75 | }
76 | ],
77 | "metadata": {
78 | "kernelspec": {
79 | "display_name": "Python 2",
80 | "language": "python",
81 | "name": "python2"
82 | },
83 | "language_info": {
84 | "codemirror_mode": {
85 | "name": "ipython",
86 | "version": 2
87 | },
88 | "file_extension": ".py",
89 | "mimetype": "text/x-python",
90 | "name": "python",
91 | "nbconvert_exporter": "python",
92 | "pygments_lexer": "ipython2",
93 | "version": "2.7.10"
94 | }
95 | },
96 | "nbformat": 4,
97 | "nbformat_minor": 0
98 | }
99 |
--------------------------------------------------------------------------------
/Cross-validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Cross-Validation\n",
21 | "----------------------------------------"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "<img src=\"figures/cross_validation.svg\" width=100%>"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_iris"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [],
49 | "source": [
50 | "iris = load_iris()\n",
51 | "X = iris.data\n",
52 | "y = iris.target"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [],
62 | "source": [
63 | "from sklearn.cross_validation import cross_val_score\n",
64 | "from sklearn.svm import LinearSVC"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "cross_val_score(LinearSVC(), X, y, cv=5)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Let's go to a binary task for a moment"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [],
103 | "source": [
104 | "y % 2"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [],
114 | "source": [
115 | "cross_val_score(LinearSVC(), X, y % 2)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "collapsed": false
123 | },
124 | "outputs": [],
125 | "source": [
126 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [],
136 | "source": [
137 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [],
147 | "source": [
148 | "from sklearn.metrics.scorer import SCORERS\n",
149 | "print(SCORERS.keys())"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "Implementing your own scoring metric:"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "collapsed": false
164 | },
165 | "outputs": [],
166 | "source": [
167 | "def my_accuracy_scoring(est, X, y):\n",
168 | " return np.mean(est.predict(X) == y)\n",
169 | "\n",
170 | "cross_val_score(LinearSVC(), X, y, scoring=my_accuracy_scoring)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "collapsed": true
178 | },
179 | "outputs": [],
180 | "source": [
181 | "def my_super_scoring(est, X, y):\n",
182 | " return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {
189 | "collapsed": false
190 | },
191 | "outputs": [],
192 | "source": [
193 | "from sklearn.grid_search import GridSearchCV\n",
194 | "\n",
195 | "y = iris.target\n",
196 | "grid = GridSearchCV(LinearSVC(C=.01, dual=False),\n",
197 | " param_grid={'penalty' : ['l1', 'l2']},\n",
198 | " scoring=my_super_scoring)\n",
199 | "grid.fit(X, y)\n",
200 | "print(grid.best_params_)"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "There are other ways to do cross-valiation"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {
214 | "collapsed": false
215 | },
216 | "outputs": [],
217 | "source": [
218 | "from sklearn.cross_validation import ShuffleSplit\n",
219 | "\n",
220 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n",
221 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "collapsed": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit\n",
233 | "\n",
234 | "def plot_cv(cv, n_samples):\n",
235 | " masks = []\n",
236 | " for train, test in cv:\n",
237 | " mask = np.zeros(n_samples, dtype=bool)\n",
238 | " mask[test] = 1\n",
239 | " masks.append(mask)\n",
240 | " plt.matshow(masks)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {
247 | "collapsed": false
248 | },
249 | "outputs": [],
250 | "source": [
251 | "plot_cv(StratifiedKFold(y, n_folds=5), len(y))"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {
258 | "collapsed": false
259 | },
260 | "outputs": [],
261 | "source": [
262 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {
269 | "collapsed": false
270 | },
271 | "outputs": [],
272 | "source": [
273 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), \n",
274 | " len(iris.target))"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "source": [
283 | "# Exercises\n",
284 | "Use KFold cross validation and StratifiedKFold cross validation (3 or 5 folds) for LinearSVC on the iris dataset.\n",
285 | "Why are the results so different? How could you get more similar results?"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {
292 | "collapsed": false
293 | },
294 | "outputs": [],
295 | "source": [
296 | "# %load solutions/cross_validation_iris.py"
297 | ]
298 | }
299 | ],
300 | "metadata": {
301 | "kernelspec": {
302 | "display_name": "Python 2",
303 | "language": "python",
304 | "name": "python2"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 2
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython2",
316 | "version": "2.7.9"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 0
321 | }
322 |
--------------------------------------------------------------------------------
/First Steps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Get some data to play with"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.datasets import load_digits\n",
32 | "digits = load_digits()"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "from sklearn.cross_validation import train_test_split\n",
44 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n",
45 | " digits.target)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [],
55 | "source": [
56 | "X_train.shape"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "Really Simple API\n",
64 | "-------------------\n",
65 | "0) Import your model class"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [],
75 | "source": [
76 | "from sklearn.svm import LinearSVC"
77 | ]
78 | },
79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "1) Instantiate an object and set the parameters"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "svm = LinearSVC(C=0.1)"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "2) Fit the model"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": null,
107 | "metadata": {
108 | "collapsed": false
109 | },
110 | "outputs": [],
111 | "source": [
112 | "svm.fit(X_train, y_train)"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "3) Apply / evaluate"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "collapsed": false,
127 | "scrolled": true
128 | },
129 | "outputs": [],
130 | "source": [
131 | "print(svm.predict(X_test))\n"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [],
141 | "source": [
142 | "svm.score(X_train, y_train)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "svm.score(X_test, y_test)"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "And again\n",
161 | "---------"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "collapsed": false
169 | },
170 | "outputs": [],
171 | "source": [
172 | "from sklearn.ensemble import RandomForestClassifier"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "collapsed": false
180 | },
181 | "outputs": [],
182 | "source": [
183 | "rf = RandomForestClassifier(n_estimators=50)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "rf.fit(X_train, y_train)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "collapsed": false
202 | },
203 | "outputs": [],
204 | "source": [
205 | "rf.score(X_test, y_test)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {
212 | "collapsed": false
213 | },
214 | "outputs": [],
215 | "source": [
216 | "%load https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/examples/classification/plot_classifier_comparison.py"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "# Exercises\n",
224 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n",
225 | "\n",
226 | "Split it into training and test set using ``train_test_split``.\n",
227 | "Then train an evaluate a classifier of your choice.\n"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "collapsed": false
235 | },
236 | "outputs": [],
237 | "source": [
238 | "# %load solutions/train_iris.py"
239 | ]
240 | }
241 | ],
242 | "metadata": {
243 | "kernelspec": {
244 | "display_name": "Python 2",
245 | "language": "python",
246 | "name": "python2"
247 | },
248 | "language_info": {
249 | "codemirror_mode": {
250 | "name": "ipython",
251 | "version": 2
252 | },
253 | "file_extension": ".py",
254 | "mimetype": "text/x-python",
255 | "name": "python",
256 | "nbconvert_exporter": "python",
257 | "pygments_lexer": "ipython2",
258 | "version": "2.7.10"
259 | }
260 | },
261 | "nbformat": 4,
262 | "nbformat_minor": 0
263 | }
264 |
--------------------------------------------------------------------------------
/Grid Searches for Hyper Parameters.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Grid Searches\n",
21 | "================="
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "<img src=\"figures/grid_search_cross_validation.svg\" width=100%>"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Grid-Search with build-in cross validation"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [],
45 | "source": [
46 | "from sklearn.grid_search import GridSearchCV\n",
47 | "from sklearn.svm import SVC"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [],
57 | "source": [
58 | "from sklearn.datasets import load_digits\n",
59 | "from sklearn.cross_validation import train_test_split\n",
60 | "digits = load_digits()\n",
61 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n",
62 | " digits.target, random_state=0)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "Define parameter grid:"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [],
79 | "source": [
80 | "import numpy as np\n",
81 | "\n",
82 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n",
83 | " 'gamma' : 10. ** np.arange(-5, 0)}\n",
84 | "\n",
85 | "np.set_printoptions(suppress=True)\n",
86 | "print(param_grid)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "A GridSearchCV object behaves just like a normal classifier."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false,
112 | "scrolled": true
113 | },
114 | "outputs": [],
115 | "source": [
116 | "grid_search.fit(X_train, y_train)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "collapsed": false,
124 | "scrolled": true
125 | },
126 | "outputs": [],
127 | "source": [
128 | "grid_search.predict(X_test)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {
135 | "collapsed": false
136 | },
137 | "outputs": [],
138 | "source": [
139 | "grid_search.score(X_test, y_test)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {
146 | "collapsed": false
147 | },
148 | "outputs": [],
149 | "source": [
150 | "grid_search.best_params_"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "collapsed": false
158 | },
159 | "outputs": [],
160 | "source": [
161 | "# We extract just the scores\n",
162 | "\n",
163 | "scores = [x.mean_validation_score for x in grid_search.grid_scores_]\n",
164 | "scores = np.array(scores).reshape(6, 5)\n",
165 | "\n",
166 | "plt.matshow(scores)\n",
167 | "plt.xlabel('gamma')\n",
168 | "plt.ylabel('C')\n",
169 | "plt.colorbar()\n",
170 | "plt.xticks(np.arange(5), param_grid['gamma'])\n",
171 | "plt.yticks(np.arange(6), param_grid['C']);"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "Nested Cross-validation in scikit-learn:"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {
185 | "collapsed": false
186 | },
187 | "outputs": [],
188 | "source": [
189 | "from sklearn.cross_validation import cross_val_score\n",
190 | "# the grid search runs inside each fold of the outer cross-validation\n",
191 | "cross_val_score(GridSearchCV(SVC(), param_grid), X_train, y_train)"
192 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "metadata": {
193 | "collapsed": true
194 | },
195 | "source": [
196 | "# Exercises\n",
197 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier.\n",
198 | "Visualize ``grid_search.grid_scores_``."
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {
205 | "collapsed": true
206 | },
207 | "outputs": [],
208 | "source": [
209 | "from sklearn.neighbors import KNeighborsClassifier"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [],
219 | "source": [
220 | "# %load solutions/grid_search_k_neighbors.py"
221 | ]
222 | }
223 | ],
224 | "metadata": {
225 | "kernelspec": {
226 | "display_name": "Python 2",
227 | "language": "python",
228 | "name": "python2"
229 | },
230 | "language_info": {
231 | "codemirror_mode": {
232 | "name": "ipython",
233 | "version": 2
234 | },
235 | "file_extension": ".py",
236 | "mimetype": "text/x-python",
237 | "name": "python",
238 | "nbconvert_exporter": "python",
239 | "pygments_lexer": "ipython2",
240 | "version": "2.7.10"
241 | }
242 | },
243 | "nbformat": 4,
244 | "nbformat_minor": 0
245 | }
246 |
--------------------------------------------------------------------------------
/Intro to Machine Learning and data representations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# What is machine learning ?"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Supervised learning\n"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "source": [
23 | "<img src=\"figures/supervised_workflow.svg\" width=100%>"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "# Data Representations"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "<img src=\"figures/data_representation.svg\" width=100%>"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "# Dataset Split"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "<img src=\"figures/train_test_split.svg\" width=100%>"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": []
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": false
64 | },
65 | "outputs": [],
66 | "source": [
67 | "% matplotlib nbagg\n",
68 | "import matplotlib.pyplot as plt\n",
69 | "import numpy as np"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [],
79 | "source": [
80 | "from sklearn.datasets import load_digits\n",
81 | "digits = load_digits()\n",
82 | "digits.keys()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "digits.images.shape"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [],
103 | "source": [
104 | "print(digits.images[0])"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [],
114 | "source": [
115 | "plt.matshow(digits.images[0], cmap=plt.cm.Greys)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "collapsed": false
123 | },
124 | "outputs": [],
125 | "source": [
126 | "digits.data.shape"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [],
136 | "source": [
137 | "digits.target.shape"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [],
147 | "source": [
148 | "digits.target"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Splitting the data:"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": true
170 | },
171 | "outputs": [],
172 | "source": [
173 | "from sklearn.cross_validation import train_test_split\n",
174 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "# Exercises\n",
182 | "\n",
183 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n",
184 | "The function returns a dictionary-like object that has the same attributes as ``digits``.\n",
185 | "\n",
186 | "What is the number of classes, features and data points in this dataset?\n",
187 | "Use a scatterplot to visualize the dataset.\n",
188 | "\n",
189 | "You can look at ``DESCR`` attribute to learn more about the dataset."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {
196 | "collapsed": true
197 | },
198 | "outputs": [],
199 | "source": [
200 | "# %load solutions/load_iris.py"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {
207 | "collapsed": true
208 | },
209 | "outputs": [],
210 | "source": []
211 | }
212 | ],
213 | "metadata": {
214 | "kernelspec": {
215 | "display_name": "Python 2",
216 | "language": "python",
217 | "name": "python2"
218 | },
219 | "language_info": {
220 | "codemirror_mode": {
221 | "name": "ipython",
222 | "version": 2
223 | },
224 | "file_extension": ".py",
225 | "mimetype": "text/x-python",
226 | "name": "python",
227 | "nbconvert_exporter": "python",
228 | "pygments_lexer": "ipython2",
229 | "version": "2.7.9"
230 | }
231 | },
232 | "nbformat": 4,
233 | "nbformat_minor": 0
234 | }
235 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Andreas Mueller
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 |
--------------------------------------------------------------------------------
/Linear models.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import numpy as np\n",
13 | "import matplotlib.pyplot as plt"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Linear models for regression"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "\n",
28 | "```\n",
29 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_\n",
30 | "```"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [],
40 | "source": [
41 | "from sklearn.datasets import make_regression\n",
42 | "from sklearn.cross_validation import train_test_split\n",
43 | "\n",
44 | "X, y, true_coefficient = make_regression(n_samples=80, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)\n",
45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)\n",
46 | "print(X_train.shape)\n",
47 | "print(y_train.shape)"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "## Linear Regression\n",
55 | "\n",
56 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 $$"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": false,
64 | "scrolled": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "from sklearn.linear_model import LinearRegression\n",
69 | "linear_regression = LinearRegression().fit(X_train, y_train)\n",
70 | "print(\"R^2 on training set: %f\" % linear_regression.score(X_train, y_train))\n",
71 | "print(\"R^2 on test set: %f\" % linear_regression.score(X_test, y_test))"
72 | ]
73 | },
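74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "A quick added sanity check connecting the fitted attributes to the prediction formula above:"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "# manual prediction with coef_ and intercept_ matches predict()\n",
88 | "manual = np.dot(X_test, linear_regression.coef_) + linear_regression.intercept_\n",
89 | "print(np.allclose(manual, linear_regression.predict(X_test)))"
90 | ]
91 | },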
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [],
81 | "source": [
82 | "from sklearn.metrics import r2_score\n",
83 | "print(r2_score(np.dot(X, true_coefficient), y))"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "plt.figure(figsize=(10, 5))\n",
95 | "coefficient_sorting = np.argsort(true_coefficient)[::-1]\n",
96 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\")\n",
97 | "plt.plot(linear_regression.coef_[coefficient_sorting], \"o\", label=\"linear regression\")\n",
98 | "\n",
99 | "plt.legend()"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Ridge Regression (L2 penalty)\n",
107 | "\n",
108 | "$$ \\text{min}_{w,b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_2^2$$ "
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [],
118 | "source": [
119 | "from sklearn.linear_model import Ridge\n",
120 | "ridge_models = {}\n",
121 | "training_scores = []\n",
122 | "test_scores = []\n",
123 | "\n",
124 | "for alpha in [100, 10, 1, .01]:\n",
125 | " ridge = Ridge(alpha=alpha).fit(X_train, y_train)\n",
126 | " training_scores.append(ridge.score(X_train, y_train))\n",
127 | " test_scores.append(ridge.score(X_test, y_test))\n",
128 | " ridge_models[alpha] = ridge\n",
129 | "\n",
130 | "plt.figure()\n",
131 | "plt.plot(training_scores, label=\"training scores\")\n",
132 | "plt.plot(test_scores, label=\"test scores\")\n",
133 | "plt.xticks(range(4), [100, 10, 1, .01])\n",
134 | "plt.legend(loc=\"best\")"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "collapsed": false
142 | },
143 | "outputs": [],
144 | "source": [
145 | "plt.figure(figsize=(10, 5))\n",
146 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n",
147 | "\n",
148 | "for i, alpha in enumerate([100, 10, 1, .01]):\n",
149 | " plt.plot(ridge_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n",
150 | " \n",
151 | "plt.legend(loc=\"best\")"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Lasso (L1 penalty)\n",
159 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_1$$ "
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "collapsed": false
167 | },
168 | "outputs": [],
169 | "source": [
170 | "from sklearn.linear_model import Lasso\n",
171 | "\n",
172 | "lasso_models = {}\n",
173 | "training_scores = []\n",
174 | "test_scores = []\n",
175 | "\n",
176 | "for alpha in [30, 10, 1, .01]:\n",
177 | " lasso = Lasso(alpha=alpha).fit(X_train, y_train)\n",
178 | " training_scores.append(lasso.score(X_train, y_train))\n",
179 | " test_scores.append(lasso.score(X_test, y_test))\n",
180 | " lasso_models[alpha] = lasso\n",
181 | "plt.figure()\n",
182 | "plt.plot(training_scores, label=\"training scores\")\n",
183 | "plt.plot(test_scores, label=\"test scores\")\n",
184 | "plt.xticks(range(4), [30, 10, 1, .01])\n",
185 | "plt.legend(loc=\"best\")"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "collapsed": false
193 | },
194 | "outputs": [],
195 | "source": [
196 | "plt.figure(figsize=(10, 5))\n",
197 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n",
198 | "\n",
199 | "for i, alpha in enumerate([30, 10, 1, .01]):\n",
200 | " plt.plot(lasso_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n",
201 | " \n",
202 | "plt.legend(loc=\"best\")"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "## Linear models for classification"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "\n",
217 | "```\n",
218 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0\n",
219 | "```"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "The influence of C in LinearSVC"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [],
236 | "source": [
237 | "from plots import plot_linear_svc_regularization\n",
238 | "plot_linear_svc_regularization()"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "## Multi-Class linear classification"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {
252 | "collapsed": false
253 | },
254 | "outputs": [],
255 | "source": [
256 | "from sklearn.datasets import make_blobs\n",
257 | "plt.figure()\n",
258 | "X, y = make_blobs(random_state=42)\n",
259 | "plt.scatter(X[:, 0], X[:, 1], c=y)"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "collapsed": false
267 | },
268 | "outputs": [],
269 | "source": [
270 | "from sklearn.svm import LinearSVC\n",
271 | "linear_svm = LinearSVC().fit(X, y)\n",
272 | "print(linear_svm.coef_.shape)\n",
273 | "print(linear_svm.intercept_.shape)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "outputs": [],
283 | "source": [
284 | "plt.figure()\n",
285 | "plt.scatter(X[:, 0], X[:, 1], c=y)\n",
286 | "line = np.linspace(-15, 15)\n",
287 | "for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):\n",
288 | " plt.plot(line, -(line * coef[0] + intercept) / coef[1])\n",
289 | "plt.ylim(-10, 15)\n",
290 | "plt.xlim(-10, 8)"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "# Exercises"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "* Compare Logistic regression with l1 penalty and l2 penalty by plotting the coefficients as above for the digits dataset. Classify odd vs even digits to make it a binary task."
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "collapsed": true
312 | },
313 | "outputs": [],
314 | "source": [
315 | "y % 2"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": null,
321 | "metadata": {
322 | "collapsed": false
323 | },
324 | "outputs": [],
325 | "source": [
326 | "# %load solutions/linear_models.py"
327 | ]
328 | }
329 | ],
330 | "metadata": {
331 | "kernelspec": {
332 | "display_name": "Python 2",
333 | "language": "python",
334 | "name": "python2"
335 | },
336 | "language_info": {
337 | "codemirror_mode": {
338 | "name": "ipython",
339 | "version": 2
340 | },
341 | "file_extension": ".py",
342 | "mimetype": "text/x-python",
343 | "name": "python",
344 | "nbconvert_exporter": "python",
345 | "pygments_lexer": "ipython2",
346 | "version": "2.7.10"
347 | }
348 | },
349 | "nbformat": 4,
350 | "nbformat_minor": 0
351 | }
352 |
--------------------------------------------------------------------------------
/Model Complexity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import matplotlib.pyplot as plt\n",
12 | "import numpy as np\n",
13 | "%matplotlib nbagg"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Model Complexity, Overfitting and Underfitting\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from plots import plot_kneighbors_regularization\n",
32 | "plot_kneighbors_regularization()"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | ""
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "source": [
48 | "# Validation Curves"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "from sklearn.datasets import load_digits\n",
60 | "from sklearn.ensemble import RandomForestClassifier\n",
61 | "from sklearn.learning_curve import validation_curve"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "digits = load_digits()\n",
73 | "X, y = digits.data, digits.target"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [],
83 | "source": [
84 | "model = RandomForestClassifier(n_estimators=20)\n",
85 | "param_range = range(1, 13)\n",
86 | "training_scores, validation_scores = validation_curve(model, X, y,\n",
87 | " param_name=\"max_depth\",\n",
88 | " param_range=param_range, cv=5)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "collapsed": false
96 | },
97 | "outputs": [],
98 | "source": [
99 | "training_scores.shape"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": true
107 | },
108 | "outputs": [],
109 | "source": [
110 | "def plot_validation_curve(parameter_values, train_scores, validation_scores):\n",
111 | " train_scores_mean = np.mean(train_scores, axis=1)\n",
112 | " train_scores_std = np.std(train_scores, axis=1)\n",
113 | " validation_scores_mean = np.mean(validation_scores, axis=1)\n",
114 | " validation_scores_std = np.std(validation_scores, axis=1)\n",
115 | "\n",
116 | " plt.fill_between(parameter_values, train_scores_mean - train_scores_std,\n",
117 | " train_scores_mean + train_scores_std, alpha=0.1,\n",
118 | " color=\"r\")\n",
119 | " plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,\n",
120 | " validation_scores_mean + validation_scores_std, alpha=0.1, color=\"g\")\n",
121 | " plt.plot(parameter_values, train_scores_mean, 'o-', color=\"r\",\n",
122 | " label=\"Training score\")\n",
123 | " plt.plot(parameter_values, validation_scores_mean, 'o-', color=\"g\",\n",
124 | " label=\"Cross-validation score\")\n",
125 | " plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)\n",
126 | " plt.legend(loc=\"best\")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [],
136 | "source": [
137 | "plt.figure()\n",
138 | "plot_validation_curve(param_range, training_scores, validation_scores)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "# Exercise\n",
146 | "\n",
147 | "Plot the validation curve on the digit dataset for:\n",
148 | "* a LinearSVC with a logarithmic range of regularization parameters ``C``.\n",
149 | "* KNeighborsClassifier with a linear range of neighbors ``k``.\n",
150 | "\n",
151 | "What do you expect them to look like? How do they actually look like?"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "# %load solutions/validation_curve.py"
163 | ]
164 | }
165 | ],
166 | "metadata": {
167 | "kernelspec": {
168 | "display_name": "Python 2",
169 | "language": "python",
170 | "name": "python2"
171 | },
172 | "language_info": {
173 | "codemirror_mode": {
174 | "name": "ipython",
175 | "version": 2
176 | },
177 | "file_extension": ".py",
178 | "mimetype": "text/x-python",
179 | "name": "python",
180 | "nbconvert_exporter": "python",
181 | "pygments_lexer": "ipython2",
182 | "version": "2.7.9"
183 | }
184 | },
185 | "nbformat": 4,
186 | "nbformat_minor": 0
187 | }
188 |
--------------------------------------------------------------------------------
/Preprocessing and Pipelines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Preprocessing and Pipelines\n",
21 | "============================="
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "<img src=\"figures/pipeline.svg\" width=100%>"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_digits\n",
40 | "from sklearn.cross_validation import train_test_split\n",
41 | "digits = load_digits()\n",
42 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "Cross-validated pipelines including scaling, we need to estimate mean and standard deviation separately for each fold.\n",
50 | "To do that, we build a pipeline."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "collapsed": false
58 | },
59 | "outputs": [],
60 | "source": [
61 | "from sklearn.pipeline import Pipeline, make_pipeline\n",
62 | "from sklearn.svm import SVC\n",
63 | "from sklearn.preprocessing import StandardScaler"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {
70 | "collapsed": false
71 | },
72 | "outputs": [],
73 | "source": [
74 | "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"svm\", SVC())])\n",
75 | "# or for short:\n",
76 | "make_pipeline(StandardScaler(), SVC())"
77 | ]
78 | },
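79 | {
80 | "cell_type": "markdown",
81 | "metadata": {},
82 | "source": [
83 | "Each step has a name (``make_pipeline`` derives them from the class names); the ``stepname__parameter`` syntax used for the grid search below refers to these names. A quick way to inspect them:"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "print(pipeline.named_steps.keys())"
93 | ]
94 | },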
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [],
86 | "source": [
87 | "pipeline.fit(X_train, y_train)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [],
97 | "source": [
98 | "pipeline.predict(X_test)"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "<img src=\"figures/pipeline_cross_validation.svg\" width=100%>"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Cross-validation with a pipeline\n",
113 | "---------------------------------"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "collapsed": false
121 | },
122 | "outputs": [],
123 | "source": [
124 | "from sklearn.cross_validation import cross_val_score\n",
125 | "cross_val_score(pipeline, X_train, y_train)"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "Grid Search with a pipeline\n",
133 | "==========================="
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "collapsed": false
141 | },
142 | "outputs": [],
143 | "source": [
144 | "from sklearn.grid_search import GridSearchCV\n",
145 | "\n",
146 | "param_grid = {'svm__C': 10. ** np.arange(-3, 3),\n",
147 | " 'svm__gamma' : 10. ** np.arange(-3, 3)}\n",
148 | "\n",
149 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [],
159 | "source": [
160 | "grid_pipeline.fit(X_train, y_train)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {
167 | "collapsed": false
168 | },
169 | "outputs": [],
170 | "source": [
171 | "grid_pipeline.score(X_test, y_test)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {
177 | "collapsed": false
178 | },
179 | "source": [
180 | "# Exercises\n",
181 | "Add random features to the iris dataset using ``np.random.uniform`` and ``np.hstack``.\n",
182 | "\n",
183 | "Build a pipeline using the SelectKBest univariate feature selection from the sklearn.feature_selection module and the LinearSVC on the iris dataset.\n",
184 | "\n",
185 | "Use GridSearchCV to adjust C and the number of features selected in SelectKBest."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "collapsed": false,
193 | "scrolled": true
194 | },
195 | "outputs": [],
196 | "source": [
197 | "# %load solutions/pipeline_iris.py"
198 | ]
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 2",
204 | "language": "python",
205 | "name": "python2"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 2
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython2",
217 | "version": "2.7.9"
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 0
222 | }
223 |
--------------------------------------------------------------------------------
/Stochastic Gradient Descent.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from sklearn.datasets import load_digits\n",
12 | "from sklearn.linear_model import SGDClassifier\n",
13 | "from sklearn.preprocessing import StandardScaler\n",
14 | "from sklearn.cross_validation import train_test_split\n",
15 | "\n",
16 | "digits = load_digits()\n",
17 | "\n",
18 | "\n",
19 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)\n",
20 | "scaler = StandardScaler()\n",
21 | "X_train_scaled = scaler.fit_transform(X_train)\n",
22 | "X_test_scaled = scaler.transform(X_test)"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {
29 | "collapsed": false
30 | },
31 | "outputs": [],
32 | "source": [
33 | "sgd = SGDClassifier(n_iter=5, loss=\"hinge\", penalty=\"l2\")\n",
34 | "sgd.fit(X_train_scaled, y_train)\n",
35 | "print(sgd.score(X_test_scaled, y_test))"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [],
45 | "source": [
46 | "sgd = SGDClassifier(shuffle=False)\n",
47 | "sgd.partial_fit(X_train_scaled, y_train, classes=range(10))\n",
48 | "print(sgd.score(X_test_scaled, y_test))\n",
49 | "sgd.partial_fit(X_train_scaled, y_train)\n",
50 | "print(sgd.score(X_test_scaled, y_test))\n",
51 | "sgd.partial_fit(X_train_scaled, y_train)\n",
52 | "print(sgd.score(X_test_scaled, y_test))"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "collapsed": false
59 | },
60 | "source": [
61 | "# Exercise\n",
62 | "Record the training and test loss for 10 iterations using constant learning rate and \"invscaling\" learning rate.\n",
63 | "Plot the resulting convergence curves. Try different learning rates."
64 | ]
65 | }
66 | ],
67 | "metadata": {
68 | "kernelspec": {
69 | "display_name": "Python 2",
70 | "language": "python",
71 | "name": "python2"
72 | },
73 | "language_info": {
74 | "codemirror_mode": {
75 | "name": "ipython",
76 | "version": 2
77 | },
78 | "file_extension": ".py",
79 | "mimetype": "text/x-python",
80 | "name": "python",
81 | "nbconvert_exporter": "python",
82 | "pygments_lexer": "ipython2",
83 | "version": "2.7.10"
84 | }
85 | },
86 | "nbformat": 4,
87 | "nbformat_minor": 0
88 | }
89 |
--------------------------------------------------------------------------------
/Support Vector Machines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import numpy as np\n",
13 | "import matplotlib.pyplot as plt"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Support Vector Machines"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.datasets import load_digits\n",
32 | "from sklearn.cross_validation import train_test_split\n",
33 | "\n",
34 | "digits = load_digits()\n",
35 | "X_train, X_test, y_train, y_test = train_test_split(digits.data / 16., digits.target % 2, random_state=2)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [],
45 | "source": [
46 | "from sklearn.svm import LinearSVC, SVC\n",
47 | "linear_svc = LinearSVC(loss=\"hinge\").fit(X_train, y_train)\n",
48 | "svc = SVC(kernel=\"linear\").fit(X_train, y_train)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [],
58 | "source": [
59 | "np.mean(linear_svc.predict(X_test) == svc.predict(X_test))"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## Kernel SVMs\n",
67 | "\n",
68 | "\n",
69 | "Predictions in a kernel-SVM are made using the formular\n",
70 | "\n",
71 | "$$\n",
72 | "\\hat{y} = \\alpha_0 + \\alpha_1 y_1 k(\\mathbf{x^{(1)}}, \\mathbf{x}) + ... + \\alpha_n y_n k(\\mathbf{x^{(n)}}, \\mathbf{x})> 0\n",
73 | "$$\n",
74 | "\n",
75 | "$$\n",
76 | "0 \\leq \\alpha_i \\leq C\n",
77 | "$$\n",
78 | "\n"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "Radial basis function (Gaussian) kernel:\n",
86 | "$$k(\\mathbf{x}, \\mathbf{x'}) = \\exp(-\\gamma ||\\mathbf{x} - \\mathbf{x'}||^2)$$"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "from sklearn.metrics.pairwise import rbf_kernel\n",
98 | "line = np.linspace(-3, 3, 100)[:, np.newaxis]\n",
99 | "kernel_value = rbf_kernel([[0]], line, gamma=1)\n",
100 | "plt.plot(line, kernel_value.T)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [],
110 | "source": [
111 | "from plots import plot_svm_interactive\n",
112 | "plot_svm_interactive()"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": false
120 | },
121 | "outputs": [],
122 | "source": [
123 | "svc = SVC().fit(X_train, y_train)\n",
124 | "svc.score(X_test, y_test)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [],
134 | "source": [
135 | "Cs = [0.001, 0.01, 0.1, 1, 10, 100]\n",
136 | "gammas = [0.001, 0.01, 0.1, 1, 10, 100]\n",
137 | "\n",
138 | "from sklearn.grid_search import GridSearchCV\n",
139 | "\n",
140 | "param_grid = {'C': Cs, 'gamma' : gammas}\n",
141 | "grid_search = GridSearchCV(SVC(), param_grid, cv=5)\n",
142 | "grid_search.fit(X_train, y_train)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "grid_search.score(X_test, y_test)"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "collapsed": false
161 | },
162 | "outputs": [],
163 | "source": [
164 | "# We extract just the scores\n",
165 | "scores = [x[1] for x in grid_search.grid_scores_]\n",
166 | "scores = np.array(scores).reshape(6, 6)\n",
167 | "\n",
168 | "plt.matshow(scores)\n",
169 | "plt.xlabel('gamma')\n",
170 | "plt.ylabel('C')\n",
171 | "plt.colorbar()\n",
172 | "plt.xticks(np.arange(6), param_grid['gamma'])\n",
173 | "plt.yticks(np.arange(6), param_grid['C']);"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {
179 | "collapsed": true
180 | },
181 | "source": [
182 | "# Excercise\n",
183 | "* Scale the data using StandardScaler before applying the SVC. How does the performance of the default parameters change?\n",
184 | "* Grid-Search the parameters for the scaled data. How do they differ from the previous ones?"
185 | ]
186 | }
187 | ],
188 | "metadata": {
189 | "kernelspec": {
190 | "display_name": "Python 2",
191 | "language": "python",
192 | "name": "python2"
193 | },
194 | "language_info": {
195 | "codemirror_mode": {
196 | "name": "ipython",
197 | "version": 2
198 | },
199 | "file_extension": ".py",
200 | "mimetype": "text/x-python",
201 | "name": "python",
202 | "nbconvert_exporter": "python",
203 | "pygments_lexer": "ipython2",
204 | "version": "2.7.10"
205 | }
206 | },
207 | "nbformat": 4,
208 | "nbformat_minor": 0
209 | }
210 |
--------------------------------------------------------------------------------
/Unsupervised Transformers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "
"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.datasets import load_digits\n",
32 | "from sklearn.cross_validation import train_test_split\n",
33 | "import numpy as np\n",
34 | "np.set_printoptions(suppress=True)\n",
35 | "\n",
36 | "digits = load_digits()\n",
37 | "X, y = digits.data, digits.target\n",
38 | "X_train, X_test, y_train, y_test = train_test_split(X, y)"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Removing mean and scaling variance\n",
46 | "==================================="
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "from sklearn.preprocessing import StandardScaler"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "1) Instantiate the model"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "scaler = StandardScaler()"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "2) Fit using only the data."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "scaler.fit(X_train)"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "3) `transform` the data (not `predict`)."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [],
110 | "source": [
111 | "X_train_scaled = scaler.transform(X_train)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "X_train.shape"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [],
132 | "source": [
133 | "X_train_scaled.shape"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "The transformed version of the data has the mean removed:"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "collapsed": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "X_train_scaled.mean(axis=0)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "X_train_scaled.std(axis=0)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [],
172 | "source": [
173 | "X_test_transformed = scaler.transform(X_test)"
174 | ]
175 | },
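176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "Note that the test data is scaled with the mean and standard deviation learned on the training set, so its mean and variance are only approximately 0 and 1. As a shortcut, ``fit_transform`` chains steps 2) and 3) in a single call (a small sketch):"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {
187 | "collapsed": false
188 | },
189 | "outputs": [],
190 | "source": [
191 | "# fit_transform(X_train) is equivalent to fit(X_train).transform(X_train)\n",
192 | "X_scaled_again = StandardScaler().fit_transform(X_train)\n",
193 | "np.allclose(X_train_scaled, X_scaled_again)"
194 | ]
195 | },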
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "Principal Component Analysis\n",
181 | "============================="
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "0) Import the model"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {
195 | "collapsed": false
196 | },
197 | "outputs": [],
198 | "source": [
199 | "from sklearn.decomposition import PCA"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "1) Instantiate the model"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {
213 | "collapsed": false
214 | },
215 | "outputs": [],
216 | "source": [
217 | "pca = PCA(n_components=2)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "2) Fit to training data"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {
231 | "collapsed": false
232 | },
233 | "outputs": [],
234 | "source": [
235 | "pca.fit(X)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "3) Transform to lower-dimensional representation"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [],
252 | "source": [
253 | "print(X.shape)\n",
254 | "X_pca = pca.transform(X)\n",
255 | "X_pca.shape"
256 | ]
257 | },
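258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "A quick check of how much of the variance the two components capture:"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {
269 | "collapsed": false
270 | },
271 | "outputs": [],
272 | "source": [
273 | "# fraction of the total variance explained by each component\n",
274 | "print(pca.explained_variance_ratio_)\n",
275 | "print(pca.explained_variance_ratio_.sum())"
276 | ]
277 | },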
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "Visualize\n",
263 | "----------"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {
270 | "collapsed": false
271 | },
272 | "outputs": [],
273 | "source": [
274 | "plt.figure()\n",
275 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": null,
281 | "metadata": {
282 | "collapsed": false
283 | },
284 | "outputs": [],
285 | "source": [
286 | "pca.components_.shape"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "collapsed": false
294 | },
295 | "outputs": [],
296 | "source": [
297 | "plt.matshow(pca.components_[0].reshape(8, 8), cmap=\"gray\")\n",
298 | "plt.colorbar()\n",
299 | "plt.matshow(pca.components_[1].reshape(8, 8), cmap=\"gray\")\n",
300 | "plt.colorbar()"
301 | ]
302 | },
303 | {
304 | "cell_type": "markdown",
305 | "metadata": {
306 | "collapsed": false
307 | },
308 | "source": [
309 | "Manifold Learning\n",
310 | "=================="
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "metadata": {
317 | "collapsed": true
318 | },
319 | "outputs": [],
320 | "source": [
321 | "from sklearn.manifold import Isomap\n",
322 | "isomap = Isomap()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {
329 | "collapsed": false
330 | },
331 | "outputs": [],
332 | "source": [
333 | "X_isomap = isomap.fit_transform(X)"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": false
341 | },
342 | "outputs": [],
343 | "source": [
344 | "plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {
350 | "collapsed": true
351 | },
352 | "source": [
353 | "# Exercises\n",
354 | "* Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).\n",
355 | "* Extract non-negative components from the digits dataset using NMF. Visualize the resulting components. The interface of NMF is identical to the PCA one. What qualitative difference can you find compared to PCA?"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {
362 | "collapsed": false
363 | },
364 | "outputs": [],
365 | "source": [
366 | "# %load solutions/digits_unsupervised.py\n",
367 | "from sklearn.manifold import TSNE\n",
368 | "from sklearn.decomposition import NMF\n",
369 | "\n",
370 | "# Compute TSNE embedding\n",
371 | "tsne = TSNE()\n",
372 | "X_tsne = tsne.fit_transform(X)\n",
373 | "\n",
374 | "# Visualize TSNE results\n",
375 | "plt.title(\"All classes\")\n",
376 | "plt.figure()\n",
377 | "plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)\n",
378 | "\n",
379 | "# build an NMF factorization of the digits dataset\n",
380 | "nmf = NMF(n_components=16).fit(X)\n",
381 | "\n",
382 | "# visualize the components\n",
383 | "fig, axes = plt.subplots(4, 4)\n",
384 | "for ax, component in zip(axes.ravel(), nmf.components_):\n",
385 | " ax.imshow(component.reshape(8, 8), cmap=\"gray\", interpolation=\"nearest\")\n"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {
392 | "collapsed": true
393 | },
394 | "outputs": [],
395 | "source": []
396 | }
397 | ],
398 | "metadata": {
399 | "kernelspec": {
400 | "display_name": "Python 2",
401 | "language": "python",
402 | "name": "python2"
403 | },
404 | "language_info": {
405 | "codemirror_mode": {
406 | "name": "ipython",
407 | "version": 2
408 | },
409 | "file_extension": ".py",
410 | "mimetype": "text/x-python",
411 | "name": "python",
412 | "nbconvert_exporter": "python",
413 | "pygments_lexer": "ipython2",
414 | "version": "2.7.10"
415 | }
416 | },
417 | "nbformat": 4,
418 | "nbformat_minor": 0
419 | }
420 |
--------------------------------------------------------------------------------
/Using built-in and custom score functions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np\n",
14 | "np.set_printoptions(precision=2)"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Built-In and custom scoring functions\n",
22 | "======================================="
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "### Using built-in scoring functions"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {
36 | "collapsed": false
37 | },
38 | "outputs": [],
39 | "source": [
40 | "from sklearn.datasets import make_classification\n",
41 | "from sklearn.cross_validation import train_test_split\n",
42 | "\n",
43 | "X, y = make_classification(random_state=0)\n",
44 | "\n",
45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [],
55 | "source": [
56 | "from sklearn.linear_model import LogisticRegression\n",
57 | "\n",
58 | "lr = LogisticRegression()\n",
59 | "lr.fit(X_train, y_train)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [],
69 | "source": [
70 | "print(lr.score(X_test, y_test))"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "pred = lr.predict(X_test)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {
88 | "collapsed": false
89 | },
90 | "outputs": [],
91 | "source": [
92 | "from sklearn.metrics import confusion_matrix\n",
93 | "print(confusion_matrix(y_test, pred))"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "Binary confusion matrix:\n",
101 | "\n",
102 | "\n",
103 | "True Positive (TP) | False Negative (FN) |
\n",
104 | "False Positive (FP) | True Negative (TN) |
\n",
105 | "
\n"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "\n",
113 | "$$ \\text{precision} = \\frac{TP}{FP + TP} $$\n",
114 | "\n",
115 | "$$ \\text{recall} = \\frac{TP}{FN + TP} $$\n",
116 | "\n",
117 | "$$ \\text{accuracy} = \\frac{TP + TN}{FP + FN + TP + TN} $$\n",
118 | "\n",
119 | "$$ f_1 = 2 \\frac{\\text{precision} \\cdot \\text{recall}}{\\text{precision} + \\text{recall}} $$\n"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [],
129 | "source": [
130 | "from sklearn.metrics import classification_report\n",
131 | "print(classification_report(y_test, pred))"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [],
141 | "source": [
142 | "from sklearn.metrics import precision_score, f1_score\n",
143 | "print(\"precision: %f f1_score: %f\" % (precision_score(y_test, pred), f1_score(y_test, pred)))"
144 | ]
145 | },
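146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "These formulas are easy to verify by hand from the confusion matrix. Note that in scikit-learn's ``confusion_matrix`` rows are the true classes and columns the predicted classes, so with labels ``[0, 1]`` the entries unravel as TN, FP, FN, TP (a small sketch):"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {
157 | "collapsed": false
158 | },
159 | "outputs": [],
160 | "source": [
161 | "# recompute precision and recall by hand from the confusion matrix\n",
162 | "tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()\n",
163 | "print(\"precision: %f recall: %f\" % (tp / float(fp + tp), tp / float(fn + tp)))"
164 | ]
165 | },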
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {
150 | "collapsed": false
151 | },
152 | "outputs": [],
153 | "source": [
154 | "from sklearn.metrics import roc_auc_score, average_precision_score, log_loss\n",
155 | "\n",
156 | "probs = lr.predict_proba(X_test)[:, 1]\n",
157 | "\n",
158 | "print(\"area under the roc_curve: %f\" % roc_auc_score(y_test, probs))\n",
159 | "print(\"average precision: %f\" % average_precision_score(y_test, probs))\n",
160 | "print(\"log loss: %f\" % log_loss(y_test, probs))"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "## Scorers for cross-validation and grid-search"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "collapsed": false
175 | },
176 | "outputs": [],
177 | "source": [
178 | "from sklearn.metrics.scorer import SCORERS\n",
179 | "print(SCORERS.keys())"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [],
189 | "source": [
190 | "from sklearn.cross_validation import cross_val_score\n",
191 | "\n",
192 | "cross_val_score(LogisticRegression(), X, y)"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {
199 | "collapsed": false
200 | },
201 | "outputs": [],
202 | "source": [
203 | "print(\"Accuracy scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"accuracy\"))\n",
204 | "print(\"F1 scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"f1\"))\n",
205 | "print(\"AUC scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"roc_auc\"))\n",
206 | "print(\"Log loss scoring: %s\" % cross_val_score(LogisticRegression(), X, y, scoring=\"log_loss\"))"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {
213 | "collapsed": false
214 | },
215 | "outputs": [],
216 | "source": [
217 | "from sklearn.grid_search import GridSearchCV\n",
218 | "\n",
219 | "param_grid = {'C': np.logspace(start=-3, stop=3, num=10)}\n",
220 | "grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring=\"log_loss\")\n",
221 | "grid_search.fit(X, y)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "collapsed": false
229 | },
230 | "outputs": [],
231 | "source": [
232 | "grid_search.grid_scores_"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {
239 | "collapsed": false
240 | },
241 | "outputs": [],
242 | "source": [
243 | "grid_search.best_params_"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "## Defining your own scoring callable"
251 | ]
252 | },
253 | {
254 | "cell_type": "markdown",
255 | "metadata": {},
256 | "source": [
257 | "### From scratch"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": null,
263 | "metadata": {
264 | "collapsed": false
265 | },
266 | "outputs": [],
267 | "source": [
268 | "def my_accuracy_scoring(est, X, y):\n",
269 | " return np.mean(est.predict(X) == y)\n",
270 | "\n",
271 | "print(cross_val_score(LogisticRegression(), X, y))\n",
272 | "print(cross_val_score(LogisticRegression(), X, y, scoring=my_accuracy_scoring))"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {},
278 | "source": [
279 | "### From a score function"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "metadata": {
286 | "collapsed": false
287 | },
288 | "outputs": [],
289 | "source": [
290 | "from sklearn.metrics import fbeta_score\n",
291 | "fbeta_score(y_test, pred, beta=10)"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": null,
297 | "metadata": {
298 | "collapsed": false
299 | },
300 | "outputs": [],
301 | "source": [
302 | "from sklearn.metrics.scorer import make_scorer\n",
303 | "my_fbeta_scorer = make_scorer(fbeta_score, beta=10)\n",
304 | "\n",
305 | "print(cross_val_score(LogisticRegression(), X, y, scoring=my_fbeta_scorer))"
306 | ]
307 | },
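308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "``make_scorer`` can also wrap metrics that need probability estimates instead of hard predictions, and flip the sign of losses so that greater is always better (a small sketch using ``log_loss``):"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [],
322 | "source": [
323 | "from sklearn.metrics import log_loss\n",
324 | "my_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)\n",
325 | "print(cross_val_score(LogisticRegression(), X, y, scoring=my_log_loss_scorer))"
326 | ]
327 | },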
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "### Accessing the estimator"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": true
320 | },
321 | "outputs": [],
322 | "source": [
323 | "def my_sparse_scoring(est, X, y):\n",
324 | " return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {
331 | "collapsed": false
332 | },
333 | "outputs": [],
334 | "source": [
335 | "from sklearn.grid_search import GridSearchCV\n",
336 | "from sklearn.svm import LinearSVC\n",
337 | "\n",
338 | "grid = GridSearchCV(LinearSVC(C=.01, dual=False),\n",
339 | " param_grid={'penalty' : ['l1', 'l2']},\n",
340 | " scoring=my_sparse_scoring)\n",
341 | "grid.fit(X, y)\n",
342 | "print(grid.best_params_)"
343 | ]
344 | }
345 | ],
346 | "metadata": {
347 | "kernelspec": {
348 | "display_name": "Python 2",
349 | "language": "python",
350 | "name": "python2"
351 | },
352 | "language_info": {
353 | "codemirror_mode": {
354 | "name": "ipython",
355 | "version": 2
356 | },
357 | "file_extension": ".py",
358 | "mimetype": "text/x-python",
359 | "name": "python",
360 | "nbconvert_exporter": "python",
361 | "pygments_lexer": "ipython2",
362 | "version": "2.7.10"
363 | }
364 | },
365 | "nbformat": 4,
366 | "nbformat_minor": 0
367 | }
368 |
--------------------------------------------------------------------------------
/figures/cluster_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/nyu_ml_lectures/3c5858870bd7177e1850fdd4c721af0115e6a258/figures/cluster_comparison.png
--------------------------------------------------------------------------------
/figures/pipeline.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/figures/randomized_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/nyu_ml_lectures/3c5858870bd7177e1850fdd4c721af0115e6a258/figures/randomized_search.png
--------------------------------------------------------------------------------
/figures/train_test_split.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/figures/train_validation_test2.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/outline.rst:
--------------------------------------------------------------------------------
1 | copy cross-validation and intro to sklearn and api back in.
2 |
3 | start off with grid-search for nearest neighbors
4 | then go into linear models.
5 | do exercises for classifiers plotting coefficients bla
6 | then SGD (partial fit?)
7 |
8 | then do grid-search over linear models for good measure
9 | do learning curves maybe?
10 |
11 | Do scaling and pipelines
12 |
--------------------------------------------------------------------------------
/plots/__init__.py:
--------------------------------------------------------------------------------
1 | from .plot_2d_separator import plot_2d_separator
2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \
3 | plot_regression_datasets, make_dataset
4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization
5 | from .plot_interactive_tree import plot_tree_interactive
6 | from .plot_interactive_forest import plot_forest_interactive
7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters
8 | from .plot_rbf_svm_parameters import plot_svm_interactive
9 |
10 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization',
11 | 'plot_linear_svc_regularization', 'plot_tree_interactive',
12 | 'plot_regression_datasets', 'make_dataset',
13 | "plot_forest_interactive", "plot_rbf_svm_parameters",
14 | "plot_svm_interactive"]
15 |
--------------------------------------------------------------------------------
/plots/plot_2d_separator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None):
6 | if eps is None:
7 | eps = X.std() / 2.
8 | x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
9 | y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps
10 | xx = np.linspace(x_min, x_max, 100)
11 | yy = np.linspace(y_min, y_max, 100)
12 |
13 | X1, X2 = np.meshgrid(xx, yy)
14 | X_grid = np.c_[X1.ravel(), X2.ravel()]
15 | try:
16 | decision_values = classifier.decision_function(X_grid)
17 | levels = [0]
18 | fill_levels = [decision_values.min(), 0, decision_values.max()]
19 | except AttributeError:
20 | # no decision_function
21 | decision_values = classifier.predict_proba(X_grid)[:, 1]
22 | levels = [.5]
23 | fill_levels = [0, .5, 1]
24 |
25 | if ax is None:
26 | ax = plt.gca()
27 | if fill:
28 | ax.contourf(X1, X2, decision_values.reshape(X1.shape),
29 | levels=fill_levels, colors=['blue', 'red'])
30 | else:
31 | ax.contour(X1, X2, decision_values.reshape(X1.shape), levels=levels,
32 | colors="black")
33 | ax.set_xlim(x_min, x_max)
34 | ax.set_ylim(y_min, y_max)
35 | ax.set_xticks(())
36 | ax.set_yticks(())
37 |
38 |
39 | if __name__ == '__main__':
40 | from sklearn.datasets import make_blobs
41 | from sklearn.linear_model import LogisticRegression
42 | X, y = make_blobs(centers=2, random_state=42)
43 | clf = LogisticRegression().fit(X, y)
44 | plot_2d_separator(clf, X, fill=True)
45 | plt.scatter(X[:, 0], X[:, 1], c=y)
46 | plt.show()
47 |
--------------------------------------------------------------------------------
/plots/plot_interactive_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import make_blobs
5 | from sklearn.ensemble import RandomForestClassifier
6 |
7 |
8 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
9 |
10 |
11 | def plot_forest(max_depth=1):
12 | plt.figure()
13 | ax = plt.gca()
14 | h = 0.02
15 |
16 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
17 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
18 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
19 |
20 | if max_depth != 0:
21 | forest = RandomForestClassifier(n_estimators=20, max_depth=max_depth,
22 | random_state=1).fit(X, y)
23 | Z = forest.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
24 | Z = Z.reshape(xx.shape)
25 | ax.contourf(xx, yy, Z, alpha=.4)
26 | ax.set_title("max_depth = %d" % max_depth)
27 | else:
28 | ax.set_title("data set")
29 | ax.scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
30 | ax.set_xlim(x_min, x_max)
31 | ax.set_ylim(y_min, y_max)
32 | ax.set_xticks(())
33 | ax.set_yticks(())
34 |
35 |
36 | def plot_forest_interactive():
37 | from IPython.html.widgets import interactive, IntSlider
38 | slider = IntSlider(min=0, max=8, step=1, value=0)
39 | return interactive(plot_forest, max_depth=slider)
40 |
--------------------------------------------------------------------------------
/plots/plot_interactive_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import make_blobs
5 | from sklearn.tree import DecisionTreeClassifier
6 |
7 | from sklearn.externals.six import StringIO # doctest: +SKIP
8 | from sklearn.tree import export_graphviz
9 | from scipy.misc import imread
10 | from scipy import ndimage
11 | import os
12 |
13 | GRAPHVIZ_PATH = r"C:\Program Files (x86)\Graphviz2.38\bin"
14 | if GRAPHVIZ_PATH not in os.environ['PATH']:
15 |     os.environ['PATH'] += ";" + GRAPHVIZ_PATH
16 |
17 | import re
18 |
19 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
20 |
21 |
22 | def tree_image(tree, fout=None):
23 | try:
24 | import pydot
25 |         import a_reliable_dot_rendering  # module does not exist: forces the ImportError fallback below
26 | except ImportError:
27 | return None
28 | dot_data = StringIO()
29 | export_graphviz(tree, out_file=dot_data)
30 | data = re.sub(r"gini = 0\.[0-9]+\\n", "", dot_data.getvalue())
31 | data = re.sub(r"samples = [0-9]+\\n", "", data)
32 | data = re.sub(r"\\nsamples = [0-9]+", "", data)
33 |
34 | graph = pydot.graph_from_dot_data(data)
35 | if fout is None:
36 | fout = "tmp.png"
37 | graph.write_png(fout)
38 | return imread(fout)
39 |
40 |
41 | def plot_tree(max_depth=1):
42 | fig, ax = plt.subplots(1, 2, figsize=(15, 7))
43 | h = 0.02
44 |
45 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
46 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
47 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
48 |
49 | if max_depth != 0:
50 | tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
51 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
52 | Z = Z.reshape(xx.shape)
53 | faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
54 | faces = faces.reshape(xx.shape)
55 | border = ndimage.laplace(faces) != 0
56 | ax[0].contourf(xx, yy, Z, alpha=.4)
57 | ax[0].scatter(xx[border], yy[border], marker='.', s=1)
58 | ax[0].set_title("max_depth = %d" % max_depth)
59 | img = tree_image(tree)
60 | if img is not None:
61 |             ax[1].imshow(img)
62 | ax[1].axis("off")
63 | else:
64 | ax[1].set_visible(False)
65 | else:
66 | ax[0].set_title("data set")
67 | ax[1].set_visible(False)
68 | ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
69 | ax[0].set_xlim(x_min, x_max)
70 | ax[0].set_ylim(y_min, y_max)
71 | ax[0].set_xticks(())
72 | ax[0].set_yticks(())
73 |
74 |
75 | def plot_tree_interactive():
76 | from IPython.html.widgets import interactive, IntSlider
77 | slider = IntSlider(min=0, max=8, step=1, value=0)
78 | return interactive(plot_tree, max_depth=slider)
79 |
--------------------------------------------------------------------------------
/plots/plot_kneighbors_regularization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.neighbors import KNeighborsRegressor
5 |
6 |
7 | def make_dataset(n_samples=100):
8 | rnd = np.random.RandomState(42)
9 | x = np.linspace(-3, 3, n_samples)
10 | y_no_noise = np.sin(4 * x) + x
11 | y = y_no_noise + rnd.normal(size=len(x))
12 | return x, y
13 |
14 |
15 | def plot_regression_datasets():
16 | fig, axes = plt.subplots(1, 3, figsize=(15, 5))
17 | for n_samples, ax in zip([10, 100, 1000], axes):
18 | x, y = make_dataset(n_samples)
19 | ax.plot(x, y, 'o', alpha=.6)
20 |
21 |
22 | def plot_kneighbors_regularization():
23 | rnd = np.random.RandomState(42)
24 | x = np.linspace(-3, 3, 100)
25 | y_no_noise = np.sin(4 * x) + x
26 | y = y_no_noise + rnd.normal(size=len(x))
27 | X = x[:, np.newaxis]
28 | fig, axes = plt.subplots(1, 3, figsize=(15, 5))
29 |
30 | x_test = np.linspace(-3, 3, 1000)
31 |
32 | for n_neighbors, ax in zip([2, 5, 20], axes.ravel()):
33 | kneighbor_regression = KNeighborsRegressor(n_neighbors=n_neighbors)
34 | kneighbor_regression.fit(X, y)
35 | ax.plot(x, y_no_noise, label="true function")
36 | ax.plot(x, y, "o", label="data")
37 | ax.plot(x_test, kneighbor_regression.predict(x_test[:, np.newaxis]),
38 | label="prediction")
39 | ax.legend(loc="best")
40 | ax.set_title("n_neighbors = %d" % n_neighbors)
41 |
42 | if __name__ == "__main__":
43 | plot_kneighbors_regularization()
44 | plt.show()
45 |
--------------------------------------------------------------------------------
/plots/plot_linear_svc_regularization.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.svm import SVC
4 | from sklearn.datasets import make_blobs
5 |
6 |
7 | def plot_linear_svc_regularization():
8 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
9 | fig, axes = plt.subplots(1, 3, figsize=(12, 4))
10 |
11 | # a carefully hand-designed dataset lol
12 | y[7] = 0
13 | y[27] = 0
14 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
15 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
16 |
17 | for ax, C in zip(axes, [1e-2, 1, 1e2]):
18 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
19 |
20 | svm = SVC(kernel='linear', C=C, tol=0.00001).fit(X, y)
21 | w = svm.coef_[0]
22 | a = -w[0] / w[1]
23 | xx = np.linspace(6, 13)
24 | yy = a * xx - (svm.intercept_[0]) / w[1]
25 | ax.plot(xx, yy, label="C = %.e" % C, c='k')
26 | ax.set_xlim(x_min, x_max)
27 | ax.set_ylim(y_min, y_max)
28 | ax.set_xticks(())
29 | ax.set_yticks(())
30 | ax.set_title("C = %f" % C)
31 |
32 | if __name__ == "__main__":
33 | plot_linear_svc_regularization()
34 | plt.show()
35 |
--------------------------------------------------------------------------------
/plots/plot_rbf_svm_parameters.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.svm import SVC
4 | from sklearn.datasets import make_blobs
6 | from .plot_2d_separator import plot_2d_separator
7 |
8 | def make_handcrafted_dataset():
9 | # a carefully hand-designed dataset lol
10 | X, y = make_blobs(centers=2, random_state=4, n_samples=30)
11 | y[np.array([7, 27])] = 0
12 | mask = np.ones(len(X), dtype=np.bool)
13 | mask[np.array([0, 1, 5, 26])] = 0
14 | X, y = X[mask], y[mask]
15 | return X, y
16 |
17 |
18 | def plot_rbf_svm_parameters():
19 | X, y = make_handcrafted_dataset()
20 |
21 |     fig, axes = plt.subplots(1, 4, figsize=(15, 3))  # one panel per C below
22 | for ax, C in zip(axes, [1e0, 5, 10, 100]):
23 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
24 |
25 | svm = SVC(kernel='rbf', C=C).fit(X, y)
26 | plot_2d_separator(svm, X, ax=ax, eps=.5)
27 | ax.set_title("C = %f" % C)
28 |
29 | fig, axes = plt.subplots(1, 4, figsize=(15, 3))
30 | for ax, gamma in zip(axes, [0.1, .5, 1, 10]):
31 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
32 | svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y)
33 | plot_2d_separator(svm, X, ax=ax, eps=.5)
34 | ax.set_title("gamma = %f" % gamma)
35 |
36 |
37 | def plot_svm(log_C, log_gamma):
38 | X, y = make_handcrafted_dataset()
39 | C = 10. ** log_C
40 | gamma = 10. ** log_gamma
41 | svm = SVC(kernel='rbf', C=C, gamma=gamma).fit(X, y)
42 | ax = plt.gca()
43 | plot_2d_separator(svm, X, ax=ax, eps=.5)
44 | # plot data
45 | ax.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])
46 | # plot support vectors
47 | sv = svm.support_vectors_
48 | ax.scatter(sv[:, 0], sv[:, 1], s=230, facecolors='none', zorder=10, linewidth=3)
49 | ax.set_title("C = %.4f gamma = %.4f" % (C, gamma))
50 |
51 |
52 | def plot_svm_interactive():
53 | from IPython.html.widgets import interactive, FloatSlider
54 | C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False)
55 | gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False)
56 | return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider)
57 |
--------------------------------------------------------------------------------
/solutions/cross_validation_iris.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn.cross_validation import StratifiedKFold, KFold, cross_val_score
3 | from sklearn.svm import LinearSVC
4 | 
5 | iris = load_iris()
6 | X, y = iris.data, iris.target
7 | 
8 | print(cross_val_score(LinearSVC(), X, y, cv=KFold(len(X), 3)))
9 | print(cross_val_score(LinearSVC(), X, y, cv=StratifiedKFold(y, 3)))
8 |
--------------------------------------------------------------------------------
/solutions/digits_unsupervised.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from sklearn.manifold import TSNE
3 | from sklearn.decomposition import NMF
4 | 
5 | # X, y (the digits data) are defined in the notebook
6 | 
4 | # Compute TSNE embedding
5 | tsne = TSNE()
6 | X_tsne = tsne.fit_transform(X)
7 |
8 | # Visualize TSNE results
9 | plt.figure()
10 | plt.title("All classes")
11 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
12 |
13 | # build an NMF factorization of the digits dataset
14 | nmf = NMF(n_components=16).fit(X)
15 |
16 | # visualize the components
17 | fig, axes = plt.subplots(4, 4)
18 | for ax, component in zip(axes.ravel(), nmf.components_):
19 | ax.imshow(component.reshape(8, 8), cmap="gray", interpolation="nearest")
20 |     ax.set_xticks(())
21 |     ax.set_yticks(())
22 |
--------------------------------------------------------------------------------
/solutions/forests.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | from sklearn.tree import DecisionTreeClassifier
5 | from sklearn.ensemble import RandomForestClassifier
6 | from sklearn.datasets import load_digits
7 | from sklearn.learning_curve import validation_curve
5 |
6 | digits = load_digits()
7 |
8 | def plot_validation_curve(parameter_values, train_scores, validation_scores):
9 | train_scores_mean = np.mean(train_scores, axis=1)
10 | train_scores_std = np.std(train_scores, axis=1)
11 | validation_scores_mean = np.mean(validation_scores, axis=1)
12 | validation_scores_std = np.std(validation_scores, axis=1)
13 |
14 | plt.fill_between(parameter_values, train_scores_mean - train_scores_std,
15 | train_scores_mean + train_scores_std, alpha=0.1,
16 | color="r")
17 | plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,
18 | validation_scores_mean + validation_scores_std, alpha=0.1, color="g")
19 | plt.plot(parameter_values, train_scores_mean, 'o-', color="r",
20 | label="Training score")
21 | plt.plot(parameter_values, validation_scores_mean, 'o-', color="g",
22 | label="Cross-validation score")
23 | plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)
24 | plt.legend(loc="best")
25 |
26 | param_range = range(1, 50)
27 | training_scores, validation_scores = validation_curve(DecisionTreeClassifier(), digits.data, digits.target,
28 | param_name="max_depth",
29 | param_range=param_range,
30 | cv=5)
31 | plt.figure()
32 | plot_validation_curve(param_range, training_scores, validation_scores)
33 |
34 | param_range = range(1, 20, 1)
35 | training_scores, validation_scores = validation_curve(RandomForestClassifier(n_estimators=100),
36 | digits.data, digits.target,
37 | param_name="max_features",
38 | param_range=param_range,
39 | cv=5)
40 | plt.figure()
41 | plot_validation_curve(param_range, training_scores, validation_scores)
42 |
--------------------------------------------------------------------------------
/solutions/grid_search_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.grid_search import GridSearchCV
4 | from sklearn.ensemble import RandomForestClassifier
5 | 
6 | # X_train, X_test, y_train, y_test are defined in the notebook
3 | param_grid = {'max_depth': [1, 3, 5, 7, 10], 'max_features': [5, 8, 10, 20]}
4 |
5 | grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
6 | grid.fit(X_train, y_train)
7 | print("best parameters: %s" % grid.best_params_)
8 | print("Training set accuracy: %s" % grid.score(X_train, y_train))
9 | print("Test set accuracy: %s" % grid.score(X_test, y_test))
10 |
11 | scores = [x.mean_validation_score for x in grid.grid_scores_]
12 | scores = np.array(scores).reshape(5, 4)
13 | plt.matshow(scores)
14 | plt.xlabel("max_features")
15 | plt.ylabel("max_depth")
16 |
--------------------------------------------------------------------------------
/solutions/grid_search_k_neighbors.py:
--------------------------------------------------------------------------------
1 | from sklearn.grid_search import GridSearchCV
2 | from sklearn.neighbors import KNeighborsClassifier
2 |
3 | param_grid = {'n_neighbors': [1, 3, 5, 7, 10]}
4 |
5 | grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid)
6 | grid.fit(X_train, y_train)
7 | print("best parameters: %s" % grid.best_params_)
8 | print("Training set accuracy: %s" % grid.score(X_train, y_train))
9 | print("Test set accuracy: %s" % grid.score(X_test, y_test))
10 |
--------------------------------------------------------------------------------
/solutions/linear_models.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | 
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 | from sklearn.grid_search import GridSearchCV
4 | from sklearn.datasets import load_digits
5 | from sklearn.cross_validation import train_test_split
6 | from sklearn.svm import LinearSVC
7 |
8 | digits = load_digits()
9 | X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target % 2)
10 |
11 | grid = GridSearchCV(LinearSVC(), param_grid={'C': np.logspace(-6, 2, 9)}, cv=5)
12 | grid.fit(X_train, y_train)
13 | pprint(grid.grid_scores_)
14 | pprint(grid.score(X_test, y_test))
15 |
16 |
17 | Cs = [10, 1, .01, 0.001, 0.0001]
18 | for penalty in ['l1', 'l2']:
19 | svm_models = {}
20 | training_scores = []
21 | test_scores = []
22 | for C in Cs:
23 | svm = LinearSVC(C=C, penalty=penalty, dual=False).fit(X_train, y_train)
24 | training_scores.append(svm.score(X_train, y_train))
25 | test_scores.append(svm.score(X_test, y_test))
26 | svm_models[C] = svm
27 |
28 | plt.figure()
29 | plt.plot(training_scores, label="training scores")
30 | plt.plot(test_scores, label="test scores")
31 |     plt.xticks(range(len(Cs)), Cs)
32 | plt.legend(loc="best")
33 |
34 | plt.figure(figsize=(10, 5))
35 | for i, C in enumerate(Cs):
36 | plt.plot(svm_models[C].coef_.ravel(), "o", label="C = %.2f" % C)
37 |
38 | plt.legend(loc="best")
39 |
--------------------------------------------------------------------------------
/solutions/load_iris.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import load_iris
5 | from sklearn.cross_validation import train_test_split
6 |
7 | iris = load_iris()
8 | X, y = iris.data, iris.target
9 |
10 | print("Dataset size: %d number of features: %d number of classes: %d"
11 | % (X.shape[0], X.shape[1], len(np.unique(y))))
12 |
13 | X_train, X_test, y_train, y_test = train_test_split(X, y)
14 |
15 | plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
16 | plt.figure()
17 | plt.scatter(X_train[:, 2], X_train[:, 3], c=y_train)
18 |
--------------------------------------------------------------------------------
/solutions/pipeline_iris.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.datasets import load_iris
3 | from sklearn.feature_selection import SelectKBest
4 | from sklearn.pipeline import make_pipeline
5 | from sklearn.svm import LinearSVC
6 | from sklearn.cross_validation import train_test_split
7 | from sklearn.grid_search import GridSearchCV
5 |
6 | rng = np.random.RandomState(42)
7 | iris = load_iris()
8 | X = np.hstack([iris.data, rng.uniform(size=(len(iris.data), 5))])
9 | X_train, X_test, y_train, y_test = train_test_split(X, iris.target, random_state=2)
10 |
11 | selection_pipe = make_pipeline(SelectKBest(), LinearSVC())
12 | param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3),
13 | 'selectkbest__k': [1, 2, 3, 4, 5, 7]}
14 | grid = GridSearchCV(selection_pipe, param_grid, cv=5)
15 | grid.fit(X_train, y_train)
16 | print("Best parameters: %s" % grid.best_params_)
17 | print("Test set performance: %s" % grid.score(X_test, y_test))
18 |
--------------------------------------------------------------------------------
/solutions/svms.py:
--------------------------------------------------------------------------------
1 | print("default score without scaling: %f" % SVC().fit(X_train, y_train).score(X_test, y_test))
2 |
3 | from sklearn.preprocessing import StandardScaler
4 | scaler = StandardScaler()
5 | X_train_scaled = scaler.fit_transform(X_train)
6 | X_test_scaled = scaler.transform(X_test)
7 | print("default score with scaling: %f" % SVC().fit(X_train_scaled, y_train).score(X_test_scaled, y_test))
8 |
9 | grid_search.fit(X_train_scaled, y_train)
10 |
11 | # We extract just the scores
12 | scores = [x[1] for x in grid_search.grid_scores_]
13 | scores = np.array(scores).reshape(6, 6)
14 |
15 | plt.matshow(scores)
16 | plt.xlabel('gamma')
17 | plt.ylabel('C')
18 | plt.colorbar()
19 | plt.xticks(np.arange(6), param_grid['gamma'])
20 | plt.yticks(np.arange(6), param_grid['C'])
21 |
--------------------------------------------------------------------------------
/solutions/train_iris.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn.neighbors import KNeighborsClassifier
3 | from sklearn.cross_validation import train_test_split
4 |
5 | iris = load_iris()
6 | X, y = iris.data, iris.target
7 |
8 | X_train, X_test, y_train, y_test = train_test_split(X, y)
9 |
10 | knn = KNeighborsClassifier(n_neighbors=3)
11 | knn.fit(X_train, y_train)
12 |
13 | print("test set score of knn: %f" % knn.score(X_test, y_test))
14 |
--------------------------------------------------------------------------------
/solutions/validation_curve.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | 
3 | # assumes X, y and the plot_validation_curve helper (defined in
4 | # solutions/forests.py) are already available in the notebook
3 | from sklearn.svm import LinearSVC
4 | from sklearn.neighbors import KNeighborsClassifier
5 | from sklearn.learning_curve import validation_curve
6 |
7 |
8 | cs = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
9 | training_scores, test_scores = validation_curve(LinearSVC(), X, y,
10 | param_name="C", param_range=cs)
11 | plt.figure()
12 | plot_validation_curve(range(7), training_scores, test_scores)
13 |
14 |
15 | ks = range(1, 11)  # n_neighbors must be at least 1
16 | training_scores, test_scores = validation_curve(KNeighborsClassifier(), X, y,
17 | param_name="n_neighbors", param_range=ks)
18 | plt.figure()
19 | plot_validation_curve(ks, training_scores, test_scores)
20 |
--------------------------------------------------------------------------------