├── .gitignore
├── 01 - Intro to Machine Learning.ipynb
├── 02 - First Steps.ipynb
├── 03 - Unsupervised Transformers.ipynb
├── 04 - API Summary.ipynb
├── 05 - Cross-validation.ipynb
├── 06 - Model Complexity.ipynb
├── 07 - Grid Searches for Hyper Parameters.ipynb
├── 08 - Preprocessing and Pipelines.ipynb
├── 09.1 - Linear models.ipynb
├── 09.2 - Support Vector Machines.ipynb
├── 09.3 - Trees and Forests.ipynb
├── 10 - Working With Text Data.ipynb
├── 11 - Out Of Core Learning.ipynb
├── LICENSE
├── README.md
├── data
├── test_with_solutions.csv
└── train.csv
├── figures
├── bag_of_words.svg
├── cluster_comparison.png
├── cross_validation.svg
├── data_representation.svg
├── feature_union.svg
├── grid_search_cross_validation.svg
├── hashing_vectorizer.svg
├── overfitting_underfitting_cartoon.svg
├── pipeline.svg
├── pipeline_cross_validation.svg
├── randomized_search.png
├── supervised_workflow.svg
├── train_test_split.svg
├── train_test_split_matrix.svg
├── train_validation_test2.svg
└── unsupervised_workflow.svg
├── plots
├── __init__.py
├── plot_2d_separator.py
├── plot_interactive_forest.py
├── plot_interactive_tree.py
├── plot_kneighbors_regularization.py
├── plot_linear_svc_regularization.py
└── plot_rbf_svm_parameters.py
└── solutions
├── cross_validation_iris.py
├── digits_tsne.py
├── forests.py
├── grid_search_forest.py
├── grid_search_k_neighbors.py
├── linear_models.py
├── load_iris.py
├── pipeline_iris.py
├── svms.py
├── text_pipeline.py
├── train_iris.py
└── validation_curve.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | .ipynb_checkpoints/
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 |
47 | # Translations
48 | *.mo
49 | *.pot
50 |
51 | # Django stuff:
52 | *.log
53 |
54 | # Sphinx documentation
55 | docs/_build/
56 |
57 | # PyBuilder
58 | target/
59 |
--------------------------------------------------------------------------------
/01 - Intro to Machine Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# What is machine learning?"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Supervised learning\n"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "source": [
23 | "<img src=\"figures/supervised_workflow.svg\" width=\"100%\">"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "# Data Representations"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "<img src=\"figures/data_representation.svg\" width=\"100%\">"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "# Dataset Split"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "
"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {},
57 | "source": []
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": false
64 | },
65 | "outputs": [],
66 | "source": [
67 | "% matplotlib nbagg\n",
68 | "import matplotlib.pyplot as plt\n",
69 | "import numpy as np"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [],
79 | "source": [
80 | "from sklearn.datasets import load_digits\n",
81 | "digits = load_digits()\n",
82 | "digits.keys()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "digits.images.shape"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [],
103 | "source": [
104 | "print(digits.images[0])"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [],
114 | "source": [
115 | "plt.matshow(digits.images[0], cmap=plt.cm.Greys)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "collapsed": false
123 | },
124 | "outputs": [],
125 | "source": [
126 | "digits.data.shape"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [],
136 | "source": [
137 | "digits.target.shape"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [],
147 | "source": [
148 | "digits.target"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "**Data is always a numpy array (or sparse matrix) of shape (n_samples, n_features)**"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "Splitting the data:"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": true
170 | },
171 | "outputs": [],
172 | "source": [
173 | "from sklearn.cross_validation import train_test_split\n",
174 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {},
180 | "source": [
181 | "# Exercises\n",
182 | "\n",
183 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n",
184 | "The function returns a dictionary-like object that has the same attributes as ``digits``.\n",
185 | "\n",
186 | "What is the number of classes, features and data points in this dataset?\n",
187 | "Use a scatterplot to visualize the dataset.\n",
188 | "\n",
189 | "You can look at the ``DESCR`` attribute to learn more about the dataset."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {
196 | "collapsed": true
197 | },
198 | "outputs": [],
199 | "source": [
200 | "# %load solutions/load_iris.py"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {
207 | "collapsed": true
208 | },
209 | "outputs": [],
210 | "source": []
211 | }
212 | ],
213 | "metadata": {
214 | "kernelspec": {
215 | "display_name": "Python 2",
216 | "language": "python",
217 | "name": "python2"
218 | },
219 | "language_info": {
220 | "codemirror_mode": {
221 | "name": "ipython",
222 | "version": 2
223 | },
224 | "file_extension": ".py",
225 | "mimetype": "text/x-python",
226 | "name": "python",
227 | "nbconvert_exporter": "python",
228 | "pygments_lexer": "ipython2",
229 | "version": "2.7.9"
230 | }
231 | },
232 | "nbformat": 4,
233 | "nbformat_minor": 0
234 | }
235 |
--------------------------------------------------------------------------------
/02 - First Steps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Get some data to play with"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.datasets import load_digits\n",
32 | "digits = load_digits()"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [],
42 | "source": [
43 | "from sklearn.cross_validation import train_test_split\n",
44 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n",
45 | " digits.target)"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "Really Simple API\n",
53 | "-------------------\n",
54 | "0) Import your model class"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "collapsed": false
62 | },
63 | "outputs": [],
64 | "source": [
65 | "from sklearn.svm import LinearSVC"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "1) Instantiate an object and set the parameters"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [],
82 | "source": [
83 | "svm = LinearSVC()"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "2) Fit the model"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {
97 | "collapsed": false
98 | },
99 | "outputs": [],
100 | "source": [
101 | "svm.fit(X_train, y_train)"
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {},
107 | "source": [
108 | "3) Apply / evaluate"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [],
118 | "source": [
119 | "print(svm.predict(X_train))\n",
120 | "print(y_train)"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {
127 | "collapsed": false
128 | },
129 | "outputs": [],
130 | "source": [
131 | "svm.score(X_train, y_train)"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [],
141 | "source": [
142 | "np.sum(svm.predict(X_train) == y_train) / float(len(X_train))"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "svm.score(X_test, y_test)"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "And again\n",
161 | "---------"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "collapsed": false
169 | },
170 | "outputs": [],
171 | "source": [
172 | "from sklearn.ensemble import RandomForestClassifier"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "collapsed": false
180 | },
181 | "outputs": [],
182 | "source": [
183 | "rf = RandomForestClassifier(n_estimators=50)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "rf.fit(X_train, y_train)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "collapsed": false
202 | },
203 | "outputs": [],
204 | "source": [
205 | "rf.score(X_test, y_test)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {
212 | "collapsed": false
213 | },
214 | "outputs": [],
215 | "source": [
216 | "%load https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/examples/classification/plot_classifier_comparison.py"
217 | ]
218 | },
219 | {
220 | "cell_type": "markdown",
221 | "metadata": {},
222 | "source": [
223 | "# Exercises\n",
224 | "Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.\n",
225 | "\n",
226 | "Split it into training and test set using ``train_test_split``.\n",
227 | "Then train and evaluate a classifier of your choice.\n"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "collapsed": false
235 | },
236 | "outputs": [],
237 | "source": [
238 | "# %load solutions/train_iris.py"
239 | ]
240 | }
241 | ],
242 | "metadata": {
243 | "kernelspec": {
244 | "display_name": "Python 2",
245 | "language": "python",
246 | "name": "python2"
247 | },
248 | "language_info": {
249 | "codemirror_mode": {
250 | "name": "ipython",
251 | "version": 2
252 | },
253 | "file_extension": ".py",
254 | "mimetype": "text/x-python",
255 | "name": "python",
256 | "nbconvert_exporter": "python",
257 | "pygments_lexer": "ipython2",
258 | "version": "2.7.9"
259 | }
260 | },
261 | "nbformat": 4,
262 | "nbformat_minor": 0
263 | }
264 |
--------------------------------------------------------------------------------
/03 - Unsupervised Transformers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "<img src=\"figures/unsupervised_workflow.svg\" width=\"100%\">"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.datasets import load_digits\n",
32 | "from sklearn.cross_validation import train_test_split\n",
33 | "import numpy as np\n",
34 | "np.set_printoptions(suppress=True)\n",
35 | "\n",
36 | "digits = load_digits()\n",
37 | "X, y = digits.data, digits.target\n",
38 | "X_train, X_test, y_train, y_test = train_test_split(X, y)"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "Removing mean and scaling variance\n",
46 | "==================================="
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "from sklearn.preprocessing import StandardScaler"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "1) Instantiate the model"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "scaler = StandardScaler()"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "2) Fit using only the data."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [],
92 | "source": [
93 | "scaler.fit(X_train)"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "3) `transform` the data (not `predict`)."
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [],
110 | "source": [
111 | "X_train_scaled = scaler.transform(X_train)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "X_train.shape"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [],
132 | "source": [
133 | "X_train_scaled.shape"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "The transformed version of the data has the mean removed:"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "collapsed": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "X_train_scaled.mean(axis=0)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "X_train_scaled.std(axis=0)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [],
172 | "source": [
173 | "X_test_transformed = scaler.transform(X_test)"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "Principal Component Analysis\n",
181 | "============================="
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "metadata": {},
187 | "source": [
188 | "0) Import the model"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {
195 | "collapsed": false
196 | },
197 | "outputs": [],
198 | "source": [
199 | "from sklearn.decomposition import PCA"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "1) Instantiate the model"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {
213 | "collapsed": false
214 | },
215 | "outputs": [],
216 | "source": [
217 | "pca = PCA(n_components=2)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "2) Fit to training data"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {
231 | "collapsed": false
232 | },
233 | "outputs": [],
234 | "source": [
235 | "pca.fit(X)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "3) Transform to lower-dimensional representation"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [],
252 | "source": [
253 | "print(X.shape)\n",
254 | "X_pca = pca.transform(X)\n",
255 | "X_pca.shape"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "metadata": {},
261 | "source": [
262 | "Visualize\n",
263 | "----------"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {
270 | "collapsed": false
271 | },
272 | "outputs": [],
273 | "source": [
274 | "plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "source": [
283 | "Manifold Learning\n",
284 | "=================="
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {
291 | "collapsed": true
292 | },
293 | "outputs": [],
294 | "source": [
295 | "from sklearn.manifold import Isomap\n",
296 | "isomap = Isomap()"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": null,
302 | "metadata": {
303 | "collapsed": false
304 | },
305 | "outputs": [],
306 | "source": [
307 | "X_isomap = isomap.fit_transform(X)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": null,
313 | "metadata": {
314 | "collapsed": false
315 | },
316 | "outputs": [],
317 | "source": [
318 | "plt.figure()\n",
319 | "plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {
325 | "collapsed": true
326 | },
327 | "source": [
328 | "# Exercises\n",
329 | "Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).\n"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {
336 | "collapsed": false
337 | },
338 | "outputs": [],
339 | "source": [
340 | "# %load solutions/digits_tsne.py"
341 | ]
342 | }
343 | ],
344 | "metadata": {
345 | "kernelspec": {
346 | "display_name": "Python 2",
347 | "language": "python",
348 | "name": "python2"
349 | },
350 | "language_info": {
351 | "codemirror_mode": {
352 | "name": "ipython",
353 | "version": 2
354 | },
355 | "file_extension": ".py",
356 | "mimetype": "text/x-python",
357 | "name": "python",
358 | "nbconvert_exporter": "python",
359 | "pygments_lexer": "ipython2",
360 | "version": "2.7.9"
361 | }
362 | },
363 | "nbformat": 4,
364 | "nbformat_minor": 0
365 | }
366 |
--------------------------------------------------------------------------------
/04 - API Summary.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# A recap on Scikit-learn's estimator interface\n"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "``X`` : data, 2d numpy array or scipy sparse matrix of shape (n_samples, n_features)\n",
15 | "\n",
16 | "``y`` : targets, 1d numpy array of shape (n_samples,)"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "## Methods"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "<table>\n",
31 | "<tr><th colspan=\"2\">``model.fit(X_train, [y_train])``</th></tr>\n",
32 | "<tr><th>``model.predict(X_test)``</th><th>``model.transform(X_test)``</th></tr>\n",
33 | "<tr><td>Classification</td><td>Preprocessing</td></tr>\n",
34 | "<tr><td>Regression</td><td>Dimensionality Reduction</td></tr>\n",
35 | "<tr><td>Clustering</td><td>Feature Extraction</td></tr>\n",
36 | "<tr><td></td><td>Feature selection</td></tr>\n",
37 | "</table>"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "## Efficient alternatives, methods for models that don't generalize\n",
45 | "``model.fit_predict(X)`` (clustering)\n",
46 | "\n",
47 | "``model.fit_transform(X)`` (manifold learning)"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Additional methods\n",
55 | "__Model evaluation__ : ``score(X, [y])``\n",
56 | "\n",
57 | "__Uncertainties from Classifiers__: ``decision_function(X)`` and ``predict_proba(X)``."
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "## Attributes\n",
65 | "__Classifiers__: ``classes_``\n",
66 | "\n",
67 | "__Clustering__: ``labels_``\n",
68 | "\n",
69 | "__Manifold Learning__: ``embedding_``\n",
70 | "\n",
71 | "__Linear models__: ``coef_``\n",
72 | "\n",
73 | "__Linear Decompositions__: ``components_``"
74 | ]
75 | }
76 | ],
77 | "metadata": {
78 | "kernelspec": {
79 | "display_name": "Python 2",
80 | "language": "python",
81 | "name": "python2"
82 | },
83 | "language_info": {
84 | "codemirror_mode": {
85 | "name": "ipython",
86 | "version": 2
87 | },
88 | "file_extension": ".py",
89 | "mimetype": "text/x-python",
90 | "name": "python",
91 | "nbconvert_exporter": "python",
92 | "pygments_lexer": "ipython2",
93 | "version": "2.7.9"
94 | }
95 | },
96 | "nbformat": 4,
97 | "nbformat_minor": 0
98 | }
99 |
--------------------------------------------------------------------------------
/05 - Cross-validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Cross-Validation\n",
21 | "----------------------------------------"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "<img src=\"figures/cross_validation.svg\" width=\"100%\">"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_iris"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [],
49 | "source": [
50 | "iris = load_iris()\n",
51 | "X = iris.data\n",
52 | "y = iris.target"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [],
62 | "source": [
63 | "from sklearn.cross_validation import cross_val_score\n",
64 | "from sklearn.svm import LinearSVC"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "cross_val_score(LinearSVC(), X, y, cv=5)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [],
85 | "source": [
86 | "cross_val_score(LinearSVC(), X, y, cv=5, scoring=\"f1_macro\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "Let's go to a binary task for a moment"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [],
103 | "source": [
104 | "y % 2"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [],
114 | "source": [
115 | "cross_val_score(LinearSVC(), X, y % 2)"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "collapsed": false
123 | },
124 | "outputs": [],
125 | "source": [
126 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"average_precision\")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [],
136 | "source": [
137 | "cross_val_score(LinearSVC(), X, y % 2, scoring=\"roc_auc\")"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": null,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [],
147 | "source": [
148 | "from sklearn.metrics.scorer import SCORERS\n",
149 | "print(SCORERS.keys())"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "Implementing your own scoring metric:"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": null,
162 | "metadata": {
163 | "collapsed": false
164 | },
165 | "outputs": [],
166 | "source": [
167 | "def my_accuracy_scoring(est, X, y):\n",
168 | " return np.mean(est.predict(X) == y)\n",
169 | "\n",
170 | "cross_val_score(LinearSVC(), X, y, scoring=my_accuracy_scoring)"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "metadata": {
177 | "collapsed": true
178 | },
179 | "outputs": [],
180 | "source": [
181 | "def my_super_scoring(est, X, y):\n",
182 | " return np.mean(est.predict(X) == y) - np.mean(est.coef_ != 0)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {
189 | "collapsed": false
190 | },
191 | "outputs": [],
192 | "source": [
193 | "from sklearn.grid_search import GridSearchCV\n",
194 | "\n",
195 | "y = iris.target\n",
196 | "grid = GridSearchCV(LinearSVC(C=.01, dual=False),\n",
197 | " param_grid={'penalty' : ['l1', 'l2']},\n",
198 | " scoring=my_super_scoring)\n",
199 | "grid.fit(X, y)\n",
200 | "print(grid.best_params_)"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "There are other ways to do cross-validation"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {
214 | "collapsed": false
215 | },
216 | "outputs": [],
217 | "source": [
218 | "from sklearn.cross_validation import ShuffleSplit\n",
219 | "\n",
220 | "shuffle_split = ShuffleSplit(len(X), 10, test_size=.4)\n",
221 | "cross_val_score(LinearSVC(), X, y, cv=shuffle_split)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "collapsed": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit\n",
233 | "\n",
234 | "def plot_cv(cv, n_samples):\n",
235 | " masks = []\n",
236 | " for train, test in cv:\n",
237 | " mask = np.zeros(n_samples, dtype=bool)\n",
238 | " mask[test] = 1\n",
239 | " masks.append(mask)\n",
240 | " plt.matshow(masks)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {
247 | "collapsed": false
248 | },
249 | "outputs": [],
250 | "source": [
251 | "plot_cv(StratifiedKFold(y, n_folds=5), len(y))"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {
258 | "collapsed": false
259 | },
260 | "outputs": [],
261 | "source": [
262 | "plot_cv(KFold(len(iris.target), n_folds=5), len(iris.target))"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {
269 | "collapsed": false
270 | },
271 | "outputs": [],
272 | "source": [
273 | "plot_cv(ShuffleSplit(len(iris.target), n_iter=20, test_size=.2), \n",
274 | " len(iris.target))"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "source": [
283 | "# Exercises\n",
284 | "Use KFold cross validation and StratifiedKFold cross validation (3 or 5 folds) for LinearSVC on the iris dataset.\n",
285 | "Why are the results so different? How could you get more similar results?"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "metadata": {
292 | "collapsed": false
293 | },
294 | "outputs": [],
295 | "source": [
296 | "# %load solutions/cross_validation_iris.py"
297 | ]
298 | }
299 | ],
300 | "metadata": {
301 | "kernelspec": {
302 | "display_name": "Python 2",
303 | "language": "python",
304 | "name": "python2"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 2
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython2",
316 | "version": "2.7.9"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 0
321 | }
322 |
--------------------------------------------------------------------------------
/06 - Model Complexity.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import matplotlib.pyplot as plt\n",
12 | "import numpy as np\n",
13 | "%matplotlib nbagg"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Model Complexity, Overfitting and Underfitting\n"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from plots import plot_kneighbors_regularization\n",
32 | "plot_kneighbors_regularization()"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | ""
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "source": [
48 | "# Validation Curves"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "from sklearn.datasets import load_digits\n",
60 | "from sklearn.ensemble import RandomForestClassifier\n",
61 | "from sklearn.learning_curve import validation_curve"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "digits = load_digits()\n",
73 | "X, y = digits.data, digits.target"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [],
83 | "source": [
84 | "model = RandomForestClassifier(n_estimators=20)\n",
85 | "param_range = range(1, 13)\n",
86 | "training_scores, validation_scores = validation_curve(model, X, y,\n",
87 | " param_name=\"max_depth\",\n",
88 | " param_range=param_range, cv=5)"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {
95 | "collapsed": false
96 | },
97 | "outputs": [],
98 | "source": [
99 | "training_scores.shape"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": true
107 | },
108 | "outputs": [],
109 | "source": [
110 | "def plot_validation_curve(parameter_values, train_scores, validation_scores):\n",
111 | " train_scores_mean = np.mean(train_scores, axis=1)\n",
112 | " train_scores_std = np.std(train_scores, axis=1)\n",
113 | " validation_scores_mean = np.mean(validation_scores, axis=1)\n",
114 | " validation_scores_std = np.std(validation_scores, axis=1)\n",
115 | "\n",
116 | " plt.fill_between(parameter_values, train_scores_mean - train_scores_std,\n",
117 | " train_scores_mean + train_scores_std, alpha=0.1,\n",
118 | " color=\"r\")\n",
119 | " plt.fill_between(parameter_values, validation_scores_mean - validation_scores_std,\n",
120 | " validation_scores_mean + validation_scores_std, alpha=0.1, color=\"g\")\n",
121 | " plt.plot(parameter_values, train_scores_mean, 'o-', color=\"r\",\n",
122 | " label=\"Training score\")\n",
123 | " plt.plot(parameter_values, validation_scores_mean, 'o-', color=\"g\",\n",
124 | " label=\"Cross-validation score\")\n",
125 | " plt.ylim(validation_scores_mean.min() - .1, train_scores_mean.max() + .1)\n",
126 | " plt.legend(loc=\"best\")"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [],
136 | "source": [
137 | "plt.figure()\n",
138 | "plot_validation_curve(param_range, training_scores, validation_scores)"
139 | ]
140 | },
141 | {
142 | "cell_type": "markdown",
143 | "metadata": {},
144 | "source": [
145 | "# Exercise\n",
146 | "\n",
147 | "Plot the validation curve on the digits dataset for:\n",
148 | "* a LinearSVC with a logarithmic range of regularization parameters ``C``.\n",
149 | "* KNeighborsClassifier with a linear range of neighbors ``k``.\n",
150 | "\n",
151 | "What do you expect them to look like? What do they actually look like?"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "# %load solutions/validation_curve.py"
163 | ]
164 | }
165 | ],
166 | "metadata": {
167 | "kernelspec": {
168 | "display_name": "Python 2",
169 | "language": "python",
170 | "name": "python2"
171 | },
172 | "language_info": {
173 | "codemirror_mode": {
174 | "name": "ipython",
175 | "version": 2
176 | },
177 | "file_extension": ".py",
178 | "mimetype": "text/x-python",
179 | "name": "python",
180 | "nbconvert_exporter": "python",
181 | "pygments_lexer": "ipython2",
182 | "version": "2.7.9"
183 | }
184 | },
185 | "nbformat": 4,
186 | "nbformat_minor": 0
187 | }
188 |
--------------------------------------------------------------------------------
/07 - Grid Searches for Hyper Parameters.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Grid Searches\n",
21 | "================="
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "![grid search cross-validation](figures/grid_search_cross_validation.svg)"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "Grid-Search with built-in cross-validation"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [],
45 | "source": [
46 | "from sklearn.grid_search import GridSearchCV\n",
47 | "from sklearn.svm import SVC"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [],
57 | "source": [
58 | "from sklearn.datasets import load_digits\n",
59 | "from sklearn.cross_validation import train_test_split\n",
60 | "digits = load_digits()\n",
61 | "X_train, X_test, y_train, y_test = train_test_split(digits.data,\n",
62 | " digits.target)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "Define parameter grid:"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [],
79 | "source": [
80 | "import numpy as np\n",
81 | "\n",
82 | "param_grid = {'C': 10. ** np.arange(-3, 3),\n",
83 | " 'gamma' : 10. ** np.arange(-5, 0)}\n",
84 | "\n",
85 | "np.set_printoptions(suppress=True)\n",
86 | "print(param_grid)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "grid_search = GridSearchCV(SVC(), param_grid, verbose=3)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "A GridSearchCV object behaves just like a normal classifier."
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": false,
112 | "scrolled": true
113 | },
114 | "outputs": [],
115 | "source": [
116 | "grid_search.fit(X_train, y_train)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "collapsed": false
124 | },
125 | "outputs": [],
126 | "source": [
127 | "grid_search.predict(X_test)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [],
137 | "source": [
138 | "grid_search.score(X_test, y_test)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [],
148 | "source": [
149 | "grid_search.best_params_"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [],
159 | "source": [
160 | "# We extract just the scores\n",
161 | "\n",
162 | "scores = [x.mean_validation_score for x in grid_search.grid_scores_]\n",
163 | "scores = np.array(scores).reshape(6, 5)\n",
164 | "\n",
165 | "plt.matshow(scores)\n",
166 | "plt.xlabel('gamma')\n",
167 | "plt.ylabel('C')\n",
168 | "plt.colorbar()\n",
169 | "plt.xticks(np.arange(5), param_grid['gamma'])\n",
170 | "plt.yticks(np.arange(6), param_grid['C']);"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "Nested Cross-validation in scikit-learn:"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {
184 | "collapsed": false
185 | },
186 | "outputs": [],
187 | "source": [
188 | "from sklearn.cross_validation import cross_val_score\n",
189 | "cross_val_score(GridSearchCV(SVC(), param_grid),\n",
190 | " digits.data, digits.target)"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {
196 | "collapsed": true
197 | },
198 | "source": [
199 | "# Exercises\n",
200 | "Use GridSearchCV to adjust n_neighbors of KNeighborsClassifier.\n"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {
207 | "collapsed": false
208 | },
209 | "outputs": [],
210 | "source": [
211 | "# %load solutions/grid_search_k_neighbors.py"
212 | ]
213 | }
214 | ],
215 | "metadata": {
216 | "kernelspec": {
217 | "display_name": "Python 2",
218 | "language": "python",
219 | "name": "python2"
220 | },
221 | "language_info": {
222 | "codemirror_mode": {
223 | "name": "ipython",
224 | "version": 2
225 | },
226 | "file_extension": ".py",
227 | "mimetype": "text/x-python",
228 | "name": "python",
229 | "nbconvert_exporter": "python",
230 | "pygments_lexer": "ipython2",
231 | "version": "2.7.10"
232 | }
233 | },
234 | "nbformat": 4,
235 | "nbformat_minor": 0
236 | }
237 |
--------------------------------------------------------------------------------
/08 - Preprocessing and Pipelines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "Preprocessing and Pipelines\n",
21 | "============================="
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "![pipeline](figures/pipeline.svg)"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from sklearn.datasets import load_digits\n",
40 | "from sklearn.cross_validation import train_test_split\n",
41 | "digits = load_digits()\n",
42 | "X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "For cross-validated pipelines that include scaling, we need to estimate the mean and standard deviation separately for each fold.\n",
50 | "To do that, we build a pipeline."
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "collapsed": false
58 | },
59 | "outputs": [],
60 | "source": [
61 | "from sklearn.pipeline import Pipeline, make_pipeline\n",
62 | "from sklearn.svm import SVC\n",
63 | "from sklearn.preprocessing import StandardScaler"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {
70 | "collapsed": false
71 | },
72 | "outputs": [],
73 | "source": [
74 | "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"svm\", SVC())])\n",
75 | "# or for short:\n",
76 | "make_pipeline(StandardScaler(), SVC())"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [],
86 | "source": [
87 | "pipeline.fit(X_train, y_train)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [],
97 | "source": [
98 | "pipeline.predict(X_test)"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "![pipeline cross-validation](figures/pipeline_cross_validation.svg)"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Cross-validation with a pipeline\n",
113 | "---------------------------------"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "collapsed": false
121 | },
122 | "outputs": [],
123 | "source": [
124 | "from sklearn.cross_validation import cross_val_score\n",
125 | "cross_val_score(pipeline, X_train, y_train)"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "Grid Search with a pipeline\n",
133 | "==========================="
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "collapsed": false
141 | },
142 | "outputs": [],
143 | "source": [
144 | "from sklearn.grid_search import GridSearchCV\n",
145 | "\n",
146 | "param_grid = {'svm__C': 10. ** np.arange(-3, 3),\n",
147 | " 'svm__gamma' : 10. ** np.arange(-3, 3)}\n",
148 | "\n",
149 | "grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [],
159 | "source": [
160 | "grid_pipeline.fit(X_train, y_train)"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {
167 | "collapsed": false
168 | },
169 | "outputs": [],
170 | "source": [
171 | "grid_pipeline.score(X_test, y_test)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {
177 | "collapsed": false
178 | },
179 | "source": [
180 | "# Exercises\n",
181 | "Add random features to the iris dataset using ``np.random.uniform`` and ``np.hstack``.\n",
182 | "\n",
183 | "Build a pipeline using the SelectKBest univariate feature selection from the sklearn.feature_selection module and the LinearSVC on the iris dataset.\n",
184 | "\n",
185 | "Use GridSearchCV to adjust C and the number of features selected in SelectKBest."
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "collapsed": false,
193 | "scrolled": true
194 | },
195 | "outputs": [],
196 | "source": [
197 | "# %load solutions/pipeline_iris.py"
198 | ]
199 | }
200 | ],
201 | "metadata": {
202 | "kernelspec": {
203 | "display_name": "Python 2",
204 | "language": "python",
205 | "name": "python2"
206 | },
207 | "language_info": {
208 | "codemirror_mode": {
209 | "name": "ipython",
210 | "version": 2
211 | },
212 | "file_extension": ".py",
213 | "mimetype": "text/x-python",
214 | "name": "python",
215 | "nbconvert_exporter": "python",
216 | "pygments_lexer": "ipython2",
217 | "version": "2.7.9"
218 | }
219 | },
220 | "nbformat": 4,
221 | "nbformat_minor": 0
222 | }
223 |
--------------------------------------------------------------------------------
/09.1 - Linear models.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import numpy as np\n",
13 | "import matplotlib.pyplot as plt"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Linear models for regression"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "\n",
28 | "```\n",
29 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_\n",
30 | "```"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [],
40 | "source": [
41 | "from sklearn.datasets import make_regression\n",
42 | "from sklearn.cross_validation import train_test_split\n",
43 | "\n",
44 | "X, y, true_coefficient = make_regression(n_samples=80, n_features=30, n_informative=10, noise=100, coef=True, random_state=5)\n",
45 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)\n",
46 | "print(X_train.shape)\n",
47 | "print(y_train.shape)"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "## Linear Regression\n",
55 | "\n",
56 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 $$"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": false,
64 | "scrolled": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "from sklearn.linear_model import LinearRegression\n",
69 | "linear_regression = LinearRegression().fit(X_train, y_train)\n",
70 | "print(\"R^2 on training set: %f\" % linear_regression.score(X_train, y_train))\n",
71 | "print(\"R^2 on test set: %f\" % linear_regression.score(X_test, y_test))"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [],
81 | "source": [
82 | "from sklearn.metrics import r2_score\n",
83 | "print(r2_score(np.dot(X, true_coefficient), y))"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "plt.figure(figsize=(10, 5))\n",
95 | "coefficient_sorting = np.argsort(true_coefficient)[::-1]\n",
96 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\")\n",
97 | "plt.plot(linear_regression.coef_[coefficient_sorting], \"o\", label=\"linear regression\")\n",
98 | "\n",
99 | "plt.legend()"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "## Ridge Regression (L2 penalty)\n",
107 | "\n",
108 | "$$ \\text{min}_{w,b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_2^2$$ "
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [],
118 | "source": [
119 | "from sklearn.linear_model import Ridge\n",
120 | "ridge_models = {}\n",
121 | "training_scores = []\n",
122 | "test_scores = []\n",
123 | "\n",
124 | "for alpha in [100, 10, 1, .01]:\n",
125 | " ridge = Ridge(alpha=alpha).fit(X_train, y_train)\n",
126 | " training_scores.append(ridge.score(X_train, y_train))\n",
127 | " test_scores.append(ridge.score(X_test, y_test))\n",
128 | " ridge_models[alpha] = ridge\n",
129 | "\n",
130 | "plt.figure()\n",
131 | "plt.plot(training_scores, label=\"training scores\")\n",
132 | "plt.plot(test_scores, label=\"test scores\")\n",
133 | "plt.xticks(range(4), [100, 10, 1, .01])\n",
134 | "plt.legend(loc=\"best\")"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {
141 | "collapsed": false
142 | },
143 | "outputs": [],
144 | "source": [
145 | "plt.figure(figsize=(10, 5))\n",
146 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n",
147 | "\n",
148 | "for i, alpha in enumerate([100, 10, 1, .01]):\n",
149 | " plt.plot(ridge_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n",
150 | " \n",
151 | "plt.legend(loc=\"best\")"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "## Lasso (L1 penalty)\n",
159 | "$$ \\text{min}_{w, b} \\sum_i || w^\\mathsf{T}x_i + b - y_i||^2 + \\alpha ||w||_1$$ "
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "collapsed": false
167 | },
168 | "outputs": [],
169 | "source": [
170 | "from sklearn.linear_model import Lasso\n",
171 | "\n",
172 | "lasso_models = {}\n",
173 | "training_scores = []\n",
174 | "test_scores = []\n",
175 | "\n",
176 | "for alpha in [30, 10, 1, .01]:\n",
177 | " lasso = Lasso(alpha=alpha).fit(X_train, y_train)\n",
178 | " training_scores.append(lasso.score(X_train, y_train))\n",
179 | " test_scores.append(lasso.score(X_test, y_test))\n",
180 | " lasso_models[alpha] = lasso\n",
181 | "plt.figure()\n",
182 | "plt.plot(training_scores, label=\"training scores\")\n",
183 | "plt.plot(test_scores, label=\"test scores\")\n",
184 | "plt.xticks(range(4), [30, 10, 1, .01])\n",
185 | "plt.legend(loc=\"best\")"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {
192 | "collapsed": false
193 | },
194 | "outputs": [],
195 | "source": [
196 | "plt.figure(figsize=(10, 5))\n",
197 | "plt.plot(true_coefficient[coefficient_sorting], \"o\", label=\"true\", c='b')\n",
198 | "\n",
199 | "for i, alpha in enumerate([30, 10, 1, .01]):\n",
200 | " plt.plot(lasso_models[alpha].coef_[coefficient_sorting], \"o\", label=\"alpha = %.2f\" % alpha, c=plt.cm.summer(i / 3.))\n",
201 | " \n",
202 | "plt.legend(loc=\"best\")"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "## Linear models for classification"
210 | ]
211 | },
212 | {
213 | "cell_type": "markdown",
214 | "metadata": {},
215 | "source": [
216 | "\n",
217 | "```\n",
218 | "y_pred = x_test[0] * coef_[0] + ... + x_test[n_features-1] * coef_[n_features-1] + intercept_ > 0\n",
219 | "```"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "The influence of C in LinearSVC"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [],
236 | "source": [
237 | "from plots import plot_linear_svc_regularization\n",
238 | "plot_linear_svc_regularization()"
239 | ]
240 | },
241 | {
242 | "cell_type": "markdown",
243 | "metadata": {},
244 | "source": [
245 | "## Multi-Class linear classification"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {
252 | "collapsed": false
253 | },
254 | "outputs": [],
255 | "source": [
256 | "from sklearn.datasets import make_blobs\n",
257 | "plt.figure()\n",
258 | "X, y = make_blobs(random_state=42)\n",
259 | "plt.scatter(X[:, 0], X[:, 1], c=y)"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "collapsed": false
267 | },
268 | "outputs": [],
269 | "source": [
270 | "from sklearn.svm import LinearSVC\n",
271 | "linear_svm = LinearSVC().fit(X, y)\n",
272 | "print(linear_svm.coef_.shape)\n",
273 | "print(linear_svm.intercept_.shape)"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "outputs": [],
283 | "source": [
284 | "plt.scatter(X[:, 0], X[:, 1], c=y)\n",
285 | "line = np.linspace(-15, 15)\n",
286 | "for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):\n",
287 | " plt.plot(line, -(line * coef[0] + intercept) / coef[1])\n",
288 | "plt.ylim(-10, 15)\n",
289 | "plt.xlim(-10, 8)"
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "metadata": {},
295 | "source": [
296 | "# Exercises"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "* Use GridSearchCV to tune the parameter C of LinearSVC on the digits dataset.\n",
304 | "* Compare l1 penalty and l2 penalty by plotting the coefficients as above for the digits dataset. Classify odd vs even digits to make it a binary task."
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "collapsed": false
312 | },
313 | "outputs": [],
314 | "source": [
315 | "# %load solutions/linear_models.py"
316 | ]
317 | }
318 | ],
319 | "metadata": {
320 | "kernelspec": {
321 | "display_name": "Python 2",
322 | "language": "python",
323 | "name": "python2"
324 | },
325 | "language_info": {
326 | "codemirror_mode": {
327 | "name": "ipython",
328 | "version": 2
329 | },
330 | "file_extension": ".py",
331 | "mimetype": "text/x-python",
332 | "name": "python",
333 | "nbconvert_exporter": "python",
334 | "pygments_lexer": "ipython2",
335 | "version": "2.7.10"
336 | }
337 | },
338 | "nbformat": 4,
339 | "nbformat_minor": 0
340 | }
341 |
--------------------------------------------------------------------------------
/09.2 - Support Vector Machines.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib nbagg\n",
12 | "import numpy as np\n",
13 | "import matplotlib.pyplot as plt"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Support Vector Machines"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.datasets import load_digits\n",
32 | "from sklearn.cross_validation import train_test_split\n",
33 | "\n",
34 | "digits = load_digits()\n",
35 | "X_train, X_test, y_train, y_test = train_test_split(digits.data / 16., digits.target % 2, random_state=2)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [],
45 | "source": [
46 | "from sklearn.svm import LinearSVC, SVC\n",
47 | "linear_svc = LinearSVC(loss=\"hinge\").fit(X_train, y_train)\n",
48 | "svc = SVC(kernel=\"linear\").fit(X_train, y_train)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [],
58 | "source": [
59 | "np.mean(linear_svc.predict(X_test) == svc.predict(X_test))"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## Kernel SVMs\n",
67 | "\n",
68 | "\n",
69 | "Predictions in a kernel-SVM are made using the formula\n",
70 | "\n",
71 | "$$\n",
72 | "\\hat{y} = \\alpha_0 + \\alpha_1 y_1 k(\\mathbf{x^{(1)}}, \\mathbf{x}) + ... + \\alpha_n y_n k(\\mathbf{x^{(n)}}, \\mathbf{x})> 0\n",
73 | "$$\n",
74 | "\n",
75 | "$$\n",
76 | "0 \\leq \\alpha_i \\leq C\n",
77 | "$$\n",
78 | "\n"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "Radial basis function (Gaussian) kernel:\n",
86 | "$$k(\\mathbf{x}, \\mathbf{x'}) = \\exp(-\\gamma ||\\mathbf{x} - \\mathbf{x'}||^2)$$"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "from sklearn.metrics.pairwise import rbf_kernel\n",
98 | "line = np.linspace(-3, 3, 100)[:, np.newaxis]\n",
99 | "kernel_value = rbf_kernel([[0]], line, gamma=1)\n",
100 | "plt.plot(line, kernel_value.T)"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [],
110 | "source": [
111 | "from plots import plot_svm_interactive\n",
112 | "plot_svm_interactive()"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": false
120 | },
121 | "outputs": [],
122 | "source": [
123 | "svc = SVC().fit(X_train, y_train)\n",
124 | "svc.score(X_test, y_test)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [],
134 | "source": [
135 | "Cs = [0.001, 0.01, 0.1, 1, 10, 100]\n",
136 | "gammas = [0.001, 0.01, 0.1, 1, 10, 100]\n",
137 | "\n",
138 | "from sklearn.grid_search import GridSearchCV\n",
139 | "\n",
140 | "param_grid = {'C': Cs, 'gamma' : gammas}\n",
141 | "grid_search = GridSearchCV(SVC(), param_grid, cv=5)\n",
142 | "grid_search.fit(X_train, y_train)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [],
152 | "source": [
153 | "grid_search.score(X_test, y_test)"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "collapsed": false
161 | },
162 | "outputs": [],
163 | "source": [
164 | "# We extract just the scores\n",
165 | "scores = [x[1] for x in grid_search.grid_scores_]\n",
166 | "scores = np.array(scores).reshape(6, 6)\n",
167 | "\n",
168 | "plt.matshow(scores)\n",
169 | "plt.xlabel('gamma')\n",
170 | "plt.ylabel('C')\n",
171 | "plt.colorbar()\n",
172 | "plt.xticks(np.arange(6), param_grid['gamma'])\n",
173 | "plt.yticks(np.arange(6), param_grid['C']);"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {
179 | "collapsed": true
180 | },
181 | "source": [
182 | "# Exercise\n",
183 | "* Scale the data using StandardScaler before applying the SVC. How does the performance of the default parameters change?\n",
184 | "* Grid-Search the parameters for the scaled data. How do they differ from the previous ones?"
185 | ]
186 | }
187 | ],
188 | "metadata": {
189 | "kernelspec": {
190 | "display_name": "Python 2",
191 | "language": "python",
192 | "name": "python2"
193 | },
194 | "language_info": {
195 | "codemirror_mode": {
196 | "name": "ipython",
197 | "version": 2
198 | },
199 | "file_extension": ".py",
200 | "mimetype": "text/x-python",
201 | "name": "python",
202 | "nbconvert_exporter": "python",
203 | "pygments_lexer": "ipython2",
204 | "version": "2.7.10"
205 | }
206 | },
207 | "nbformat": 4,
208 | "nbformat_minor": 0
209 | }
210 |
--------------------------------------------------------------------------------
/09.3 - Trees and Forests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Trees and Forests"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "%matplotlib nbagg\n",
19 | "import numpy as np\n",
20 | "import matplotlib.pyplot as plt"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "Decision Tree Classification\n",
28 | "==================\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "metadata": {
35 | "collapsed": false
36 | },
37 | "outputs": [],
38 | "source": [
39 | "from plots import plot_tree_interactive\n",
40 | "plot_tree_interactive()"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## Random Forests"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [],
57 | "source": [
58 | "from plots import plot_forest_interactive\n",
59 | "plot_forest_interactive()"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## Selecting the Optimal Estimator via Cross-Validation"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": false
74 | },
75 | "outputs": [],
76 | "source": [
77 | "from sklearn import grid_search\n",
78 | "from sklearn.datasets import load_digits\n",
79 | "from sklearn.cross_validation import train_test_split\n",
80 | "from sklearn.ensemble import RandomForestClassifier\n",
81 | "\n",
82 | "digits = load_digits()\n",
83 | "X, y = digits.data, digits.target\n",
84 | "\n",
85 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
86 | "\n",
87 | "rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)\n",
88 | "parameters = {'max_features':['sqrt', 'log2'],\n",
89 | " 'max_depth':[5, 7, 9]}\n",
90 | "\n",
91 | "clf_grid = grid_search.GridSearchCV(rf, parameters)\n",
92 | "clf_grid.fit(X_train, y_train)"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {
99 | "collapsed": false
100 | },
101 | "outputs": [],
102 | "source": [
103 | "clf_grid.score(X_train, y_train)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {
110 | "collapsed": false
111 | },
112 | "outputs": [],
113 | "source": [
114 | "clf_grid.score(X_test, y_test)"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "collapsed": true
121 | },
122 | "source": [
123 | "# Exercises\n",
124 | "* Plot the validation curve for the maximum depth of a decision tree on the digits dataset.\n",
125 | "* Plot the validation curve for max_features of a random forest on the digits dataset."
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {
132 | "collapsed": true
133 | },
134 | "outputs": [],
135 | "source": [
136 | "# %load solutions/forests.py"
137 | ]
138 | }
139 | ],
140 | "metadata": {
141 | "kernelspec": {
142 | "display_name": "Python 2",
143 | "language": "python",
144 | "name": "python2"
145 | },
146 | "language_info": {
147 | "codemirror_mode": {
148 | "name": "ipython",
149 | "version": 2
150 | },
151 | "file_extension": ".py",
152 | "mimetype": "text/x-python",
153 | "name": "python",
154 | "nbconvert_exporter": "python",
155 | "pygments_lexer": "ipython2",
156 | "version": "2.7.10"
157 | }
158 | },
159 | "nbformat": 4,
160 | "nbformat_minor": 0
161 | }
162 |
--------------------------------------------------------------------------------
/10 - Working With Text Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "%matplotlib inline\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# Working with Text Data"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "
"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "collapsed": false
35 | },
36 | "outputs": [],
37 | "source": [
38 | "import pandas as pd\n",
39 | "import os\n",
40 | "\n",
41 | "data = pd.read_csv(os.path.join(\"data\", \"train.csv\"))"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "collapsed": false
49 | },
50 | "outputs": [],
51 | "source": [
52 | "len(data)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [],
62 | "source": [
63 | "data"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {
70 | "collapsed": false
71 | },
72 | "outputs": [],
73 | "source": [
74 | "y_train = np.array(data.Insult)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "collapsed": false
82 | },
83 | "outputs": [],
84 | "source": [
85 | "y_train"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": false
93 | },
94 | "outputs": [],
95 | "source": [
96 | "text_train = data.Comment.tolist()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {
103 | "collapsed": false
104 | },
105 | "outputs": [],
106 | "source": [
107 | "text_train[6]"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [],
117 | "source": [
118 | "data_test = pd.read_csv(os.path.join(\"data\", \"test_with_solutions.csv\"))"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {
125 | "collapsed": false
126 | },
127 | "outputs": [],
128 | "source": [
129 | "text_test, y_test = data_test.Comment.tolist(), np.array(data_test.Insult)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {
136 | "collapsed": false
137 | },
138 | "outputs": [],
139 | "source": [
140 | "from sklearn.feature_extraction.text import CountVectorizer"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": null,
146 | "metadata": {
147 | "collapsed": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "cv = CountVectorizer()\n",
152 | "cv.fit(text_train)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {
159 | "collapsed": false
160 | },
161 | "outputs": [],
162 | "source": [
163 | "len(cv.vocabulary_)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "metadata": {
170 | "collapsed": false,
171 | "scrolled": true
172 | },
173 | "outputs": [],
174 | "source": [
175 | "print(cv.get_feature_names()[:50])\n",
176 | "print(cv.get_feature_names()[-50:])"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [],
186 | "source": [
187 | "X_train = cv.transform(text_train)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "collapsed": false
195 | },
196 | "outputs": [],
197 | "source": [
198 | "X_train"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {
205 | "collapsed": false
206 | },
207 | "outputs": [],
208 | "source": [
209 | "text_train[6]"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [],
219 | "source": [
220 | "X_train[6, :].nonzero()[1]"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [],
230 | "source": [
231 | "X_test = cv.transform(text_test)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {
238 | "collapsed": false
239 | },
240 | "outputs": [],
241 | "source": [
242 | "from sklearn.svm import LinearSVC\n",
243 | "svm = LinearSVC()"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "collapsed": false
251 | },
252 | "outputs": [],
253 | "source": [
254 | "svm.fit(X_train, y_train)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": null,
260 | "metadata": {
261 | "collapsed": false
262 | },
263 | "outputs": [],
264 | "source": [
265 | "svm.score(X_train, y_train)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {
272 | "collapsed": false
273 | },
274 | "outputs": [],
275 | "source": [
276 | "svm.score(X_test, y_test)"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [],
286 | "source": [
287 | "def visualize_coefficients(classifier, feature_names, n_top_features=25):\n",
288 | " # get coefficients with large absolute values \n",
289 | " coef = classifier.coef_.ravel()\n",
290 | " positive_coefficients = np.argsort(coef)[-n_top_features:]\n",
291 | " negative_coefficients = np.argsort(coef)[:n_top_features]\n",
292 | " interesting_coefficients = np.hstack([negative_coefficients, positive_coefficients])\n",
293 | " # plot them\n",
294 | " plt.figure(figsize=(15, 5))\n",
295 | " colors = [\"red\" if c < 0 else \"blue\" for c in coef[interesting_coefficients]]\n",
296 | " plt.bar(np.arange(50), coef[interesting_coefficients], color=colors)\n",
297 | " feature_names = np.array(feature_names)\n",
298 | " plt.xticks(np.arange(1, 51), feature_names[interesting_coefficients], rotation=60, ha=\"right\");\n"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "collapsed": false
306 | },
307 | "outputs": [],
308 | "source": [
309 | "visualize_coefficients(svm, cv.get_feature_names())"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "metadata": {},
315 | "source": [
316 | "# Exercises\n",
317 | "* Create a pipeline using the count vectorizer and SVM (see 08). Train and score using the pipeline.\n",
318 | "* Vary the ngram_range in the count vectorizer, visualize the changed coefficients.\n",
319 | "* Grid search the C in the LinearSVC using the pipeline.\n",
320 | "* Grid search the C in the LinearSVC together with the ngram_range (try (1, 1), (1, 2), (2, 2))"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {
327 | "collapsed": false
328 | },
329 | "outputs": [],
330 | "source": [
331 | "# %load solutions/text_pipeline.py\n"
332 | ]
333 | }
334 | ],
335 | "metadata": {
336 | "kernelspec": {
337 | "display_name": "Python 2",
338 | "language": "python",
339 | "name": "python2"
340 | },
341 | "language_info": {
342 | "codemirror_mode": {
343 | "name": "ipython",
344 | "version": 2
345 | },
346 | "file_extension": ".py",
347 | "mimetype": "text/x-python",
348 | "name": "python",
349 | "nbconvert_exporter": "python",
350 | "pygments_lexer": "ipython2",
351 | "version": "2.7.6"
352 | }
353 | },
354 | "nbformat": 4,
355 | "nbformat_minor": 0
356 | }
357 |
--------------------------------------------------------------------------------
/11 - Out Of Core Learning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# write out some toy data\n",
12 | "from sklearn.datasets import load_digits\n",
13 | "import cPickle\n",
14 | "\n",
15 | "digits = load_digits()\n",
16 | "\n",
17 | "X, y = digits.data, digits.target\n",
18 | "\n",
19 | "for i in range(10):\n",
20 | " cPickle.dump((X[i::10], y[i::10]), open(\"data/batch_%02d.pickle\" % i, \"w\"), -1)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {
27 | "collapsed": false
28 | },
29 | "outputs": [],
30 | "source": [
31 | "from sklearn.linear_model import SGDClassifier\n"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "collapsed": false
39 | },
40 | "outputs": [],
41 | "source": [
42 | "sgd = SGDClassifier()\n",
43 | "\n",
44 | "for i in range(9):\n",
45 | " X_batch, y_batch = cPickle.load(open(\"data/batch_%02d.pickle\" % i))\n",
46 | " sgd.partial_fit(X_batch, y_batch, classes=range(10))"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "X_test, y_test = cPickle.load(open(\"data/batch_09.pickle\"))\n",
58 | "\n",
59 | "sgd.score(X_test, y_test)"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "Text\n",
67 | "====="
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": false
75 | },
76 | "outputs": [],
77 | "source": [
78 | "import pandas as pd\n",
79 | "from sklearn.feature_extraction.text import HashingVectorizer\n",
80 | "\n",
81 | "sgd = SGDClassifier()\n",
82 | "hashing_vectorizer = HashingVectorizer()\n",
83 | "\n",
84 | "for i in range(10):\n",
85 | " data_batch = pd.read_csv(\"data/train_%d.csv\" % i)\n",
86 | " text_batch = data_batch.Comment.tolist()\n",
87 | " y_batch = data_batch.Insult.values\n",
88 | " X_batch = hashing_vectorizer.transform(text_batch)\n",
89 | " sgd.partial_fit(X_batch, y_batch, classes=range(10))"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {
96 | "collapsed": false
97 | },
98 | "outputs": [],
99 | "source": [
100 | "data_test = pd.read_csv(\"data/test_with_solutions.csv\")\n",
101 | "X_test = hashing_vectorizer.transform(data_test.Comment.tolist())\n",
102 | "y_test = data_test.Insult.values\n",
103 | "sgd.score(X_test, y_test)"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "Kernel Approximations\n",
111 | "======================="
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "from sklearn.kernel_approximation import RBFSampler\n",
123 | "\n",
124 | "sgd = SGDClassifier()\n",
125 | "kernel_approximation = RBFSampler(gamma=.001, n_components=400)\n",
126 | "\n",
127 | "for i in range(9):\n",
128 | " X_batch, y_batch = cPickle.load(open(\"data/batch_%02d.pickle\" % i))\n",
129 | " if i == 0:\n",
130 | " kernel_approximation.fit(X_batch)\n",
131 | " X_transformed = kernel_approximation.transform(X_batch)\n",
132 | " sgd.partial_fit(X_transformed, y_batch, classes=range(10))"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [],
142 | "source": [
143 | "X_test, y_test = cPickle.load(open(\"data/batch_09.pickle\"))\n",
144 | "\n",
145 | "sgd.score(kernel_approximation.transform(X_test), y_test)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {
152 | "collapsed": true
153 | },
154 | "outputs": [],
155 | "source": []
156 | }
157 | ],
158 | "metadata": {
159 | "kernelspec": {
160 | "display_name": "Python 2",
161 | "language": "python",
162 | "name": "python2"
163 | },
164 | "language_info": {
165 | "codemirror_mode": {
166 | "name": "ipython",
167 | "version": 2
168 | },
169 | "file_extension": ".py",
170 | "mimetype": "text/x-python",
171 | "name": "python",
172 | "nbconvert_exporter": "python",
173 | "pygments_lexer": "ipython2",
174 | "version": "2.7.6"
175 | }
176 | },
177 | "nbformat": 4,
178 | "nbformat_minor": 0
179 | }
180 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Andreas Mueller
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Material for Scikit-learn workshop
2 | Jupyter notebooks for an interactive scikit-learn workshop with exercises and solutions.
3 |
--------------------------------------------------------------------------------
/figures/cluster_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/sklearn_workshop/3bb698e874ea2f816855ec2bc1ae406e555bd5d0/figures/cluster_comparison.png
--------------------------------------------------------------------------------
/figures/pipeline.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
844 |
--------------------------------------------------------------------------------
/figures/randomized_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amueller/sklearn_workshop/3bb698e874ea2f816855ec2bc1ae406e555bd5d0/figures/randomized_search.png
--------------------------------------------------------------------------------
/figures/train_test_split.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/figures/train_validation_test2.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/plots/__init__.py:
--------------------------------------------------------------------------------
1 | from .plot_2d_separator import plot_2d_separator
2 | from .plot_kneighbors_regularization import plot_kneighbors_regularization, \
3 | plot_regression_datasets, make_dataset
4 | from .plot_linear_svc_regularization import plot_linear_svc_regularization
5 | from .plot_interactive_tree import plot_tree_interactive
6 | from .plot_interactive_forest import plot_forest_interactive
7 | from .plot_rbf_svm_parameters import plot_rbf_svm_parameters
8 | from .plot_rbf_svm_parameters import plot_svm_interactive
9 |
10 | __all__ = ['plot_2d_separator', 'plot_kneighbors_regularization',
11 | 'plot_linear_svc_regularization', 'plot_tree_interactive',
12 | 'plot_regression_datasets', 'make_dataset',
13 | "plot_forest_interactive", "plot_rbf_svm_parameters",
14 | "plot_svm_interactive"]
15 |
--------------------------------------------------------------------------------
/plots/plot_2d_separator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 |
def plot_2d_separator(classifier, X, fill=False, ax=None, eps=None):
    """Draw the decision boundary of a fitted classifier over 2D data.

    The classifier is evaluated on a 100 x 100 grid spanning the range of the
    first two columns of ``X``, padded by ``eps`` on every side.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``decision_function`` or, failing that,
        ``predict_proba`` (column 1 is used).
    X : ndarray
        Data whose first two columns define the plotted region.
    fill : bool, default False
        If true, shade both sides of the boundary (blue / red) with
        ``contourf``; otherwise draw only the boundary line in black.
    ax : axes object, optional
        Axes to draw on; defaults to ``plt.gca()``.
    eps : float, optional
        Padding around the data range; defaults to ``X.std() / 2``.
    """
    margin = X.std() / 2. if eps is None else eps
    x_lo, x_hi = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_lo, y_hi = X[:, 1].min() - margin, X[:, 1].max() + margin

    grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                 np.linspace(y_lo, y_hi, 100))
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))

    try:
        scores = classifier.decision_function(grid_points)
        boundary = [0]
        shading = [scores.min(), 0, scores.max()]
    except AttributeError:
        # no decision_function: threshold the second predict_proba column
        scores = classifier.predict_proba(grid_points)[:, 1]
        boundary = [.5]
        shading = [0, .5, 1]

    target = plt.gca() if ax is None else ax
    surface = scores.reshape(grid_x.shape)
    if fill:
        target.contourf(grid_x, grid_y, surface,
                        levels=shading, colors=['blue', 'red'])
    else:
        target.contour(grid_x, grid_y, surface, levels=boundary,
                       colors="black")
    target.set_xlim(x_lo, x_hi)
    target.set_ylim(y_lo, y_hi)
    target.set_xticks(())
    target.set_yticks(())
37 |
38 |
if __name__ == '__main__':
    # Demo when run as a script: fit a logistic regression on two blobs and
    # show the filled decision regions together with the data points.
    from sklearn.datasets import make_blobs
    from sklearn.linear_model import LogisticRegression
    X, y = make_blobs(centers=2, random_state=42)
    clf = LogisticRegression().fit(X, y)
    plot_2d_separator(clf, X, fill=True)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.show()
47 |
--------------------------------------------------------------------------------
/plots/plot_interactive_forest.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import make_blobs
5 | from sklearn.ensemble import RandomForestClassifier
6 |
7 |
8 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
9 |
10 |
def plot_forest(max_depth=1):
    """Plot the module-level blob data, optionally over a random forest fit.

    With ``max_depth == 0`` only the data points are drawn. For any other
    value a 20-tree forest of that depth is fitted on the module-level
    ``X, y`` and its predicted probability for the second class is drawn as
    filled contours behind the points.
    """
    plt.figure()
    axes = plt.gca()
    step = 0.02

    x_lo, x_hi = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_lo, y_hi = X[:, 1].min() - .5, X[:, 1].max() + .5
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))

    if max_depth == 0:
        axes.set_title("data set")
    else:
        model = RandomForestClassifier(n_estimators=20, max_depth=max_depth,
                                       random_state=1)
        model.fit(X, y)
        grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
        proba = model.predict_proba(grid_points)[:, 1].reshape(grid_x.shape)
        axes.contourf(grid_x, grid_y, proba, alpha=.4)
        axes.set_title("max_depth = %d" % max_depth)

    point_colors = np.array(['b', 'r'])[y]
    axes.scatter(X[:, 0], X[:, 1], c=point_colors, s=60)
    axes.set_xlim(x_lo, x_hi)
    axes.set_ylim(y_lo, y_hi)
    axes.set_xticks(())
    axes.set_yticks(())
34 |
35 |
def plot_forest_interactive():
    """Return a widget that redraws ``plot_forest`` from a ``max_depth`` slider.

    The slider runs over the integers 0..8 and starts at 0.
    """
    try:
        # The widgets were split out of IPython into the ipywidgets package.
        from ipywidgets import interactive, IntSlider
    except ImportError:
        # Older IPython releases still ship them under IPython.html.
        from IPython.html.widgets import interactive, IntSlider
    slider = IntSlider(min=0, max=8, step=1, value=0)
    return interactive(plot_forest, max_depth=slider)
40 |
--------------------------------------------------------------------------------
/plots/plot_interactive_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import make_blobs
5 | from sklearn.tree import DecisionTreeClassifier
6 |
7 | from sklearn.externals.six import StringIO # doctest: +SKIP
8 | from sklearn.tree import export_graphviz
9 | from scipy.misc import imread
10 | from scipy import ndimage
11 | import os
12 |
# Append a default Windows Graphviz install directory to PATH unless it is
# already mentioned there (presumably so the graphviz binaries used for tree
# rendering can be found -- confirm).
GRAPHVIS_PATH = r"C:\Program Files (x86)\Graphviz2.38\bin"
if GRAPHVIS_PATH not in os.environ.get('PATH', ''):
    # os.pathsep is ";" on Windows and ":" elsewhere; the filter drops an
    # empty or unset PATH so no leading separator is produced.
    os.environ['PATH'] = os.pathsep.join(
        part for part in (os.environ.get('PATH', ''), GRAPHVIS_PATH) if part)
16 |
17 | import re
18 |
19 | X, y = make_blobs(centers=[[0, 0], [1, 1]], random_state=61526, n_samples=50)
20 |
21 |
def tree_image(tree, fout=None):
    """Render ``tree`` to a PNG through pydot and return it as an image array.

    The graphviz source produced by ``export_graphviz`` is stripped of its
    "gini = ..." and "samples = ..." label parts before rendering. The PNG is
    written to ``fout`` ("tmp.png" when omitted) and read back with
    ``imread``.

    Returns None when either optional import below fails.
    NOTE(review): ``a_reliable_dot_rendering`` looks like a placeholder module
    name; unless it is installed this function always returns None -- confirm
    that this is intended.
    """
    try:
        import pydot
        import a_reliable_dot_rendering
    except ImportError:
        return None

    buffer = StringIO()
    export_graphviz(tree, out_file=buffer)
    dot_source = buffer.getvalue()
    # drop impurity and sample-count entries from the node labels
    for pattern in (r"gini = 0\.[0-9]+\\n",
                    r"samples = [0-9]+\\n",
                    r"\\nsamples = [0-9]+"):
        dot_source = re.sub(pattern, "", dot_source)

    graph = pydot.graph_from_dot_data(dot_source)
    target = "tmp.png" if fout is None else fout
    graph.write_png(target)
    return imread(target)
39 |
40 |
def plot_tree(max_depth=1):
    """Plot the module-level blob data next to a decision tree fitted on it.

    Left panel: the data points. For ``max_depth != 0`` a decision tree of
    that depth is fitted on the module-level ``X, y``; its predicted
    probability for the second class is drawn as filled contours and the
    borders between leaf regions are marked with dots.

    Right panel: the rendered tree from ``tree_image`` when one is returned;
    hidden otherwise (including for ``max_depth == 0``).
    """
    fig, ax = plt.subplots(1, 2, figsize=(15, 7))
    h = 0.02  # grid step

    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    if max_depth != 0:
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y)
        Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        Z = Z.reshape(xx.shape)
        # leaf index per grid point; a non-zero Laplacian marks a leaf change
        faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))
        faces = faces.reshape(xx.shape)
        border = ndimage.laplace(faces) != 0
        ax[0].contourf(xx, yy, Z, alpha=.4)
        ax[0].scatter(xx[border], yy[border], marker='.', s=1)
        ax[0].set_title("max_depth = %d" % max_depth)
        img = tree_image(tree)
        if img is not None:
            # show the rendered tree (previously referenced an undefined
            # name, which raised NameError whenever an image was available)
            ax[1].imshow(img)
            ax[1].axis("off")
        else:
            ax[1].set_visible(False)
    else:
        ax[0].set_title("data set")
        ax[1].set_visible(False)
    ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60)
    ax[0].set_xlim(x_min, x_max)
    ax[0].set_ylim(y_min, y_max)
    ax[0].set_xticks(())
    ax[0].set_yticks(())
73 |
74 |
def plot_tree_interactive():
    """Return a widget that redraws ``plot_tree`` from a ``max_depth`` slider.

    The slider runs over the integers 0..8 and starts at 0.
    """
    try:
        # The widgets were split out of IPython into the ipywidgets package.
        from ipywidgets import interactive, IntSlider
    except ImportError:
        # Older IPython releases still ship them under IPython.html.
        from IPython.html.widgets import interactive, IntSlider
    slider = IntSlider(min=0, max=8, step=1, value=0)
    return interactive(plot_tree, max_depth=slider)
79 |
--------------------------------------------------------------------------------
/plots/plot_kneighbors_regularization.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.neighbors import KNeighborsRegressor
5 |
6 |
def make_dataset(n_samples=100):
    """Return a noisy 1D regression toy problem as ``(x, y)``.

    ``x`` is ``n_samples`` evenly spaced points on [-3, 3] and
    ``y = sin(4 x) + x`` plus standard normal noise drawn from a generator
    seeded with 42, so repeated calls give identical data.
    """
    rng = np.random.RandomState(42)
    inputs = np.linspace(-3, 3, n_samples)
    noise = rng.normal(size=len(inputs))
    targets = np.sin(4 * inputs) + inputs + noise
    return inputs, targets
13 |
14 |
def plot_regression_datasets():
    """Show the ``make_dataset`` toy data for 10, 100 and 1000 samples.

    One figure with three side-by-side panels, one per sample size.
    """
    sample_sizes = [10, 100, 1000]
    _, panels = plt.subplots(1, 3, figsize=(15, 5))
    for panel, size in zip(panels, sample_sizes):
        xs, ys = make_dataset(size)
        panel.plot(xs, ys, 'o', alpha=.6)
20 |
21 |
def plot_kneighbors_regularization():
    """Compare k-nearest-neighbour regression fits for k = 2, 5 and 20.

    Uses 100 noisy samples of ``sin(4 x) + x`` on [-3, 3] (noise seeded with
    42). Each of the three panels shows the noise-free function, the data and
    the regressor's prediction on a 1000-point grid.
    """
    rng = np.random.RandomState(42)
    inputs = np.linspace(-3, 3, 100)
    clean = np.sin(4 * inputs) + inputs
    noisy = clean + rng.normal(size=len(inputs))
    features = inputs[:, np.newaxis]
    _, panels = plt.subplots(1, 3, figsize=(15, 5))

    grid = np.linspace(-3, 3, 1000)
    grid_features = grid[:, np.newaxis]

    for k, panel in zip([2, 5, 20], panels.ravel()):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(features, noisy)
        panel.plot(inputs, clean, label="true function")
        panel.plot(inputs, noisy, "o", label="data")
        panel.plot(grid, model.predict(grid_features), label="prediction")
        panel.legend(loc="best")
        panel.set_title("n_neighbors = %d" % k)
41 |
# When run as a script, draw the k-neighbors comparison and show the figure.
if __name__ == "__main__":
    plot_kneighbors_regularization()
    plt.show()
45 |
--------------------------------------------------------------------------------
/plots/plot_linear_svc_regularization.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.svm import SVC
4 | from sklearn.datasets import make_blobs
5 |
6 |
def plot_linear_svc_regularization():
    """Show the linear SVM decision line for C = 0.01, 1 and 100.

    Thirty blob samples are generated, two labels (indices 7 and 27) are set
    to class 0 by hand, and one panel per ``C`` shows the data together with
    the line where the fitted linear decision function is zero.
    """
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    _, panels = plt.subplots(1, 3, figsize=(12, 4))

    # a carefully hand-designed dataset lol
    for index in (7, 27):
        y[index] = 0

    x_lo, x_hi = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_lo, y_hi = X[:, 1].min() - .5, X[:, 1].max() + .5

    for C, panel in zip([1e-2, 1, 1e2], panels):
        panel.scatter(X[:, 0], X[:, 1], s=150, c=np.array(['red', 'blue'])[y])

        model = SVC(kernel='linear', C=C, tol=0.00001)
        model.fit(X, y)
        weights = model.coef_[0]
        # solve w0 * x + w1 * y + b = 0 for y
        slope = -weights[0] / weights[1]
        line_x = np.linspace(6, 13)
        line_y = slope * line_x - (model.intercept_[0]) / weights[1]
        panel.plot(line_x, line_y, label="C = %.e" % C, c='k')
        panel.set_xlim(x_lo, x_hi)
        panel.set_ylim(y_lo, y_hi)
        panel.set_xticks(())
        panel.set_yticks(())
        panel.set_title("C = %f" % C)
31 |
# When run as a script, draw the regularization comparison and show it.
if __name__ == "__main__":
    plot_linear_svc_regularization()
    plt.show()
35 |
--------------------------------------------------------------------------------
/plots/plot_rbf_svm_parameters.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | from sklearn.svm import SVC
4 | from sklearn.datasets import make_blobs
5 | from sklearn.externals.joblib import Memory
6 | from .plot_2d_separator import plot_2d_separator
7 |
def make_handcrafted_dataset():
    """Return the hand-tuned two-blob dataset ``(X, y)`` used by the SVM plots.

    Starts from 30 blob samples, sets the labels at indices 7 and 27 to
    class 0, then drops the samples at indices 0, 1, 5 and 26.
    """
    # a carefully hand-designed dataset lol
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    y[np.array([7, 27])] = 0
    # builtin bool: the ``np.bool`` alias was removed in NumPy 1.24
    mask = np.ones(len(X), dtype=bool)
    mask[np.array([0, 1, 5, 26])] = 0
    X, y = X[mask], y[mask]
    return X, y
16 |
17 |
def plot_rbf_svm_parameters():
    """Show how ``C`` and ``gamma`` change an RBF SVM's decision boundary.

    Draws two figures on the handcrafted dataset: one row of panels varying
    ``C`` (default gamma) and one row varying ``gamma`` with ``C=1``. Each
    panel shows the data and the fitted boundary.
    """
    X, y = make_handcrafted_dataset()
    point_colors = np.array(['red', 'blue'])[y]

    # One panel per value: the row used to have three axes for four values,
    # so zip() silently dropped the C = 100 panel.
    C_values = [1e0, 5, 10, 100]
    fig, axes = plt.subplots(1, len(C_values), figsize=(15, 3))
    for ax, C in zip(axes, C_values):
        ax.scatter(X[:, 0], X[:, 1], s=150, c=point_colors)

        svm = SVC(kernel='rbf', C=C).fit(X, y)
        plot_2d_separator(svm, X, ax=ax, eps=.5)
        ax.set_title("C = %f" % C)

    gamma_values = [0.1, .5, 1, 10]
    fig, axes = plt.subplots(1, len(gamma_values), figsize=(15, 3))
    for ax, gamma in zip(axes, gamma_values):
        ax.scatter(X[:, 0], X[:, 1], s=150, c=point_colors)
        svm = SVC(gamma=gamma, kernel='rbf', C=1).fit(X, y)
        plot_2d_separator(svm, X, ax=ax, eps=.5)
        ax.set_title("gamma = %f" % gamma)
35 |
36 |
def plot_svm(log_C, log_gamma):
    """Fit and draw an RBF SVM with ``C = 10**log_C``, ``gamma = 10**log_gamma``.

    On the current axes, draws the decision boundary, the handcrafted data
    points and rings around the support vectors; the title reports the
    resulting ``C`` and ``gamma``.
    """
    features, labels = make_handcrafted_dataset()
    C, gamma = 10. ** log_C, 10. ** log_gamma
    model = SVC(kernel='rbf', C=C, gamma=gamma)
    model.fit(features, labels)

    axes = plt.gca()
    plot_2d_separator(model, features, ax=axes, eps=.5)
    # data points, colored by class
    point_colors = np.array(['red', 'blue'])[labels]
    axes.scatter(features[:, 0], features[:, 1], s=150, c=point_colors)
    # support vectors, drawn as larger unfilled markers on top
    support = model.support_vectors_
    axes.scatter(support[:, 0], support[:, 1], s=230, facecolors='none',
                 zorder=10, linewidth=3)
    axes.set_title("C = %.4f gamma = %.4f" % (C, gamma))
50 |
51 |
def plot_svm_interactive():
    """Return a widget that redraws ``plot_svm`` from two log-scale sliders.

    ``log_C`` ranges over [-3, 3] and ``log_gamma`` over [-2, 2], both in
    steps of 0.1 and starting at 0; the numeric readout is turned off.
    """
    try:
        # The widgets were split out of IPython into the ipywidgets package.
        from ipywidgets import interactive, FloatSlider
    except ImportError:
        # Older IPython releases still ship them under IPython.html.
        from IPython.html.widgets import interactive, FloatSlider
    C_slider = FloatSlider(min=-3, max=3, step=.1, value=0, readout=False)
    gamma_slider = FloatSlider(min=-2, max=2, step=.1, value=0, readout=False)
    return interactive(plot_svm, log_C=C_slider, log_gamma=gamma_slider)
57 |
--------------------------------------------------------------------------------
/solutions/cross_validation_iris.py:
--------------------------------------------------------------------------------
# Compare plain and stratified 3-fold cross-validation of a LinearSVC on iris.
from sklearn.datasets import load_iris
# cross_val_score and LinearSVC are imported here so the snippet also runs
# outside a notebook that already defines them.
from sklearn.cross_validation import StratifiedKFold, KFold, cross_val_score
from sklearn.svm import LinearSVC
iris = load_iris()
X, y = iris.data, iris.target

# folds built from the number of samples only
print(cross_val_score(LinearSVC(), X, y, cv=KFold(len(X), 3)))
# folds built from the labels
print(cross_val_score(LinearSVC(), X, y, cv=StratifiedKFold(y, 3)))
8 |
--------------------------------------------------------------------------------
/solutions/digits_tsne.py:
--------------------------------------------------------------------------------
# Embed X with t-SNE and scatter the first two embedding columns, colored by y.
# NOTE(review): X, y and plt are not defined here; presumably they come from
# the notebook this solution is loaded into -- confirm.
from sklearn.manifold import TSNE
tsne = TSNE()
X_tsne = tsne.fit_transform(X)
plt.title("All classes")
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
6 |
--------------------------------------------------------------------------------
/solutions/forests.py:
--------------------------------------------------------------------------------
1 | from sklearn.tree import DecisionTreeClassifier
2 | from sklearn.ensemble import RandomForestClassifier
3 | from sklearn.datasets import load_digits
4 | from sklearn.learning_curve import validation_curve
5 |
6 | digits = load_digits()
7 |
def plot_validation_curve(parameter_values, train_scores, validation_scores):
    """Plot mean training and validation scores against a parameter.

    Both score arrays are averaged along axis 1. The mean curves are drawn in
    red (training) and green (validation), each with a translucent band of
    one standard deviation on either side.
    """
    series = [
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1),
         "r", "Training score"),
        (np.mean(validation_scores, axis=1), np.std(validation_scores, axis=1),
         "g", "Cross-validation score"),
    ]

    # bands first, then the mean curves, in the same order for both
    for mean, std, color, _ in series:
        plt.fill_between(parameter_values, mean - std, mean + std,
                         alpha=0.1, color=color)
    for mean, _, color, label in series:
        plt.plot(parameter_values, mean, 'o-', color=color, label=label)

    train_mean, validation_mean = series[0][0], series[1][0]
    plt.ylim(validation_mean.min() - .1, train_mean.max() + .1)
    plt.legend(loc="best")
25 |
# Validation curve for the maximum depth (1..49) of a decision tree on the
# digits data, using 5-fold cross-validation.
param_range = range(1, 50)
training_scores, validation_scores = validation_curve(DecisionTreeClassifier(), digits.data, digits.target,
                                                      param_name="max_depth",
                                                      param_range=param_range,
                                                      cv=5)
plt.figure()
plot_validation_curve(param_range, training_scores, validation_scores)

# Same for max_features (1..19) of a 100-tree random forest.
param_range = range(1, 20, 1)
training_scores, validation_scores = validation_curve(RandomForestClassifier(n_estimators=100),
                                                      digits.data, digits.target,
                                                      param_name="max_features",
                                                      param_range=param_range,
                                                      cv=5)
plt.figure()
plot_validation_curve(param_range, training_scores, validation_scores)
42 |
--------------------------------------------------------------------------------
/solutions/grid_search_forest.py:
--------------------------------------------------------------------------------
1 | from sklearn.ensemble import RandomForestClassifier
2 |
# Grid of random-forest hyper-parameters to search over.
param_grid = {'max_depth': [1, 3, 5, 7, 10], 'max_features': [5, 8, 10, 20]}

grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("best parameters: %s" % grid.best_params_)
print("Training set accuracy: %s" % grid.score(X_train, y_train))
print("Test set accuracy: %s" % grid.score(X_test, y_test))

# Arrange the mean validation scores as a (max_depth x max_features) matrix.
# The shape is taken from the grid itself so that editing param_grid cannot
# silently break the reshape.  The row/column assignment relies on the grid
# being enumerated with max_features varying fastest (grid keys are visited
# in sorted order, the last one changing quickest).
depths = param_grid['max_depth']
feature_counts = param_grid['max_features']
scores = [x.mean_validation_score for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(depths), len(feature_counts))

plt.matshow(scores)
plt.xlabel("max_features")
plt.ylabel("max_depth")
# Label the cells with the actual parameter values instead of bare indices.
plt.xticks(np.arange(len(feature_counts)), feature_counts)
plt.yticks(np.arange(len(depths)), depths)
16 |
--------------------------------------------------------------------------------
/solutions/grid_search_k_neighbors.py:
--------------------------------------------------------------------------------
1 | from sklearn.neighbors import KNeighborsClassifier
2 |
# Candidate neighbourhood sizes for the k-nearest-neighbours classifier.
param_grid = {'n_neighbors': [1, 3, 5, 7, 10]}

grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid)
grid.fit(X_train, y_train)

# Report the selected setting, then the refit model's accuracy on both splits.
best_params = grid.best_params_
print("best parameters: %s" % (best_params,))
train_accuracy = grid.score(X_train, y_train)
print("Training set accuracy: %s" % (train_accuracy,))
test_accuracy = grid.score(X_test, y_test)
print("Test set accuracy: %s" % (test_accuracy,))
10 |
--------------------------------------------------------------------------------
/solutions/linear_models.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 |
3 | from sklearn.grid_search import GridSearchCV
4 | from sklearn.datasets import load_digits
5 | from sklearn.cross_validation import train_test_split
6 | from sklearn.svm import LinearSVC
7 |
# Turn digits into a binary problem: the label is the digit's parity
# (target % 2, i.e. 1 for odd digits and 0 for even ones).
digits = load_digits()
parity = digits.target % 2
X_train, X_test, y_train, y_test = train_test_split(digits.data, parity)

# 5-fold grid search over nine log-spaced values of C from 1e-6 to 1e2.
c_candidates = np.logspace(-6, 2, 9)
grid = GridSearchCV(LinearSVC(), param_grid={'C': c_candidates}, cv=5)
grid.fit(X_train, y_train)
pprint(grid.grid_scores_)
pprint(grid.score(X_test, y_test))
15 |
16 |
# Compare l1- and l2-penalised linear SVMs over a range of C values.
# For each penalty two figures are produced: training/test accuracy per C,
# and the learned coefficients of every fitted model.
Cs = [10, 1, .01, 0.001, 0.0001]
for penalty in ['l1', 'l2']:
    svm_models = {}
    training_scores = []
    test_scores = []
    for C in Cs:
        # dual=False: solve the primal problem (the l1 penalty is only
        # available in that formulation).
        svm = LinearSVC(C=C, penalty=penalty, dual=False).fit(X_train, y_train)
        training_scores.append(svm.score(X_train, y_train))
        test_scores.append(svm.score(X_test, y_test))
        svm_models[C] = svm

    plt.figure()
    plt.plot(training_scores, label="training scores")
    plt.plot(test_scores, label="test scores")
    # One tick per entry of Cs; a fixed range(4) would give four positions
    # for five labels.
    plt.xticks(range(len(Cs)), Cs)
    plt.legend(loc="best")

    plt.figure(figsize=(10, 5))
    for C in Cs:
        # %g keeps small values distinguishable in the legend
        # (%.2f renders both 0.001 and 0.0001 as "0.00").
        plt.plot(svm_models[C].coef_.ravel(), "o", label="C = %g" % C)

    plt.legend(loc="best")
39 |
--------------------------------------------------------------------------------
/solutions/load_iris.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | from sklearn.datasets import load_iris
5 | from sklearn.cross_validation import train_test_split
6 |
# Load iris and report its basic dimensions.
iris = load_iris()
X, y = iris.data, iris.target

n_samples, n_features = X.shape
n_classes = len(np.unique(y))
print("Dataset size: %d number of features: %d number of classes: %d"
      % (n_samples, n_features, n_classes))

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Scatter the training points coloured by class: features 0 vs 1 on the
# current figure, then features 2 vs 3 on a new one.
feature_pairs = [(0, 1), (2, 3)]
for plot_index, (first, second) in enumerate(feature_pairs):
    if plot_index:
        plt.figure()
    plt.scatter(X_train[:, first], X_train[:, second], c=y_train)
18 |
--------------------------------------------------------------------------------
/solutions/pipeline_iris.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn.feature_selection import SelectKBest
3 | from sklearn.pipeline import make_pipeline
4 | from sklearn.svm import LinearSVC
5 |
# Extend the iris features with 5 extra columns of uniform random noise
# drawn from a fixed-seed generator.
rng = np.random.RandomState(42)
iris = load_iris()
noise = rng.uniform(size=(len(iris.data), 5))
X = np.hstack([iris.data, noise])
X_train, X_test, y_train, y_test = train_test_split(
    X, iris.target, random_state=2)

# Univariate feature selection followed by a linear SVM; the grid searches
# both the number of kept features and the SVM regularisation strength.
selection_pipe = make_pipeline(SelectKBest(), LinearSVC())
c_values = 10. ** np.arange(-3, 3)
k_values = [1, 2, 3, 4, 5, 7]
param_grid = {'linearsvc__C': c_values,
              'selectkbest__k': k_values}
grid = GridSearchCV(selection_pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

best_params = grid.best_params_
print("Best parameters: %s" % (best_params,))
test_score = grid.score(X_test, y_test)
print("Test set performance: %s" % (test_score,))
18 |
--------------------------------------------------------------------------------
/solutions/svms.py:
--------------------------------------------------------------------------------
# Baseline: default SVC on the raw features.
raw_model = SVC().fit(X_train, y_train)
print("default score without scaling: %f" % raw_model.score(X_test, y_test))

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
scaled_model = SVC().fit(X_train_scaled, y_train)
print("default score with scaling: %f" % scaled_model.score(X_test_scaled, y_test))

grid_search.fit(X_train_scaled, y_train)

# Keep only the score (second field) of each grid entry and arrange the
# 36 values as a 6 x 6 matrix.
scores = np.array(
    [entry[1] for entry in grid_search.grid_scores_]).reshape(6, 6)

# Heat map of the scores with the parameter values as tick labels.
plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
tick_positions = np.arange(6)
plt.xticks(tick_positions, param_grid['gamma'])
plt.yticks(tick_positions, param_grid['C'])
21 |
--------------------------------------------------------------------------------
/solutions/text_pipeline.py:
--------------------------------------------------------------------------------
1 | from sklearn.pipeline import Pipeline
2 | from sklearn.grid_search import GridSearchCV
3 |
# Chain the vectorizer and the classifier so they are fit and scored together.
pipeline = Pipeline(steps=[('vectorizer', cv), ('classifier', svm)])
pipeline.fit(text_train, y_train)
pipeline_score = pipeline.score(text_test, y_test)
print("Pipeline test score: %f" % pipeline_score)

# First search: only the classifier's C, over six powers of ten.
c_values = 10. ** np.arange(-3, 3)
param_grid = {'classifier__C': c_values}

grid_search = GridSearchCV(pipeline, param_grid=param_grid)
grid_search.fit(text_train, y_train)
print("best parameters : %s" % (grid_search.best_params_,))
print("Grid-searched test score: %f" % grid_search.score(text_test, y_test))


# Second search: C together with the vectorizer's n-gram range,
# run with three parallel jobs.
param_grid = {'classifier__C': c_values,
              "vectorizer__ngram_range": [(1, 1), (1, 2), (2, 2)]}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=3)
grid_search.fit(text_train, y_train)

print("best parameters with n-gram search: %s" % (grid_search.best_params_,))
ngram_score = grid_search.score(text_test, y_test)
print("test set score with n-gram search: %s" % (ngram_score,))
23 |
--------------------------------------------------------------------------------
/solutions/train_iris.py:
--------------------------------------------------------------------------------
1 | from sklearn.datasets import load_iris
2 | from sklearn.neighbors import KNeighborsClassifier
3 | from sklearn.cross_validation import train_test_split
4 |
# Load iris and hold out part of it for testing.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit a 3-nearest-neighbours classifier (fit returns the estimator itself)
# and report its accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

test_score = knn.score(X_test, y_test)
print("test set score of knn: %f" % test_score)
14 |
--------------------------------------------------------------------------------
/solutions/validation_curve.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 |
3 | from sklearn.svm import LinearSVC
4 | from sklearn.neighbors import KNeighborsClassifier
5 | from sklearn.learning_curve import validation_curve
6 |
7 |
# Validation curve of a linear SVM over C.
cs = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
training_scores, test_scores = validation_curve(LinearSVC(), X, y,
                                                param_name="C", param_range=cs)
plt.figure()
# The C values span several orders of magnitude, so the curve is drawn
# against evenly spaced positions.  The positions are derived from cs so
# the list can be edited freely.
c_positions = range(len(cs))
plot_validation_curve(c_positions, training_scores, test_scores)
# Label each position with its C value.
# NOTE(review): assumes plot_validation_curve draws on the current axes.
plt.xticks(c_positions, cs)


# Validation curve of k-nearest-neighbours over n_neighbors = 1..9.
ks = range(1, 10)
training_scores, test_scores = validation_curve(KNeighborsClassifier(), X, y,
                                                param_name="n_neighbors", param_range=ks)
plt.figure()
plot_validation_curve(ks, training_scores, test_scores)
20 |
--------------------------------------------------------------------------------