├── .DS_Store
├── .gitignore
├── LICENSE
├── README.md
├── sem2-classify&generate
├── 1_my_first_nn_lasagne
├── 1_my_first_nn_lsagne.ipynb
├── 2_ae_complete.ipynb
├── 2_ae_with_gaps.ipynb
├── 3_vae_complete.ipynb
├── 3_vae_with_gaps.ipynb
├── 4_ss_vae.ipynb
├── mnist.py
└── utils.py
├── sem3-attention
├── Attention_seminar (Start here).ipynb
├── Captioning_seminar.ipynb
├── attention_part1_solution.ipynb
├── data
│ ├── Dog-and-Cat.jpg
│ └── svd.pcl
└── pretrained_lenet.py
├── sem4-GP
├── 1_GP_basics.ipynb
├── 1_GP_basics_filled.ipynb
├── 2_BayesOpt.ipynb
├── 2_BayesOpt_filled.ipynb
├── 2d_demo.mp4
├── 3_LargeScaleGP.ipynb
├── 3_LargeScaleGP_filled.ipynb
├── EI_vs_logEI.png
├── airline.mat
├── airline_result.png
├── bayes_opt.py
└── house_pricing.csv
└── sem5-gan
└── seminar.ipynb
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # deepbayes2017
--------------------------------------------------------------------------------
/sem2-classify&generate/1_my_first_nn_lasagne:
--------------------------------------------------------------------------------
1 | Список мест с критическими ошибками в первом ноутбуке:
2 | - Размер фильтров первого сверточного слоя
3 | - Процедуры инициализации параметров сети
4 | - Параметр слоя dropout
5 | - Нелинейность на выходе классификатора
6 | - Accuracy в качестве функции потерь при обучении
7 | - Momentum и learning_rate у оптимизатора
--------------------------------------------------------------------------------
/sem2-classify&generate/1_my_first_nn_lsagne.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
    7 | "Theano"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [],
17 | "source": [
18 | "!pip install numpy matplotlib \n",
19 | "!pip install --upgrade https://github.com/Theano/Theano/archive/master.zip\n",
20 | "!pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip"
21 | ]
22 | },
23 | {
24 | "cell_type": "markdown",
25 | "metadata": {},
26 | "source": [
27 | "### Разминка"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "collapsed": false
35 | },
36 | "outputs": [],
37 | "source": [
38 | "import theano\n",
39 | "import theano.tensor as T\n",
40 | "\n",
41 | "%pylab inline"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "#### будущий параметр функции -- символьная переменная"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "N = T.scalar('a dimension', dtype='float32')"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
   66 | "#### рецепт получения квадрата -- операции над символьными переменными"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": true
74 | },
75 | "outputs": [],
76 | "source": [
77 | "result = T.power(N, 2)"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "#### theano.grad(cost, wrt)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "grad_result = theano.grad(result, N) "
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "#### компиляция функции \"получения квадрата\""
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {
109 | "collapsed": true
110 | },
111 | "outputs": [],
112 | "source": [
113 | "sq_function = theano.function(inputs=[N], outputs=result)\n",
114 | "gr_function = theano.function(inputs=[N], outputs=grad_result)"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "#### применение функции"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {
128 | "collapsed": true
129 | },
130 | "outputs": [],
131 | "source": [
132 | "# Заводим np.array x\n",
133 | "xv = np.arange(-10, 10)\n",
134 | "\n",
135 | "# Применяем функцию к каждому x\n",
136 | "val = map(float, [sq_function(x) for x in xv])\n",
137 | "\n",
  138 | "# Посчитаем градиент в каждой точке\n",
139 | "grad = map(float, [gr_function(x) for x in xv])"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "metadata": {},
145 | "source": [
146 | "### Что мы увидим если нарисуем функцию и градиент?"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [],
156 | "source": [
157 | "pylab.plot(xv, val, label='x*x')\n",
158 | "pylab.plot(xv, grad, label='d x*x / dx')\n",
159 | "pylab.legend()"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "Lasagne \n",
167 | "\n",
168 | "* lasagne - это библиотека для написания нейронок произвольной формы на theano\n",
169 | "* В качестве демо-задачи выберем то же распознавание чисел, но на большем масштабе задачи, картинки 28x28, 10 цифр"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [],
179 | "source": [
180 | "from mnist import load_dataset\n",
181 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()\n",
182 | "\n",
183 | "print 'X размера', X_train.shape, 'y размера', y_train.shape"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {
190 | "collapsed": false
191 | },
192 | "outputs": [],
193 | "source": [
194 | "fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(20, 20))\n",
195 | "\n",
196 | "for i, ax in enumerate(axes):\n",
197 | " ax.imshow(X_train[i, 0], cmap='gray')"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "Давайте посмотрим на DenseLayer в lasagne\n",
205 | "- http://lasagne.readthedocs.io/en/latest/modules/layers/dense.html\n",
206 | "- https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/dense.py#L16-L124 \n",
  207 | "- Весь содержательный код тут https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/dense.py#L121 "
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {
214 | "collapsed": true
215 | },
216 | "outputs": [],
217 | "source": [
218 | "import lasagne\n",
219 | "from lasagne import init\n",
220 | "from theano import tensor as T\n",
221 | "from lasagne.nonlinearities import softmax\n",
222 | "\n",
223 | "X, y = T.tensor4('X'), T.vector('y', 'int32')"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "Так задаётся архитектура нейронки"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {
237 | "collapsed": true
238 | },
239 | "outputs": [],
240 | "source": [
241 | "#входной слой (вспомогательный)\n",
242 | "net = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=X)\n",
243 | "\n",
244 | "net = lasagne.layers.Conv2DLayer(net, 15, 28, pad='valid', W=init.Constant()) # сверточный слой\n",
245 | "net = lasagne.layers.Conv2DLayer(net, 10, 2, pad='full', W=init.Constant()) # сверточный слой\n",
246 | "\n",
247 | "net = lasagne.layers.DenseLayer(net, num_units=500) # полносвязный слой\n",
248 | "net = lasagne.layers.DropoutLayer(net, 1.0) # регуляризатор\n",
249 | "net = lasagne.layers.DenseLayer(net, num_units=200) # полносвязный слой\n",
250 | "\n",
251 | "net = lasagne.layers.DenseLayer(net, num_units=10) # полносвязный слой"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {
258 | "collapsed": true
259 | },
260 | "outputs": [],
261 | "source": [
262 | "#предсказание нейронки (theano-преобразование)\n",
263 | "y_predicted = lasagne.layers.get_output(net)"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": null,
269 | "metadata": {
270 | "collapsed": false
271 | },
272 | "outputs": [],
273 | "source": [
274 | "#все веса нейронки (shared-переменные)\n",
275 | "all_weights = lasagne.layers.get_all_params(net)\n",
276 | "print all_weights"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "collapsed": true
284 | },
285 | "outputs": [],
286 | "source": [
287 | "#функция ошибки и точности будет прямо внутри\n",
288 | "loss = lasagne.objectives.categorical_accuracy(y_predicted, y).mean()\n",
289 | "accuracy = lasagne.objectives.categorical_accuracy(y_predicted, y).mean()"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {
296 | "collapsed": true
297 | },
298 | "outputs": [],
299 | "source": [
300 | "#сразу посчитать словарь обновлённых значений с шагом по градиенту, как раньше\n",
301 | "updates = lasagne.updates.momentum(loss, all_weights, learning_rate=1.0, momentum=1.5)"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "metadata": {
308 | "collapsed": true
309 | },
310 | "outputs": [],
311 | "source": [
  312 | "#функция, делает updates и возвращает значение функции потерь и точности\n",
313 | "train_fun = theano.function([X, y], [loss, accuracy], updates=updates)\n",
314 | "accuracy_fun = theano.function([X, y], accuracy) # точность без обновления весов, для теста"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "# Процесс обучения"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "metadata": {
328 | "collapsed": false,
329 | "scrolled": false
330 | },
331 | "outputs": [],
332 | "source": [
333 | "import time \n",
334 | "from mnist import iterate_minibatches\n",
335 | "\n",
336 | "num_epochs = 5 #количество проходов по данным\n",
337 | "batch_size = 50 #размер мини-батча\n",
338 | "\n",
339 | "for epoch in range(num_epochs):\n",
340 | " train_err, train_acc, train_batches, start_time = 0, 0, 0, time.time()\n",
341 | " for inputs, targets in iterate_minibatches(X_train, y_train, batch_size):\n",
342 | " train_err_batch, train_acc_batch = train_fun(inputs, targets)\n",
343 | " train_err += train_err_batch\n",
344 | " train_acc += train_acc_batch\n",
345 | " train_batches += 1\n",
346 | "\n",
347 | " val_acc, val_batches = 0, 0\n",
348 | " for inputs, targets in iterate_minibatches(X_test, y_test, batch_size):\n",
349 | " val_acc += accuracy_fun(inputs, targets)\n",
350 | " val_batches += 1\n",
351 | "\n",
352 | " \n",
353 | " print \"Epoch %s of %s took %.3f s\" % (epoch + 1, num_epochs, time.time() - start_time)\n",
354 | " print \" train loss:\\t %.3f\" % (train_err / train_batches)\n",
355 | " print \" train acc:\\t %.3f\" % (train_acc * 100 / train_batches), '%'\n",
356 | " print \" test acc:\\t %.3f\" % (val_acc * 100 / val_batches), '%'\n",
357 | " print"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {
364 | "collapsed": true
365 | },
366 | "outputs": [],
367 | "source": [
368 | "test_acc = 0\n",
369 | "test_batches = 0\n",
370 | "for batch in iterate_minibatches(X_test, y_test, 500):\n",
371 | " inputs, targets = batch\n",
372 | " acc = accuracy_fun(inputs, targets)\n",
373 | " test_acc += acc\n",
374 | " test_batches += 1\n",
375 | "print(\"Final results: \\n test accuracy:\\t\\t{:.2f} %\".format(test_acc / test_batches * 100))"
376 | ]
377 | },
378 | {
379 | "cell_type": "markdown",
380 | "metadata": {},
381 | "source": [
382 | "# Ансамблирование с DropOut"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {
389 | "collapsed": true
390 | },
391 | "outputs": [],
392 | "source": [
393 | "#предсказание нейронки (theano-преобразование)\n",
394 | "y_predicted = T.mean([lasagne.layers.get_output(net, deterministic=False) for i in range(10)], axis=0)\n",
395 | "accuracy = lasagne.objectives.categorical_accuracy(y_predicted, y).mean()\n",
396 | "accuracy_fun = theano.function([X, y], accuracy) # точность без обновления весов, для теста"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {
403 | "collapsed": true
404 | },
405 | "outputs": [],
406 | "source": [
407 | "test_acc = 0\n",
408 | "test_batches = 0\n",
409 | "for batch in iterate_minibatches(X_test, y_test, 500):\n",
410 | " inputs, targets = batch\n",
411 | " acc = accuracy_fun(inputs, targets)\n",
412 | " test_acc += acc\n",
413 | " test_batches += 1\n",
414 | "print(\"Final results: \\n test accuracy:\\t\\t{:.2f} %\".format(test_acc / test_batches * 100))"
415 | ]
416 | }
417 | ],
418 | "metadata": {
419 | "anaconda-cloud": {},
420 | "kernelspec": {
421 | "display_name": "Python 2",
422 | "language": "python",
423 | "name": "python2"
424 | },
425 | "language_info": {
426 | "codemirror_mode": {
427 | "name": "ipython",
428 | "version": 2
429 | },
430 | "file_extension": ".py",
431 | "mimetype": "text/x-python",
432 | "name": "python",
433 | "nbconvert_exporter": "python",
434 | "pygments_lexer": "ipython2",
435 | "version": "2.7.10"
436 | }
437 | },
438 | "nbformat": 4,
439 | "nbformat_minor": 1
440 | }
441 |
--------------------------------------------------------------------------------
/sem2-classify&generate/2_ae_complete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Автокодировщик"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "collapsed": false,
18 | "deletable": true,
19 | "editable": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import time\n",
24 | "\n",
25 | "import numpy as np\n",
26 | "import theano\n",
27 | "import theano.tensor as T\n",
28 | "import lasagne\n",
29 | "\n",
30 | "import matplotlib.pylab as plt\n",
31 | "from utils import load_dataset, iterate_minibatches\n",
32 | "%matplotlib inline\n",
33 | "\n",
34 | "BATCH_SIZE = 20\n",
35 | "HIDDEN_DIM = 2\n",
36 | "\n",
37 | "num_epochs = 128\n",
38 | "\n",
39 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "deletable": true,
46 | "editable": true
47 | },
48 | "source": [
49 | "## Обучение модели\n",
50 | "\n",
51 | "tl;dr: Автокодировщик может быть использован для построения маломерных признаков данных без разметки.\n",
52 | "\n",
53 | "В процессе обучения строится пара отображений $E: \\mathbb R^D \\rightarrow R^d$ (кодировщик) и $D: \\mathbb R^d \\rightarrow R^D$ (декодировщик), чья композиция приближает тождественное отображение:\n",
54 | "\n",
55 | "$$ D(E(x)) \\approx x $$"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {
62 | "collapsed": true,
63 | "deletable": true,
64 | "editable": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "# Определим кодировщик и декодировщик с помощью пары полносвязных нейронных сетей\n",
69 | "\n",
70 | "def ae_encoder(input_var):\n",
71 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var)\n",
72 | " ######################################################################################\n",
73 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую HIDDEN_DIM-мерный код #\n",
74 | " # Какие функции активации можно поставить на выход сети? #\n",
75 | " ######################################################################################\n",
76 | " l_hid1 = lasagne.layers.DenseLayer(\n",
77 | " l_in, num_units=128,\n",
78 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
79 | " W=lasagne.init.GlorotUniform(),\n",
80 | " name='e_hid1')\n",
81 | " l_hid2 = lasagne.layers.DenseLayer(\n",
82 | " l_hid1, num_units=64,\n",
83 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
84 | " name='e_hid2')\n",
85 | " l_out = lasagne.layers.DenseLayer(\n",
86 | " l_hid2, num_units=HIDDEN_DIM,\n",
87 | " nonlinearity=None,\n",
88 | " name='e_out')\n",
89 | " return l_out\n",
90 | "\n",
91 | "\n",
92 | "def ae_decoder(input_var):\n",
93 | " l_in = lasagne.layers.InputLayer(shape=(None, HIDDEN_DIM), input_var=input_var)\n",
94 | " ##################################################################################################\n",
95 | " # Реализуйте некоторую несложную архитектуру декодировщика, возвращающую батч объектов размера (1, 28, 28) #\n",
96 | " ##################################################################################################\n",
97 | " l_hid1 = lasagne.layers.DenseLayer(\n",
98 | " l_in, num_units=64,\n",
99 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
100 | " W=lasagne.init.GlorotUniform(),\n",
101 | " name='d_hid1')\n",
102 | " l_hid2 = lasagne.layers.DenseLayer(\n",
103 | " l_hid1, num_units=128,\n",
104 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
105 | " name='d_hid2')\n",
106 | " l_out = lasagne.layers.DenseLayer(\n",
107 | " l_hid2, num_units=28 * 28,\n",
108 | " nonlinearity=lasagne.nonlinearities.sigmoid,\n",
109 | " name='d_out')\n",
110 | " l_out = lasagne.layers.reshape(l_out, shape=(-1, 1, 28, 28))\n",
111 | " return l_out"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": true,
119 | "deletable": true,
120 | "editable": true
121 | },
122 | "outputs": [],
123 | "source": [
124 | "# Инициализируем сеть\n",
125 | "input_x = T.tensor4('input_x')\n",
126 | " \n",
127 | "encoder = ae_encoder(input_x)\n",
128 | "decoder = ae_decoder(\n",
129 | " lasagne.layers.get_output(encoder)\n",
130 | ")"
131 | ]
132 | },
133 | {
134 | "cell_type": "markdown",
135 | "metadata": {
136 | "deletable": true,
137 | "editable": true
138 | },
139 | "source": [
140 | "Для обучения автокодировщика будем использовать среднеквадратичную ошибку\n",
141 | "\n",
142 | "$$ L(X) = \\frac{1}{N}\\sum_{i=1}^{N} \\sum_{j=1}^{28^2} \\left( D(E(x_i))_j - x_{i,j} \\right)^2 = \\frac{1}{N}\\sum_{i=1}^{N} (D(E(x_i)) - x_i)^T (D(E(x_i)) - x_i) $$"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "collapsed": false,
150 | "deletable": true,
151 | "editable": true
152 | },
153 | "outputs": [],
154 | "source": [
155 | "#####################################################################################\n",
156 | "# Определите операцию для вычисления функции потерь, а также создайте список параметров модели #\n",
157 | "# для передачи в оптимизатор #\n",
158 | "loss = lasagne.objectives.squared_error(\n",
159 | " lasagne.layers.get_output(decoder), input_x\n",
160 | ").sum(axis=(1, 2, 3)).mean()\n",
161 | "params = lasagne.layers.get_all_params([encoder, decoder])\n",
162 | "#####################################################################################\n",
163 | "\n",
164 | "updates = lasagne.updates.adam(loss, params)\n",
165 | " \n",
166 | "train = theano.function(\n",
167 | " [input_x],\n",
168 | " loss,\n",
169 | " updates=updates\n",
170 | ")\n",
171 | "test_loss = theano.function(\n",
172 | " [input_x],\n",
173 | " loss\n",
174 | ")"
175 | ]
176 | },
177 | {
178 | "cell_type": "markdown",
179 | "metadata": {
180 | "deletable": true,
181 | "editable": true
182 | },
183 | "source": [
184 | "Обучение, как и во многих других случаях, выполняется с помощью стохастического градиентного спуска"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "collapsed": false,
192 | "deletable": true,
193 | "editable": true
194 | },
195 | "outputs": [],
196 | "source": [
197 | "for epoch in range(num_epochs):\n",
198 | " train_err = 0\n",
199 | " train_batches = 0\n",
200 | " start_time = time.time()\n",
201 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n",
202 | " train_err += train(batch)\n",
203 | " train_batches += 1\n",
204 | " \n",
205 | " test_err = 0\n",
206 | " test_batches = 0\n",
207 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n",
208 | " test_err += test_loss(batch)\n",
209 | " test_batches += 1\n",
210 | " \n",
211 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n",
212 | " epoch + 1, num_epochs, time.time() - start_time))\n",
213 | " print(\"Train error {}\".format(train_err/train_batches))\n",
214 | " print(\"Test error {}\".format(test_err/test_batches))"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {
220 | "deletable": true,
221 | "editable": true
222 | },
223 | "source": [
224 | "## Визуализация\n",
225 | "\n",
226 | "Модель с двумерными скрытыми переменными легко визуализировать. Определим две функции: одну для построения пропущенных через автокодировщик изображений, вторую для вычисления скрытых представлений по изображению"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": false,
234 | "deletable": true,
235 | "editable": true
236 | },
237 | "outputs": [],
238 | "source": [
239 | "from utils import plot_reconstructions, plot_hidden_space\n",
240 | "\n",
241 | "reconstruct = theano.function(\n",
242 | " [input_x],\n",
243 | " lasagne.layers.get_output(decoder)\n",
244 | ")\n",
245 | "\n",
246 | "encode = theano.function(\n",
247 | " [input_x],\n",
248 | " lasagne.layers.get_output(encoder)\n",
249 | ")"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {
255 | "deletable": true,
256 | "editable": true
257 | },
258 | "source": [
259 | "Примеры изображений, пропущенных через автокодировщик: "
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "collapsed": false,
267 | "deletable": true,
268 | "editable": true
269 | },
270 | "outputs": [],
271 | "source": [
272 | "plot_reconstructions(X_test, reconstruct)"
273 | ]
274 | },
275 | {
276 | "cell_type": "markdown",
277 | "metadata": {
278 | "deletable": true,
279 | "editable": true
280 | },
281 | "source": [
282 | "Визуализация признакового пространства. Насколько пространство простое? Везде ли оно плотно? Как выбрать точку в этом пространстве, которая будет соответствовать коду какого-то объекта?"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "collapsed": false,
290 | "deletable": true,
291 | "editable": true
292 | },
293 | "outputs": [],
294 | "source": [
295 | "plot_hidden_space(X_test[:1000], encode)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {
301 | "deletable": true,
302 | "editable": true
303 | },
304 | "source": [
305 | "Попробуйте погенерировать изображения по паре координат"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "collapsed": true,
313 | "deletable": true,
314 | "editable": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "input_z = T.matrix('input_z')\n",
319 | "\n",
320 | "decode_a_code = theano.function(\n",
321 | " [input_z],\n",
322 | " lasagne.layers.get_output(decoder, input_z),\n",
323 | ")\n",
324 | "\n",
325 | "def generate_from_code(x, y):\n",
326 | " img = decode_a_code([[x, y]]).reshape((28, 28))\n",
327 | " plt.imshow(img, 'gray')"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "collapsed": false,
335 | "deletable": true,
336 | "editable": true
337 | },
338 | "outputs": [],
339 | "source": [
340 | "generate_from_code(50., 20.)"
341 | ]
342 | }
343 | ],
344 | "metadata": {
345 | "kernelspec": {
346 | "display_name": "Python 3",
347 | "language": "python",
348 | "name": "python3"
349 | },
350 | "language_info": {
351 | "codemirror_mode": {
352 | "name": "ipython",
353 | "version": 3
354 | },
355 | "file_extension": ".py",
356 | "mimetype": "text/x-python",
357 | "name": "python",
358 | "nbconvert_exporter": "python",
359 | "pygments_lexer": "ipython3",
360 | "version": "3.5.1"
361 | }
362 | },
363 | "nbformat": 4,
364 | "nbformat_minor": 2
365 | }
366 |
--------------------------------------------------------------------------------
/sem2-classify&generate/2_ae_with_gaps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Автокодировщик"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "collapsed": false,
18 | "deletable": true,
19 | "editable": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import time\n",
24 | "\n",
25 | "import numpy as np\n",
26 | "import theano\n",
27 | "import theano.tensor as T\n",
28 | "import lasagne\n",
29 | "\n",
30 | "import matplotlib.pylab as plt\n",
31 | "from utils import load_dataset, iterate_minibatches\n",
32 | "%matplotlib inline\n",
33 | "\n",
34 | "BATCH_SIZE = 20\n",
35 | "HIDDEN_DIM = 2\n",
36 | "\n",
37 | "num_epochs = 40\n",
38 | "\n",
39 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {
45 | "deletable": true,
46 | "editable": true
47 | },
48 | "source": [
49 | "## Обучение модели\n",
50 | "\n",
51 | "tl;dr: Автокодировщик может быть использован для построения маломерных признаков данных без разметки.\n",
52 | "\n",
53 | "В процессе обучения строится пара отображений $E: \\mathbb R^D \\rightarrow \\mathbb R^d$ (кодировщик) и $D: \\mathbb R^d \\rightarrow \\mathbb R^D$ (декодировщик), чья композиция приближает тождественное отображение:\n",
54 | "\n",
55 | "$$ D(E(x)) \\approx x $$"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "metadata": {
62 | "collapsed": true,
63 | "deletable": true,
64 | "editable": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "# Определим кодировщик и декодировщик с помощью пары полносвязных нейронных сетей\n",
69 | "\n",
70 | "def ae_encoder(input_var):\n",
71 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var)\n",
72 | " ######################################################################################\n",
73 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую HIDDEN_DIM-мерный код #\n",
74 | " # Какие функции активации можно поставить на выход сети? #\n",
75 | " ######################################################################################\n",
76 | " return l_out\n",
77 | "\n",
78 | "\n",
79 | "def ae_decoder(input_var):\n",
80 | " l_in = lasagne.layers.InputLayer(shape=(None, HIDDEN_DIM), input_var=input_var)\n",
81 | " ##################################################################################################\n",
82 | " # Реализуйте некоторую несложную архитектуру декодировщика, возвращающую батч объектов размера (1, 28, 28) #\n",
83 | " ##################################################################################################\n",
84 | " return l_out"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "collapsed": true,
92 | "deletable": true,
93 | "editable": true
94 | },
95 | "outputs": [],
96 | "source": [
97 | "# Инициализируем сеть\n",
98 | "input_x = T.tensor4('input_x')\n",
99 | " \n",
100 | "encoder = ae_encoder(input_x)\n",
101 | "decoder = ae_decoder(\n",
102 | " lasagne.layers.get_output(encoder)\n",
103 | ")"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {
109 | "deletable": true,
110 | "editable": true
111 | },
112 | "source": [
113 | "Для обучения автокодировщика будем использовать среднеквадратичную ошибку\n",
114 | "\n",
115 | "$$ L(X) = \\frac{1}{N}\\sum_{i=1}^{N} \\sum_{j=1}^{28^2} \\left( D(E(x_i))_j - x_{i,j} \\right)^2 = \\frac{1}{N}\\sum_{i=1}^{N} (D(E(x_i)) - x_i)^T (D(E(x_i)) - x_i) $$"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "collapsed": false,
123 | "deletable": true,
124 | "editable": true
125 | },
126 | "outputs": [],
127 | "source": [
128 | "#####################################################################################\n",
129 | "# Определите операцию для вычисления функции потерь, а также создайте список параметров модели #\n",
130 | "# для передачи в оптимизатор #\n",
131 | "loss = None\n",
132 | "params = None\n",
133 | "#####################################################################################\n",
134 | "\n",
135 | "updates = lasagne.updates.adam(loss, params)\n",
136 | " \n",
137 | "train = theano.function(\n",
138 | " [input_x],\n",
139 | " loss,\n",
140 | " updates=updates\n",
141 | ")\n",
142 | "test_loss = theano.function(\n",
143 | " [input_x],\n",
144 | " loss\n",
145 | ")"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {
151 | "deletable": true,
152 | "editable": true
153 | },
154 | "source": [
155 | "Обучение, как и во многих других случаях, выполняется с помощью стохастического градиентного спуска"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "collapsed": false,
163 | "deletable": true,
164 | "editable": true
165 | },
166 | "outputs": [],
167 | "source": [
168 | "for epoch in range(num_epochs):\n",
169 | " train_err = 0\n",
170 | " train_batches = 0\n",
171 | " start_time = time.time()\n",
172 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n",
173 | " train_err += train(batch)\n",
174 | " train_batches += 1\n",
175 | " \n",
176 | " test_err = 0\n",
177 | " test_batches = 0\n",
178 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n",
179 | " test_err += test_loss(batch)\n",
180 | " test_batches += 1\n",
181 | " \n",
182 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n",
183 | " epoch + 1, num_epochs, time.time() - start_time))\n",
184 | " print(\"Train error {}\".format(train_err/train_batches))\n",
185 | " print(\"Test error {}\".format(test_err/test_batches))"
186 | ]
187 | },
188 | {
189 | "cell_type": "markdown",
190 | "metadata": {
191 | "deletable": true,
192 | "editable": true
193 | },
194 | "source": [
195 | "## Визуализация\n",
196 | "\n",
197 | "Модель с двумерными скрытыми переменными легко визуализировать. Определим две функции: одну для построения пропущенных через автокодировщик изображений, вторую для вычисления скрытых представлений по изображению"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {
204 | "collapsed": false,
205 | "deletable": true,
206 | "editable": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "from utils import plot_reconstructions, plot_hidden_space\n",
211 | "\n",
212 | "reconstruct = theano.function(\n",
213 | " [input_x],\n",
214 | " lasagne.layers.get_output(decoder)\n",
215 | ")\n",
216 | "\n",
217 | "encode = theano.function(\n",
218 | " [input_x],\n",
219 | " lasagne.layers.get_output(encoder)\n",
220 | ")"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {
226 | "deletable": true,
227 | "editable": true
228 | },
229 | "source": [
230 | "Примеры изображений, пропущенных через автокодировщик: "
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {
237 | "collapsed": false,
238 | "deletable": true,
239 | "editable": true
240 | },
241 | "outputs": [],
242 | "source": [
243 | "plot_reconstructions(X_test, reconstruct)"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {
249 | "deletable": true,
250 | "editable": true
251 | },
252 | "source": [
253 | "Визуализация признакового пространства. Насколько пространство простое? Везде ли оно плотно? Как выбрать точку в этом пространстве, которая будет соответствовать коду какого-то объекта?"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "collapsed": false,
261 | "deletable": true,
262 | "editable": true
263 | },
264 | "outputs": [],
265 | "source": [
266 | "plot_hidden_space(X_test[:1000], encode)"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {
272 | "deletable": true,
273 | "editable": true
274 | },
275 | "source": [
276 | "Попробуйте погенерировать изображения по паре координат"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "collapsed": true,
284 | "deletable": true,
285 | "editable": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "input_z = T.matrix('input_z')\n",
290 | "\n",
291 | "decode_a_code = theano.function(\n",
292 | " [input_z],\n",
293 | " lasagne.layers.get_output(decoder, input_z),\n",
294 | ")\n",
295 | "\n",
296 | "def generate_from_code(x, y):\n",
297 | " img = decode_a_code([[x, y]]).reshape((28, 28))\n",
298 | " plt.imshow(img, 'gray')"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "collapsed": false,
306 | "deletable": true,
307 | "editable": true
308 | },
309 | "outputs": [],
310 | "source": [
311 | "generate_from_code(50., 20.)"
312 | ]
313 | }
314 | ],
315 | "metadata": {
316 | "kernelspec": {
317 | "display_name": "Python 3",
318 | "language": "python",
319 | "name": "python3"
320 | },
321 | "language_info": {
322 | "codemirror_mode": {
323 | "name": "ipython",
324 | "version": 3
325 | },
326 | "file_extension": ".py",
327 | "mimetype": "text/x-python",
328 | "name": "python",
329 | "nbconvert_exporter": "python",
330 | "pygments_lexer": "ipython3",
331 | "version": "3.5.1"
332 | }
333 | },
334 | "nbformat": 4,
335 | "nbformat_minor": 2
336 | }
337 |
--------------------------------------------------------------------------------
/sem2-classify&generate/3_vae_complete.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Вариационный автокодировщик\n",
11 | "\n",
12 | "tl;dr: Вместо тождественного отображения вариационный автокодировщик выучивает вероятностную модель данных. Стохастические вычисления и априорное распределение кодов дополнительно регуляризуют модель."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {
19 | "collapsed": false,
20 | "deletable": true,
21 | "editable": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import time\n",
26 | "\n",
27 | "import numpy as np\n",
28 | "import theano\n",
29 | "import theano.tensor as T\n",
30 | "import lasagne\n",
31 | "\n",
32 | "import matplotlib.pylab as plt\n",
33 | "%matplotlib inline\n",
34 | "\n",
35 | "from utils import load_dataset, iterate_minibatches\n",
36 | "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n",
37 | "\n",
38 | "BATCH_SIZE = 20\n",
39 | "HIDDEN_DIM = 2\n",
40 | "\n",
41 | "num_epochs = 128\n",
42 | "\n",
43 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {
49 | "deletable": true,
50 | "editable": true
51 | },
52 | "source": [
53 | "## Кратко о вариационных автокодировщиках\n",
54 | "\n",
55 | "Рассмотрим вариационный автокодировщик для бинарных наблюдений. Вариационный автокодировщик состоит из генеративной модели наблюдений\n",
56 | "\n",
57 | "\\begin{align}\n",
58 | "& p(x, z | \\theta) = p(x | z, \\theta) p(z) \\\\\n",
59 | "& p(x | z, \\theta) = \\prod_{i = 1}^D p_i(z, \\theta)^{x_i} (1 - p_i(z, \\theta))^{1 - x_i} \\\\\n",
60 | "& p(z) = \\mathcal N(z | 0, I)\n",
61 | "\\end{align}\n",
62 | "\n",
63 | "и приближенного апостериорного распределения\n",
64 | "\n",
65 | "\\begin{equation}\n",
66 | "q(z | x, \\phi) = \\mathcal N(z | \\mu(x, \\phi), \\operatorname{diag}(\\sigma^2(x, \\phi)))\n",
67 | "\\end{equation}\n",
68 | "\n",
69 | "Для краткости все выкладки приводятся для одного наблюдения $x$, параметры распределений по возможности опускаются. Для набора данных при обучении используется среднее значение нижней оценки. Цель обучения - максимизировать нижнюю оценку на обоснованность\n",
70 | "\n",
71 | "$$ \\mathcal L(x, \\theta, \\phi) = \\mathbb E_{q(z | x, \\phi)} \\log p(x | z, \\theta) - \\operatorname{KL}(q(z | x, \\phi) || p(z)) = \\mathbb E_{q(z | x, \\phi)} \\log \\frac{p(x | z, \\theta)p(z)}{q(z | x, \\phi)} \\rightarrow \\max_{\\theta, \\phi} $$\n",
72 | "\n",
73 | "Как было рассказано на лекции, на практике нижняя оценка приближается оценкой \n",
74 | "\n",
75 | "\\begin{align*}\n",
76 | "&\\frac{1}{K} \\sum_{k=1}^K \\log \\frac{p(x | z_k)p(z_k)}{q(z_k | x)} \\\\\n",
77 | "& \\\\\n",
78 | "&z_k = \\mu(x, \\phi) + \\sigma(x, \\phi) \\odot \\varepsilon_k \\\\\n",
79 | "&\\varepsilon_k \\sim \\mathcal N(0, I), iid\n",
80 | "\\end{align*}\n",
81 | "\n",
82 | "с K=1, а затем максимизируется с помощью градиентного подъема.\n",
83 | "\n",
84 | "## Как это реализовать?\n",
85 | "\n",
86 | "Для вычисления приведенной выше нижней оценки необходимо уметь:\n",
87 | "1. Вычислять логарифм плотности всех распределений ($p(x | z)$, $p(z)$, $q(z | x)$)\n",
88 | "2. Сэмплировать из $q(z | x)$\n",
89 | "\n",
90 | "Следуя практике *tensorflow.distributions*, мы реализуем распределения как два класса с методами *log_prob()* и *sample()*"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {
97 | "collapsed": true,
98 | "deletable": true,
99 | "editable": true
100 | },
101 | "outputs": [],
102 | "source": [
103 | "class BinaryVector():\n",
104 | " def __init__(self, logits, rng=None):\n",
105 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n",
106 | " self.logits = logits\n",
107 | "\n",
108 | " def log_prob(self, x):\n",
109 | " # возвращает вектор вероятностей для каждого объекта в батче\n",
110 | " pixelwise_log_probs = (\n",
111 | " x * (self.logits - T.nnet.softplus(self.logits))\n",
112 | " - (1 - x) * T.nnet.softplus(self.logits)\n",
113 | " )\n",
114 | " return T.sum(pixelwise_log_probs, axis=(1, 2, 3))\n",
115 | " \n",
116 | " def sample(self):\n",
117 | " shape = self.logits.shape\n",
118 | " return T.nnet.sigmoid(self.logits) >= self.rng.uniform(shape)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {
125 | "collapsed": true,
126 | "deletable": true,
127 | "editable": true
128 | },
129 | "outputs": [],
130 | "source": [
131 | "class MultivariateNormalDiag():\n",
132 | " def __init__(self, loc=None, scale=None, rng=None):\n",
133 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n",
134 | " self.loc= loc\n",
135 | " self.scale = scale\n",
136 | " \n",
137 | " def log_prob(self, z):\n",
138 | " normalization_constant = (\n",
139 | " - 0.5 * np.log(2 * np.pi)\n",
140 | " - T.log(self.scale)\n",
141 | " )\n",
142 | " square_term = -0.5 * ((z - self.loc) / self.scale) ** 2\n",
143 | " log_prob_vec = normalization_constant + square_term\n",
144 | " return T.sum(log_prob_vec, axis=1)\n",
145 | " \n",
146 | " def sample(self):\n",
147 | " ######################################################################\n",
148 | "    # Сэмплирование из q(z | x) - ключевой момент в вариационном автокодировщике #\n",
149 | " # Пользуясь методом self.rng.normal() реализуйте её самостоятельно #\n",
150 | " ######################################################################\n",
151 | " shape = self.loc.shape\n",
152 | " z = (self.loc + self.scale * self.rng.normal(shape))\n",
153 | " return z"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {
159 | "deletable": true,
160 | "editable": true
161 | },
162 | "source": [
163 | "Для параметров распределений построим две сети. Обратите внимание, что кодировщик теперь возвращает и код, и параметр масштаба"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": null,
169 | "metadata": {
170 | "collapsed": true,
171 | "deletable": true,
172 | "editable": true
173 | },
174 | "outputs": [],
175 | "source": [
176 | "def vae_encoder_mlp(input_x):\n",
177 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),\n",
178 | " input_var=input_x)\n",
179 | " ######################################################################################\n",
180 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую вектор среднего и вектор #\n",
181 | " # стандартных отклонений. Их размерность должны быть HIDDEN_DIM. Какие функции активаций ну-#\n",
182 | " # жно использовать? #\n",
183 | " ######################################################################################\n",
184 | " l_hid1 = lasagne.layers.DenseLayer(\n",
185 | " l_in, num_units=128,\n",
186 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
187 | " W=lasagne.init.GlorotUniform(),\n",
188 | " name='e_hid1')\n",
189 | " l_hid2 = lasagne.layers.DenseLayer(\n",
190 | " l_hid1, num_units=64,\n",
191 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
192 | " name='e_hid2')\n",
193 | " l_out_loc = lasagne.layers.DenseLayer(\n",
194 | " l_hid2, num_units=2,\n",
195 | " nonlinearity=None,\n",
196 | " name='e_mean')\n",
197 | " l_out_scale = lasagne.layers.DenseLayer(\n",
198 | " l_hid2, num_units=2,\n",
199 | " nonlinearity=lasagne.nonlinearities.softplus,\n",
200 | " name='e_scale')\n",
201 | " return l_out_loc, l_out_scale\n",
202 | "\n",
203 | "def vae_decoder_mlp(input_z):\n",
204 | " l_in = lasagne.layers.InputLayer(shape=(None, 2),\n",
205 | " input_var=input_z)\n",
206 | " l_hid1 = lasagne.layers.DenseLayer(\n",
207 | " l_in, num_units=64,\n",
208 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
209 | " W=lasagne.init.GlorotUniform(),\n",
210 | " name='d_hid1')\n",
211 | " l_hid2 = lasagne.layers.DenseLayer(\n",
212 | " l_hid1, num_units=128,\n",
213 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
214 | " name='d_hid2')\n",
215 | " l_out = lasagne.layers.DenseLayer(\n",
216 | " l_hid2, num_units=28 ** 2,\n",
217 | " nonlinearity=None,\n",
218 | " name='d_out')\n",
219 | " l_out = lasagne.layers.ReshapeLayer(l_out, shape=(-1, 1, 28, 28))\n",
220 | " return l_out"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {
226 | "deletable": true,
227 | "editable": true
228 | },
229 | "source": [
230 | "## Строим граф вычислений \n",
231 | "\n",
232 | "Входы и модель вывода $q(z | x)$"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {
239 | "collapsed": false,
240 | "deletable": true,
241 | "editable": true
242 | },
243 | "outputs": [],
244 | "source": [
245 | "input_x = T.tensor4('inputs')\n",
246 | "#####################################################\n",
247 | "# Определите encoder_mean, encoder scale, затем #\n",
248 | "# определите объект для апостериорного распределения qz_x #\n",
249 | "####################################################\n",
250 | "encoder_mean, encoder_scale = vae_encoder_mlp(input_x)\n",
251 | "qz_x = MultivariateNormalDiag(\n",
252 | " lasagne.layers.get_output(encoder_mean), \n",
253 | " lasagne.layers.get_output(encoder_scale)\n",
254 | ")"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {
260 | "deletable": true,
261 | "editable": true
262 | },
263 | "source": [
264 | "Генеративная модель $p(x, z)$"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "metadata": {
271 | "collapsed": true,
272 | "deletable": true,
273 | "editable": true
274 | },
275 | "outputs": [],
276 | "source": [
277 | "###################################################################\n",
278 | "# Определите параметр p(x | z) decoder_logits, затем #\n",
279 | "# определите объекты pz распределения p(z) и px_z распределения p(x | z) #\n",
280 | "###################################################################\n",
281 | "z = qz_x.sample()\n",
282 | "decoder_logits = vae_decoder_mlp(z)\n",
283 | "pz = MultivariateNormalDiag(T.zeros((BATCH_SIZE, HIDDEN_DIM)),\n",
284 | " T.ones((BATCH_SIZE, HIDDEN_DIM)))\n",
285 | "px_z = BinaryVector(\n",
286 | " lasagne.layers.get_output(decoder_logits)\n",
287 | ")"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "metadata": {
293 | "deletable": true,
294 | "editable": true
295 | },
296 | "source": [
297 | "ELBO и правила для обновления весов"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {
304 | "collapsed": true,
305 | "deletable": true,
306 | "editable": true
307 | },
308 | "outputs": [],
309 | "source": [
310 | "########################################################################################\n",
311 | "# Пользуясь методами px_z, p_z, qz_x определите функцию потерь для вариационного автокодировщика #\n",
312 | "# При обучении значение функции потерь должно принимать значения порядка -100 (от -150 и выше) #\n",
313 | "# Создайте список параметров сети для передачи в оптимизатор #\n",
314 | "# Что использовать в качестве функции потерь? #\n",
315 | "elbo = T.mean(px_z.log_prob(input_x)\n",
316 | " + pz.log_prob(z)\n",
317 | " - qz_x.log_prob(z))\n",
318 | "params = lasagne.layers.get_all_params([encoder_mean,\n",
319 | " encoder_scale,\n",
320 | " decoder_logits])\n",
321 | "loss = -elbo\n",
322 | "########################################################################################\n",
323 | "updates = lasagne.updates.adam(loss, params)"
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {
329 | "deletable": true,
330 | "editable": true
331 | },
332 | "source": [
333 | "Определяем функции"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": false,
341 | "deletable": true,
342 | "editable": true
343 | },
344 | "outputs": [],
345 | "source": [
346 | "train = theano.function(\n",
347 | " [input_x],\n",
348 | " elbo,\n",
349 | " updates=updates\n",
350 | ")\n",
351 | "\n",
352 | "elbo_at_test = theano.function(\n",
353 | " [input_x],\n",
354 | " elbo\n",
355 | ")"
356 | ]
357 | },
358 | {
359 | "cell_type": "markdown",
360 | "metadata": {
361 | "deletable": true,
362 | "editable": true
363 | },
364 | "source": [
365 | "И обучаем модель"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "metadata": {
372 | "collapsed": false,
373 | "deletable": true,
374 | "editable": true
375 | },
376 | "outputs": [],
377 | "source": [
378 | "for epoch in range(num_epochs):\n",
379 | " train_elbo = 0\n",
380 | " train_batches = 0\n",
381 | " start_time = time.time()\n",
382 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n",
383 | " \"\"\"\n",
384 | "    Обратите внимание, что тут предложена вероятностная модель для бинарных данных.\n",
385 | " MNIST содержит черно-белые изображения с градациями серого.\n",
386 | " На практике при обучении автокодировщика получают бинарные данные, всякий раз положив случайно значение пикселя равным 0 или 1\n",
387 | " в зависимости от интенсивности пикселя в объекте из данных.\n",
388 | " Такой прием называется динамическая бинаризация, он эффективно расширяет обучающую выборку и приводит к лучшим значениям \n",
389 | " правдоподобия обученных моделей.\n",
390 | " \"\"\"\n",
391 | " batch = np.random.rand(*batch.shape) <= batch\n",
392 | " train_elbo += train(batch)\n",
393 | " train_batches += 1\n",
394 | " \n",
395 | " test_elbo = 0\n",
396 | " test_batches = 0\n",
397 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n",
398 | " batch = np.random.rand(*batch.shape) <= batch\n",
399 | " test_elbo += elbo_at_test(batch)\n",
400 | " test_batches += 1\n",
401 | " \n",
402 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n",
403 | " epoch + 1, num_epochs, time.time() - start_time))\n",
404 | " print(\"Train error {}\".format(train_elbo/train_batches))\n",
405 | " print(\"Test error {}\".format(test_elbo/test_batches))"
406 | ]
407 | },
408 | {
409 | "cell_type": "markdown",
410 | "metadata": {
411 | "deletable": true,
412 | "editable": true
413 | },
414 | "source": [
415 | "## Что получается? Визуализации"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {
422 | "collapsed": false,
423 | "deletable": true,
424 | "editable": true
425 | },
426 | "outputs": [],
427 | "source": [
428 | "from utils import plot_reconstructions, plot_hidden_space\n",
429 | "\n",
430 | "reconstruct = theano.function(\n",
431 | " [input_x],\n",
432 | " T.nnet.sigmoid(lasagne.layers.get_output(decoder_logits))\n",
433 | ")\n",
434 | "\n",
435 | "encode = theano.function(\n",
436 | " [input_x],\n",
437 | " qz_x.sample(),\n",
438 | ")"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {
444 | "deletable": true,
445 | "editable": true
446 | },
447 | "source": [
448 | "Визуализируем среднее распределения $p(x | z)$"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": null,
454 | "metadata": {
455 | "collapsed": false,
456 | "deletable": true,
457 | "editable": true
458 | },
459 | "outputs": [],
460 | "source": [
461 | "plot_reconstructions(X_test, reconstruct)"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {
467 | "deletable": true,
468 | "editable": true
469 | },
470 | "source": [
471 | "Чем отличается пространство представлений автокодировщика от пространства представлений вариационного автокодировщика? Почему возникло различие?"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {
478 | "collapsed": false,
479 | "deletable": true,
480 | "editable": true
481 | },
482 | "outputs": [],
483 | "source": [
484 | "plot_hidden_space(X_test[:1000], encode)"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": null,
490 | "metadata": {
491 | "collapsed": false,
492 | "deletable": true,
493 | "editable": true
494 | },
495 | "outputs": [],
496 | "source": [
497 | "# рисуем по 25 сэмплов кода для каждого объекта\n",
498 | "x_test_repeated = np.repeat(X_test[:25], repeats=25, axis=0)\n",
499 | "plot_hidden_space(x_test_repeated, encode)"
500 | ]
501 | }
502 | ],
503 | "metadata": {
504 | "kernelspec": {
505 | "display_name": "Python 3",
506 | "language": "python",
507 | "name": "python3"
508 | },
509 | "language_info": {
510 | "codemirror_mode": {
511 | "name": "ipython",
512 | "version": 3
513 | },
514 | "file_extension": ".py",
515 | "mimetype": "text/x-python",
516 | "name": "python",
517 | "nbconvert_exporter": "python",
518 | "pygments_lexer": "ipython3",
519 | "version": "3.5.1"
520 | }
521 | },
522 | "nbformat": 4,
523 | "nbformat_minor": 2
524 | }
525 |
--------------------------------------------------------------------------------
/sem2-classify&generate/3_vae_with_gaps.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Вариационный автокодировщик\n",
11 | "\n",
12 | "tl;dr: Вместо тождественного отображения вариационный автокодировщик выучивает вероятностную модель данных. Стохастические вычисления и априорное распределение кодов дополнительно регуляризуют модель."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": null,
18 | "metadata": {
19 | "collapsed": false,
20 | "deletable": true,
21 | "editable": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import time\n",
26 | "\n",
27 | "import numpy as np\n",
28 | "import theano\n",
29 | "import theano.tensor as T\n",
30 | "import lasagne\n",
31 | "\n",
32 | "import matplotlib.pylab as plt\n",
33 | "%matplotlib inline\n",
34 | "\n",
35 | "from utils import load_dataset, iterate_minibatches\n",
36 | "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n",
37 | "\n",
38 | "BATCH_SIZE = 20\n",
39 | "HIDDEN_DIM = 2\n",
40 | "\n",
41 | "num_epochs = 10\n",
42 | "\n",
43 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {
49 | "deletable": true,
50 | "editable": true
51 | },
52 | "source": [
53 | "## Кратко о вариационных автокодировщиках\n",
54 | "\n",
55 | "Рассмотрим вариационный автокодировщик для бинарных наблюдений. Вариационный автокодировщик состоит из генеративной модели наблюдений\n",
56 | "\n",
57 | "\\begin{align}\n",
58 | "& p(x, z | \\theta) = p(x | z, \\theta) p(z) \\\\\n",
59 | "& p(x | z, \\theta) = \\prod_{i = 1}^D p_i(z, \\theta)^{x_i} (1 - p_i(z, \\theta))^{1 - x_i} \\\\\n",
60 | "& p(z) = \\mathcal N(z | 0, I)\n",
61 | "\\end{align}\n",
62 | "\n",
63 | "и приближенного апостериорного распределения\n",
64 | "\n",
65 | "\\begin{equation}\n",
66 | "q(z | x, \\phi) = \\mathcal N(z | \\mu(x, \\phi), \\operatorname{diag}(\\sigma^2(x, \\phi)))\n",
67 | "\\end{equation}\n",
68 | "\n",
69 | "Для краткости все выкладки приводятся для одного наблюдения $x$, параметры распределений по возможности опускаются. Для набора данных при обучении используется среднее значение нижней оценки. Цель обучения - максимизировать нижнюю оценку на обоснованность\n",
70 | "\n",
71 | "$$ \\mathcal L(x, \\theta, \\phi) = \\mathbb E_{q(z | x, \\phi)} \\log p(x | z, \\theta) - \\operatorname{KL}(q(z | x, \\phi) || p(z)) = \\mathbb E_{q(z | x, \\phi)} \\log \\frac{p(x | z, \\theta)p(z)}{q(z | x, \\phi)} \\rightarrow \\max_{\\theta, \\phi} $$\n",
72 | "\n",
73 | "Как было рассказано на лекции, на практике нижняя оценка приближается оценкой \n",
74 | "\n",
75 | "\\begin{align*}\n",
76 | "&\\frac{1}{K} \\sum_{k=1}^K \\log \\frac{p(x | z_k)p(z_k)}{q(z_k | x)} \\\\\n",
77 | "& \\\\\n",
78 | "&z_k = \\mu(x, \\phi) + \\sigma(x, \\phi) \\odot \\varepsilon_k \\\\\n",
79 | "&\\varepsilon_k \\sim \\mathcal N(0, I), iid\n",
80 | "\\end{align*}\n",
81 | "\n",
82 | "с K=1, а затем максимизируется с помощью градиентного подъема.\n",
83 | "\n",
84 | "## Как это реализовать?\n",
85 | "\n",
86 | "Для вычисления приведенной выше нижней оценки необходимо уметь:\n",
87 | "1. Вычислять логарифм плотности всех распределений ($p(x | z)$, $p(z)$, $q(z | x)$)\n",
88 | "2. Сэмплировать из $q(z | x)$\n",
89 | "\n",
90 | "Следуя практике *tensorflow.distributions*, мы реализуем распределения как два класса с методами *log_prob()* и *sample()*"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": null,
96 | "metadata": {
97 | "collapsed": true,
98 | "deletable": true,
99 | "editable": true
100 | },
101 | "outputs": [],
102 | "source": [
103 | "class BinaryVector():\n",
104 | " def __init__(self, logits, rng=None):\n",
105 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n",
106 | " self.logits = logits\n",
107 | "\n",
108 | " def log_prob(self, x):\n",
109 | " # возвращает вектор вероятностей для каждого объекта в батче\n",
110 | " pixelwise_log_probs = (\n",
111 | " x * (self.logits - T.nnet.softplus(self.logits))\n",
112 | " - (1 - x) * T.nnet.softplus(self.logits)\n",
113 | " )\n",
114 | " return T.sum(pixelwise_log_probs, axis=(1, 2, 3))\n",
115 | " \n",
116 | " def sample(self):\n",
117 | " shape = self.logits.shape\n",
118 | " return T.nnet.sigmoid(self.logits) >= self.rng.uniform(shape)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {
125 | "collapsed": true,
126 | "deletable": true,
127 | "editable": true
128 | },
129 | "outputs": [],
130 | "source": [
131 | "class MultivariateNormalDiag():\n",
132 | " def __init__(self, loc=None, scale=None, rng=None):\n",
133 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n",
134 | " self.loc= loc\n",
135 | " self.scale = scale\n",
136 | " \n",
137 | " def log_prob(self, z):\n",
138 | " normalization_constant = (\n",
139 | " - 0.5 * np.log(2 * np.pi)\n",
140 | " - T.log(self.scale)\n",
141 | " )\n",
142 | " square_term = -0.5 * ((z - self.loc) / self.scale) ** 2\n",
143 | " log_prob_vec = normalization_constant + square_term\n",
144 | " return T.sum(log_prob_vec, axis=1)\n",
145 | " \n",
146 | " def sample(self):\n",
147 | " ######################################################################\n",
148 | "    # Сэмплирование из q(z | x) - ключевой момент в вариационном автокодировщике #\n",
149 | " # Пользуясь методом self.rng.normal() реализуйте её самостоятельно #\n",
150 | " ######################################################################\n",
151 | " return z"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {
157 | "deletable": true,
158 | "editable": true
159 | },
160 | "source": [
161 | "Для параметров распределений построим две сети. Обратите внимание, что кодировщик теперь возвращает и код, и параметр масштаба"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "collapsed": true,
169 | "deletable": true,
170 | "editable": true
171 | },
172 | "outputs": [],
173 | "source": [
174 | "def vae_encoder_mlp(input_x):\n",
175 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),\n",
176 | " input_var=input_x)\n",
177 | " ######################################################################################\n",
178 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую вектор среднего и вектор #\n",
179 | " # стандартных отклонений. Их размерность должны быть HIDDEN_DIM. Какие функции активаций ну-#\n",
180 | " # жно использовать? #\n",
181 | " ######################################################################################\n",
182 | " return l_out_loc, l_out_scale\n",
183 | "\n",
184 | "def vae_decoder_mlp(input_z):\n",
185 | " l_in = lasagne.layers.InputLayer(shape=(None, 2),\n",
186 | " input_var=input_z)\n",
187 | " l_hid1 = lasagne.layers.DenseLayer(\n",
188 | " l_in, num_units=64,\n",
189 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
190 | " W=lasagne.init.GlorotUniform(),\n",
191 | " name='d_hid1')\n",
192 | " l_hid2 = lasagne.layers.DenseLayer(\n",
193 | " l_hid1, num_units=128,\n",
194 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
195 | " name='d_hid2')\n",
196 | " l_out = lasagne.layers.DenseLayer(\n",
197 | " l_hid2, num_units=28 ** 2,\n",
198 | " nonlinearity=None,\n",
199 | " name='d_out')\n",
200 | " l_out = lasagne.layers.ReshapeLayer(l_out, shape=(-1, 1, 28, 28))\n",
201 | " return l_out"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {
207 | "deletable": true,
208 | "editable": true
209 | },
210 | "source": [
211 | "## Строим граф вычислений \n",
212 | "\n",
213 | "Входы и модель вывода $q(z | x)$"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {
220 | "collapsed": false,
221 | "deletable": true,
222 | "editable": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "input_x = T.tensor4('inputs')\n",
227 | "#####################################################\n",
228 | "# Определите encoder_mean, encoder scale, затем #\n",
229 | "# определите объект для апостериорного распределения qz_x #\n",
230 | "####################################################\n",
231 | "\n",
232 | "encoder_mean, encoder_scale = # ... \n",
233 | "qz_x = # MultivariateNormalDiag ... "
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {
239 | "deletable": true,
240 | "editable": true
241 | },
242 | "source": [
243 | "Генеративная модель $p(x, z)$"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "collapsed": true,
251 | "deletable": true,
252 | "editable": true
253 | },
254 | "outputs": [],
255 | "source": [
256 | "###################################################################\n",
257 | "# Определите параметр p(x | z) decoder_logits, затем #\n",
258 | "# определите объекты pz распределения p(z) и px_z распределения p(x | z) #\n",
259 | "###################################################################\n",
260 | "\n",
261 | "decoder_logits = # vae_decoder_mlp \n",
262 | "pz = # MultivariateNormalDiag ...\n",
263 | "px_z = # BinaryVector ..."
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {
269 | "deletable": true,
270 | "editable": true
271 | },
272 | "source": [
273 | "ELBO и правила для обновления весов"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": null,
279 | "metadata": {
280 | "collapsed": true,
281 | "deletable": true,
282 | "editable": true
283 | },
284 | "outputs": [],
285 | "source": [
286 | "########################################################################################\n",
287 | "# Пользуясь методами px_z, p_z, qz_x определите функцию потерь для вариационного автокодировщика #\n",
288 | "# При обучении значение функции потерь должно принимать значения порядка -100 (от -150 и выше) #\n",
289 | "# Создайте список параметров сети для передачи в оптимизатор #\n",
290 | "# Что использовать в качестве функции потерь? #\n",
291 | "elbo = None\n",
292 | "params = None\n",
293 | "loss = None\n",
294 | "########################################################################################\n",
295 | "updates = lasagne.updates.adam(loss, params)"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {
301 | "deletable": true,
302 | "editable": true
303 | },
304 | "source": [
305 | "Определяем функции"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {
312 | "collapsed": false,
313 | "deletable": true,
314 | "editable": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "train = theano.function(\n",
319 | " [input_x],\n",
320 | " elbo,\n",
321 | " updates=updates\n",
322 | ")\n",
323 | "\n",
324 | "elbo_at_test = theano.function(\n",
325 | " [input_x],\n",
326 | " elbo\n",
327 | ")"
328 | ]
329 | },
330 | {
331 | "cell_type": "markdown",
332 | "metadata": {
333 | "deletable": true,
334 | "editable": true
335 | },
336 | "source": [
337 | "И обучаем модель"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {
344 | "collapsed": false,
345 | "deletable": true,
346 | "editable": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "for epoch in range(num_epochs):\n",
351 | " train_elbo = 0\n",
352 | " train_batches = 0\n",
353 | " start_time = time.time()\n",
354 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n",
355 | " \"\"\"\n",
356 | "    Обратите внимание, что тут предложена вероятностная модель для бинарных данных.\n",
357 | " MNIST содержит черно-белые изображения с градациями серого.\n",
358 | " На практике при обучении автокодировщика получают бинарные данные, всякий раз положив случайно значение пикселя равным 0 или 1\n",
359 | " в зависимости от интенсивности пикселя в объекте из данных.\n",
360 | " Такой прием называется динамическая бинаризация, он эффективно расширяет обучающую выборку и приводит к лучшим значениям \n",
361 | " правдоподобия обученных моделей.\n",
362 | " \"\"\"\n",
363 | " batch = np.random.rand(*batch.shape) <= batch\n",
364 | " train_elbo += train(batch)\n",
365 | " train_batches += 1\n",
366 | " \n",
367 | " test_elbo = 0\n",
368 | " test_batches = 0\n",
369 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n",
370 | " batch = np.random.rand(*batch.shape) <= batch\n",
371 | " test_elbo += elbo_at_test(batch)\n",
372 | " test_batches += 1\n",
373 | " \n",
374 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n",
375 | " epoch + 1, num_epochs, time.time() - start_time))\n",
376 | " print(\"Train error {}\".format(train_elbo/train_batches))\n",
377 | " print(\"Test error {}\".format(test_elbo/test_batches))"
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "deletable": true,
384 | "editable": true
385 | },
386 | "source": [
387 | "## Что получается? Визуализации"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": null,
393 | "metadata": {
394 | "collapsed": false,
395 | "deletable": true,
396 | "editable": true
397 | },
398 | "outputs": [],
399 | "source": [
400 | "from utils import plot_reconstructions, plot_hidden_space\n",
401 | "\n",
402 | "reconstruct = theano.function(\n",
403 | " [input_x],\n",
404 | " T.nnet.sigmoid(lasagne.layers.get_output(decoder_logits))\n",
405 | ")\n",
406 | "\n",
407 | "encode = theano.function(\n",
408 | " [input_x],\n",
409 | " qz_x.sample(),\n",
410 | ")"
411 | ]
412 | },
413 | {
414 | "cell_type": "markdown",
415 | "metadata": {},
416 | "source": [
417 | "Визуализируем среднее распределения $p(x | z)$"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {
424 | "collapsed": false,
425 | "deletable": true,
426 | "editable": true
427 | },
428 | "outputs": [],
429 | "source": [
430 | "plot_reconstructions(X_test, reconstruct)"
431 | ]
432 | },
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "Чем отличается пространство представлений автокодировщика от пространства представлений вариационного автокодировщика? Почему возникло различие?"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": null,
443 | "metadata": {
444 | "collapsed": false,
445 | "deletable": true,
446 | "editable": true
447 | },
448 | "outputs": [],
449 | "source": [
450 | "plot_hidden_space(X_test[:1000], encode)"
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "metadata": {
457 | "collapsed": false,
458 | "deletable": true,
459 | "editable": true
460 | },
461 | "outputs": [],
462 | "source": [
463 | "# рисуем по 25 сэмплов кода для каждого объекта\n",
464 | "x_test_repeated = np.repeat(X_test[:25], repeats=25, axis=0)\n",
465 | "plot_hidden_space(x_test_repeated, encode)"
466 | ]
467 | }
468 | ],
469 | "metadata": {
470 | "kernelspec": {
471 | "display_name": "Python 2",
472 | "language": "python",
473 | "name": "python2"
474 | },
475 | "language_info": {
476 | "codemirror_mode": {
477 | "name": "ipython",
478 | "version": 2
479 | },
480 | "file_extension": ".py",
481 | "mimetype": "text/x-python",
482 | "name": "python",
483 | "nbconvert_exporter": "python",
484 | "pygments_lexer": "ipython2",
485 | "version": "2.7.10"
486 | }
487 | },
488 | "nbformat": 4,
489 | "nbformat_minor": 2
490 | }
491 |
--------------------------------------------------------------------------------
/sem2-classify&generate/4_ss_vae.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "deletable": true,
7 | "editable": true
8 | },
9 | "source": [
10 | "# Обучение на частично размеченной выборке*\n",
11 | "\n",
12 | "Дополнительные материалы к семинару. По мотивам статьи [\"Semi-supervised Learning with\n",
13 | "Deep Generative Models\"](https://arxiv.org/pdf/1406.5298.pdf)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {
20 | "collapsed": true,
21 | "deletable": true,
22 | "editable": true
23 | },
24 | "outputs": [],
25 | "source": [
26 | "import sys\n",
27 | "import os\n",
28 | "import time\n",
29 | "\n",
30 | "import numpy as np\n",
31 | "import theano\n",
32 | "import theano.tensor as T\n",
33 | "import lasagne\n",
34 | "import matplotlib.pylab as plt\n",
35 | "%matplotlib inline\n",
36 | "\n",
37 | "from utils import load_dataset, iterate_minibatches\n",
38 | "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {
44 | "deletable": true,
45 | "editable": true
46 | },
47 | "source": [
48 | "Для этого задания мы повысим размерность скрытых компонент, а также случайным образом \"выбросим\" приблизительно 95% меток классов."
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {
55 | "collapsed": false,
56 | "deletable": true,
57 | "editable": true
58 | },
59 | "outputs": [],
60 | "source": [
61 | "BATCH_SIZE = 20\n",
62 | "HIDDEN_DIM = 16\n",
63 | "NUMBER_OF_DIGITS = 10\n",
64 | "\n",
65 | "num_epochs = 40\n",
66 | "\n",
67 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()\n",
68 | "present = np.random.rand(X_train.shape[0]) < 0.05"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {
74 | "deletable": true,
75 | "editable": true
76 | },
77 | "source": [
78 | "Классы для распределений"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {
85 | "collapsed": true,
86 | "deletable": true,
87 | "editable": true
88 | },
89 | "outputs": [],
90 | "source": [
91 | "class BinaryVector():\n",
92 | " def __init__(self, logits, rng=None):\n",
93 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n",
94 | " self.logits = logits\n",
95 | "\n",
96 | " def log_prob(self, x):\n",
97 | " pixelwise_log_probs = (\n",
98 | " x * (self.logits - T.nnet.softplus(self.logits))\n",
99 | " - (1 - x) * T.nnet.softplus(self.logits)\n",
100 | " )\n",
101 | " return T.sum(pixelwise_log_probs, axis=(1, 2, 3))\n",
102 | " \n",
103 | " def sample(self):\n",
104 | " shape = self.logits.shape\n",
105 | " return T.nnet.sigmoid(self.logits) >= self.rng.uniform(shape)\n",
106 | "\n",
107 | "class MultivariateNormalDiag():\n",
108 | " def __init__(self, loc=None, scale=None, rng=None):\n",
109 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n",
110 | " self.loc= loc\n",
111 | " self.scale = scale\n",
112 | " \n",
113 | " def log_prob(self, z):\n",
114 | " normalization_constant = (\n",
115 | " - 0.5 * np.log(2 * np.pi)\n",
116 | " - T.log(self.scale)\n",
117 | " )\n",
118 | " square_term = -0.5 * ((z - self.loc) / self.scale) ** 2\n",
119 | " log_prob_vec = normalization_constant + square_term\n",
120 | " return T.sum(log_prob_vec, axis=1)\n",
121 | " \n",
122 | " def sample(self):\n",
123 | " shape = self.loc.shape\n",
124 | " z = (self.loc + self.scale * self.rng.normal(shape))\n",
125 | " return z"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {
131 | "deletable": true,
132 | "editable": true
133 | },
134 | "source": [
135 | "## Вероятностная модель данных"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {
141 | "deletable": true,
142 | "editable": true
143 | },
144 | "source": [
145 | "В отличие от вариационного автокодировщика, генеративная модель теперь будет также включать и метки классов $y$:\n",
146 | "\n",
147 | "\\begin{align*}\n",
148 | "& p(x, y, z) = p(x | y, z) p(z) p(y) \\\\\n",
149 | "& p(y) = Cat(y | \\pi), \\pi = (1/10, \\dots, 1/10) \\\\\n",
150 | "& p(z) = \\mathcal N(z | 0, I) \\\\\n",
151 | "& p(x | y, z) = \\prod_{i=1}^D p_i(y, z)^{x_i} (1 - p_i(y, z))^{1 - x_i}\n",
152 | "\\end{align*}\n",
153 | "\n",
154 | "При обучении вариационного автокодировщика максимизируется маргинальное правдоподобие $\\log p(x)$ (нижняя оценка на него, если быть точным), а в данном случае мы будем максимизировать $\\log p(x,y)$ для объектов с метками и $\\log p(x)$ для объектов без метки. Обозначим за $P$ индексы объектов обучающей выборки с метками класса.\n",
155 | "\n",
156 | "Построим нижнюю оценку для\n",
157 | "\n",
158 | "\\begin{equation}\n",
159 | "L(X, y) = \\sum_{i \\notin P} \\log p(x_i) + \\sum_{i \\in P} \\log p(x_i, y_i)\n",
160 | "\\end{equation}\n",
161 | "\n",
162 | "Для этого определим следующее вариационное приближение:\n",
163 | "\n",
164 | "\\begin{align*}\n",
165 | "& q(y, z | x) = q(y | x) q(z | y, x)\\\\\n",
166 | "& \\\\\n",
167 | "& q(y | x) = Cat(y | \\pi(x))\\\\\n",
168 | "& q(z | y, x) = \\mathcal N(z | \\mu_\\phi(x, y), \\operatorname{diag}\\sigma^2(y, x))\n",
169 | "\\end{align*}\n",
170 | "\n",
171 | "### Оценка для $i \\in P$\n",
172 | "\n",
173 | "Случай похож на модель для вариационного автокодировщика\n",
174 | "\n",
175 | "\\begin{equation}\n",
176 | "\\log p(x, y) = \\log \\mathbb E_{p(z)} p(x, y | z) \\geq \\mathbb E_{q(z | y, x)} \\log \\frac{p(x, y|z) p(z)}{q(z | y, x)}\n",
177 | "\\end{equation}\n",
178 | "\n",
179 | "### Оценка $i \\notin P$\n",
180 | "\n",
181 | "\\begin{equation}\n",
182 | "\\log p(x) = \\log \\mathbb E_{p(y)} \\mathbb E_{p(z)} p(x | z, y) \\geq \\mathbb E_{q(y | x)} \\mathbb E_{q(z | y, x)} \\log \\frac{p(x, y, z)}{q(z | y, x) q(y | x)}\n",
183 | "\\end{equation}\n",
184 | "\n",
185 | "### Целевая функция\n",
186 | "\n",
187 | "\\begin{equation}\n",
188 | "\\mathcal L(X, y) = \\sum_{i \\in P} \\mathbb E_{q(z_i | y_i, x_i)} \\log \\frac{p(x_i, y_i, z_i)}{q(z_i | y_i, x_i)} + \\sum_{i \\notin P} \\mathbb E_{q(y_i | x_i)} \\mathbb E_{q(z_i | y_i, x_i)} \\log \\frac{p(x_i, y_i, z_i)}{q(z_i | y_i, x_i) q(y_i | x_i)}\n",
189 | "\\end{equation}\n",
190 | "\n",
191 | "Оценку для математического ожидания по $z$ будем получать с помощью *reparametrization trick*.\n",
192 | "Пользуясь малым количеством классов, математическое ожидание по $y$ будем вычислять явно.\n",
193 | "\n",
194 | "# Как заставить модель все-таки обучаться?\n",
195 | "\n",
196 | "Максимизация нижней оценки на обоснованность на практике может не приводить к построению хорошей модели вывода $q(y | x)$.\n",
197 | "\n",
198 | "Естественно искать модели $q(y | x)$ среди тех, которые согласуются с размеченными объектами обучающей выборки $(x_i, y_i)$. В статье, в которой была впервые предложена описанная в семинаре модель, с весом $\\alpha$ добавляется дополнительное слагаемое к функции потерь:\n",
199 | "\n",
200 | "\\begin{equation}\n",
201 | "\\frac{1}{|P|}\\sum_{i \\in P} y_i^T \\log q(y | x_i).\n",
202 | "\\end{equation}\n",
203 | "\n",
204 | "Оно соответствует кросс-энтропии классификатора $q(y|x)$ на размеченных объектах."
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {
210 | "deletable": true,
211 | "editable": true
212 | },
213 | "source": [
214 | "### Особенности реализации\n",
215 | "В данной реализации мы передаем на вход кодировщика и декодировщика one-hot коды для $y$.\n",
216 | "\n",
217 | "Это находит свое отражение в размерах входов сетей:"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "metadata": {
224 | "collapsed": false,
225 | "deletable": true,
226 | "editable": true
227 | },
228 | "outputs": [],
229 | "source": [
230 | "def classifier_mlp(input_x):\n",
231 | " # takes x to produce posterior class assignment probabilities\n",
232 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),\n",
233 | " input_var=input_x)\n",
234 | " l_hid1 = lasagne.layers.DenseLayer(\n",
235 | " l_in, num_units=256,\n",
236 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
237 | " W=lasagne.init.GlorotUniform(),\n",
238 | " name='cl_hid1')\n",
239 | " l_out = lasagne.layers.DenseLayer(\n",
240 | " l_hid1, num_units=10,\n",
241 | " nonlinearity=lasagne.nonlinearities.softmax,\n",
242 | " name='cl_out')\n",
243 | " return l_out\n",
244 | "\n",
245 | "def vae_encoder_cond(input_xy):\n",
246 | " l_in = lasagne.layers.InputLayer(shape=(None, 28 * 28 + NUMBER_OF_DIGITS),\n",
247 | " input_var=input_xy)\n",
248 | " l_hid1 = lasagne.layers.DenseLayer(\n",
249 | " l_in, num_units=256,\n",
250 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
251 | " W=lasagne.init.GlorotUniform(),\n",
252 | " name='e_hid')\n",
253 | " l_out_loc = lasagne.layers.DenseLayer(\n",
254 | " l_hid1, num_units=HIDDEN_DIM,\n",
255 | " nonlinearity=None,\n",
256 | " name='e_mean')\n",
257 | " l_out_scale = lasagne.layers.DenseLayer(\n",
258 | " l_hid1, num_units=HIDDEN_DIM,\n",
259 | " nonlinearity=lasagne.nonlinearities.softplus,\n",
260 | " name='e_scale')\n",
261 | " \n",
262 | " return l_out_loc, l_out_scale\n",
263 | " \n",
264 | " \n",
265 | "def vae_decoder_cond(input_zy):\n",
266 | " l_in = lasagne.layers.InputLayer(shape=(None, HIDDEN_DIM + NUMBER_OF_DIGITS),\n",
267 | " input_var=input_zy)\n",
268 | " l_hid1 = lasagne.layers.DenseLayer(\n",
269 | " l_in, num_units=256,\n",
270 | " nonlinearity=lasagne.nonlinearities.rectify,\n",
271 | " W=lasagne.init.GlorotUniform(),\n",
272 | " name='d_hid1')\n",
273 | " l_out = lasagne.layers.DenseLayer(\n",
274 | " l_hid1, num_units=28 * 28,\n",
275 | " nonlinearity=None,\n",
276 | " name='d_out')\n",
277 | " l_out = lasagne.layers.ReshapeLayer(l_out, shape=(-1, 1, 28, 28))\n",
278 | " return l_out"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {
284 | "deletable": true,
285 | "editable": true
286 | },
287 | "source": [
288 | "При обучении мы будем вычислять выходы нейросети на всех возможных значениях $y$, все они нужны в 95% случаев для подсчета нижней оценки на обоснованность. Для этого здесь написаны две вспомогательные функции:"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {
295 | "collapsed": true,
296 | "deletable": true,
297 | "editable": true
298 | },
299 | "outputs": [],
300 | "source": [
301 | "input_x = T.tensor4('input_x')\n",
302 | "input_y = T.ivector('input_y')\n",
303 | "input_p = T.bvector('input_present')\n",
304 | "\n",
305 | "def add_all_possible_labels(input_x):\n",
306 | " # создает десять копий объекта из батча и приписывает к каждой из них код для y\n",
307 | " input_x = T.reshape(input_x, newshape=(BATCH_SIZE, -1))\n",
308 | " input_x = T.repeat(input_x, repeats=NUMBER_OF_DIGITS, axis=0)\n",
309 | " input_y = T.repeat(T.eye(NUMBER_OF_DIGITS), repeats=BATCH_SIZE, axis=0)\n",
310 | " input_xy = T.concatenate([input_x, input_y], axis=1)\n",
311 | " return input_xy\n",
312 | "\n",
313 | "def add_corresponding_labels(input_z):\n",
314 | " # приписывает код n % 10 (остаток деления) для n объекта в батче\n",
315 | " input_y = T.repeat(T.eye(NUMBER_OF_DIGITS), repeats=BATCH_SIZE, axis=0)\n",
316 | " input_zy = T.concatenate([input_z, input_y], axis=1)\n",
317 | " return input_zy"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {
323 | "deletable": true,
324 | "editable": true
325 | },
326 | "source": [
327 | "Модель вывода"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "collapsed": true,
335 | "deletable": true,
336 | "editable": true
337 | },
338 | "outputs": [],
339 | "source": [
340 | "input_xy = add_all_possible_labels(input_x)\n",
341 | "encoder_mean, encoder_scale = vae_encoder_cond(input_xy)\n",
342 | "qz_xy = MultivariateNormalDiag(\n",
343 | " lasagne.layers.get_output(encoder_mean), \n",
344 | " lasagne.layers.get_output(encoder_scale)\n",
345 | ")\n",
346 | "\n",
347 | "input_zy = add_corresponding_labels(qz_xy.sample())\n"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {
353 | "deletable": true,
354 | "editable": true
355 | },
356 | "source": [
357 | "Генеративная модель"
358 | ]
359 | },
360 | {
361 | "cell_type": "code",
362 | "execution_count": null,
363 | "metadata": {
364 | "collapsed": false,
365 | "deletable": true,
366 | "editable": true
367 | },
368 | "outputs": [],
369 | "source": [
370 | "decoder_logits = vae_decoder_cond(input_zy)\n",
371 | "pz = MultivariateNormalDiag(T.zeros((NUMBER_OF_DIGITS * BATCH_SIZE, HIDDEN_DIM)),\n",
372 | " T.ones((NUMBER_OF_DIGITS * BATCH_SIZE, HIDDEN_DIM)))\n",
373 | "# здесь мы не стали реализовывать отдельный класс\n",
374 | "p_y = -np.log(NUMBER_OF_DIGITS * np.ones([BATCH_SIZE * NUMBER_OF_DIGITS]))\n",
375 | "\n",
376 | "px_zy = BinaryVector(\n",
377 | " lasagne.layers.get_output(decoder_logits)\n",
378 | ")\n",
379 | "\n",
380 | "classifier = classifier_mlp(input_x)\n",
381 | "qy_x_probs = lasagne.layers.get_output(classifier)"
382 | ]
383 | },
384 | {
385 | "cell_type": "markdown",
386 | "metadata": {
387 | "deletable": true,
388 | "editable": true
389 | },
390 | "source": [
391 | "Функция потерь"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": null,
397 | "metadata": {
398 | "collapsed": false,
399 | "deletable": true,
400 | "editable": true
401 | },
402 | "outputs": [],
403 | "source": [
404 | "alpha = 5.\n",
405 | "\n",
406 | "elbo_vec = (+ p_y\n",
407 | " + px_zy.log_prob(T.repeat(input_x, repeats=NUMBER_OF_DIGITS, axis=0))\n",
408 | " + pz.log_prob(qz_xy.sample())\n",
409 | " - qz_xy.log_prob(qz_xy.sample()))\n",
410 | "elbo_vec = T.reshape(elbo_vec, newshape=(BATCH_SIZE, NUMBER_OF_DIGITS))\n",
411 | "\n",
412 | "elbo_vec = (\n",
413 | " input_p * T.sum(elbo_vec * lasagne.utils.one_hot(input_y, m=NUMBER_OF_DIGITS), axis=1)\n",
414 | " + (1 - input_p) * T.sum(qy_x_probs * elbo_vec - qy_x_probs * T.log(qy_x_probs), axis=1)\n",
415 | ")\n",
416 | "\n",
417 | "loss = T.mean(elbo_vec - alpha * input_p * lasagne.objectives.categorical_crossentropy(qy_x_probs, input_y))"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": null,
423 | "metadata": {
424 | "collapsed": false,
425 | "deletable": true,
426 | "editable": true
427 | },
428 | "outputs": [],
429 | "source": [
430 | "params = lasagne.layers.get_all_params(\n",
431 | " [encoder_mean, encoder_scale, decoder_logits, classifier]\n",
432 | ")\n",
433 | "updates = lasagne.updates.adam(-loss, params)\n",
434 | "\n",
435 | "train_fn = theano.function([input_x, input_y, input_p], loss, updates=updates)\n",
436 | "accuracy = theano.function(\n",
437 | " [input_x, input_y],\n",
438 | " T.mean(T.eq(T.argmax(qy_x_probs, axis=1), input_y), dtype=theano.config.floatX)\n",
439 | ")"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {
446 | "collapsed": false,
447 | "deletable": true,
448 | "editable": true
449 | },
450 | "outputs": [],
451 | "source": [
452 | "for epoch in range(num_epochs):\n",
453 | " # In each epoch, we do a full pass over the training data:\n",
454 | " train_err = 0\n",
455 | " train_batches = 0\n",
456 | " start_time = time.time()\n",
457 | " for batch in iterate_minibatches(X_train, y_train, present=present, batchsize=BATCH_SIZE):\n",
458 | " inputs, targets, batch_present = batch\n",
459 | " inputs = np.random.rand(*inputs.shape) < inputs\n",
460 | " train_err += train_fn(inputs, targets, batch_present)\n",
461 | " train_batches += 1\n",
462 | " \n",
463 | " test_accuracy = 0\n",
464 | " test_batches = 0\n",
465 | " for batch in iterate_minibatches(X_test, y_test, batchsize=BATCH_SIZE, shuffle=False):\n",
466 | " inputs, targets = batch\n",
467 | " inputs = np.random.rand(*inputs.shape) < inputs\n",
468 | " test_accuracy += accuracy(inputs, targets)\n",
469 | " test_batches += 1\n",
470 | " \n",
471 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n",
472 | " epoch + 1, num_epochs, time.time() - start_time))\n",
473 | " print(\"Train elbo {}\".format(train_err/train_batches))\n",
474 | " print(\"Test accuracy {}\".format(test_accuracy/test_batches))"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {
480 | "deletable": true,
481 | "editable": true
482 | },
483 | "source": [
484 | "# Задание*\n",
485 | "\n",
486 | "Ниже приведен код, генерирующий случайные цифры из заданного класса.\n",
487 | "\n",
488 |     "Экспериментируя с архитектурами сети и параметрами модели, попробуйте обучить модель, для которой успешно выполняется это сэмплирование (см. экспериментальные результаты статьи https://arxiv.org/pdf/1406.5298.pdf)"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": false,
496 | "deletable": true,
497 | "editable": true
498 | },
499 | "outputs": [],
500 | "source": [
501 | "input_z = T.matrix('input_z')\n",
502 | "\n",
503 | "decode_a_code = theano.function(\n",
504 | " [input_z],\n",
505 | " lasagne.layers.get_output(decoder_logits, input_z),\n",
506 | ")"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": null,
512 | "metadata": {
513 | "collapsed": false,
514 | "deletable": true,
515 | "editable": true
516 | },
517 | "outputs": [],
518 | "source": [
519 | "digit_to_draw = 4\n",
520 | "\n",
521 | "z_samples = np.random.randn(64, HIDDEN_DIM)\n",
522 | "y_samples = np.zeros((64, NUMBER_OF_DIGITS))\n",
523 | "y_samples[:, digit_to_draw] = 1\n",
524 | "zy_samples = np.concatenate([z_samples, y_samples], axis=1)\n",
525 | "\n",
526 | "decoded_images = decode_a_code(zy_samples)\n",
527 | "\n",
528 | "fig, axes = plt.subplots(8, 8, figsize=(8, 8),\n",
529 | " subplot_kw={'xticks': [], 'yticks': []}\n",
530 | ")\n",
531 | "fig.subplots_adjust(hspace=0.04, wspace=0.02)\n",
532 | "\n",
533 | "for ax, i in zip(axes.flat, range(64)):\n",
534 | " ax.imshow(decoded_images[i].reshape((28, 28)), cmap='gray')"
535 | ]
536 | }
537 | ],
538 | "metadata": {
539 | "kernelspec": {
540 | "display_name": "Python 3",
541 | "language": "python",
542 | "name": "python3"
543 | },
544 | "language_info": {
545 | "codemirror_mode": {
546 | "name": "ipython",
547 | "version": 3
548 | },
549 | "file_extension": ".py",
550 | "mimetype": "text/x-python",
551 | "name": "python",
552 | "nbconvert_exporter": "python",
553 | "pygments_lexer": "ipython3",
554 | "version": "3.5.1"
555 | }
556 | },
557 | "nbformat": 4,
558 | "nbformat_minor": 2
559 | }
560 |
--------------------------------------------------------------------------------
/sem2-classify&generate/mnist.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import time
4 |
5 | import numpy as np
6 |
7 |
8 | def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
9 | assert len(inputs) == len(targets)
10 | if shuffle:
11 | indices = np.arange(len(inputs))
12 | np.random.shuffle(indices)
13 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
14 | if shuffle:
15 | excerpt = indices[start_idx:start_idx + batchsize]
16 | else:
17 | excerpt = slice(start_idx, start_idx + batchsize)
18 | yield inputs[excerpt], targets[excerpt]
19 |
20 |
21 | __doc__="""taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py"""
22 |
23 | def load_dataset():
24 | # We first define a download function, supporting both Python 2 and 3.
25 | if sys.version_info[0] == 2:
26 | from urllib import urlretrieve
27 | else:
28 | from urllib.request import urlretrieve
29 |
30 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
31 | print("Downloading %s" % filename)
32 | urlretrieve(source + filename, filename)
33 |
34 | # We then define functions for loading MNIST images and labels.
35 | # For convenience, they also download the requested files if needed.
36 | import gzip
37 |
38 | def load_mnist_images(filename):
39 | if not os.path.exists(filename):
40 | download(filename)
41 | # Read the inputs in Yann LeCun's binary format.
42 | with gzip.open(filename, 'rb') as f:
43 | data = np.frombuffer(f.read(), np.uint8, offset=16)
44 | # The inputs are vectors now, we reshape them to monochrome 2D images,
45 | # following the shape convention: (examples, channels, rows, columns)
46 | data = data.reshape(-1, 1, 28, 28)
47 | # The inputs come as bytes, we convert them to float32 in range [0,1].
48 | # (Actually to range [0, 255/256], for compatibility to the version
49 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.)
50 | return data / np.float32(256)
51 |
52 | def load_mnist_labels(filename):
53 | if not os.path.exists(filename):
54 | download(filename)
55 | # Read the labels in Yann LeCun's binary format.
56 | with gzip.open(filename, 'rb') as f:
57 | data = np.frombuffer(f.read(), np.uint8, offset=8)
58 | # The labels are vectors of integers now, that's exactly what we want.
59 | return data
60 |
61 | # We can now download and read the training and test set images and labels.
62 | X_train = load_mnist_images('train-images-idx3-ubyte.gz')
63 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
64 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
65 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
66 |
67 | # We reserve the last 10000 training examples for validation.
68 | X_train, X_val = X_train[:-10000], X_train[-10000:]
69 | y_train, y_val = y_train[:-10000], y_train[-10000:]
70 |
71 | # We just return all the arrays in order, as expected in main().
72 | # (It doesn't matter how we do this as long as we can read them again.)
73 | return X_train, y_train, X_val, y_val, X_test, y_test
74 |
75 |
76 |
77 |
78 |
--------------------------------------------------------------------------------
/sem2-classify&generate/utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | import numpy as np
5 |
6 | import matplotlib.pylab as plt
7 | from matplotlib.offsetbox import OffsetImage, AnnotationBbox
8 |
9 | def load_dataset():
10 | if sys.version_info[0] == 2:
11 | from urllib import urlretrieve
12 | else:
13 | from urllib.request import urlretrieve
14 |
15 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'):
16 | print("Downloading %s" % filename)
17 | urlretrieve(source + filename, filename)
18 |
19 | import gzip
20 |
21 | def load_mnist_images(filename):
22 | if not os.path.exists(filename):
23 | download(filename)
24 | with gzip.open(filename, 'rb') as f:
25 | data = np.frombuffer(f.read(), np.uint8, offset=16)
26 | data = data.reshape(-1, 1, 28, 28)
27 | return data / np.float32(256)
28 |
29 | def load_mnist_labels(filename):
30 | if not os.path.exists(filename):
31 | download(filename)
32 | with gzip.open(filename, 'rb') as f:
33 | data = np.frombuffer(f.read(), np.uint8, offset=8)
34 | return data
35 |
36 | X = {}
37 | y = {}
38 |
39 | X_train= load_mnist_images('train-images-idx3-ubyte.gz')
40 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz')
41 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz')
42 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz')
43 |
44 | X_train, X_val = X_train[:-10000], X_train[-10000:]
45 | y_train, y_val = y_train[:-10000], y_train[-10000:]
46 | return X_train, y_train, X_val, y_val, X_test, y_test
47 |
48 | def iterate_minibatches(inputs, targets=None, batchsize=20, present=None, shuffle=True):
49 | if shuffle:
50 | indices = np.arange(len(inputs))
51 | np.random.shuffle(indices)
52 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
53 | if shuffle:
54 | excerpt = indices[start_idx:start_idx + batchsize]
55 | else:
56 | excerpt = slice(start_idx, start_idx + batchsize)
57 | if targets is None:
58 | yield inputs[excerpt]
59 | elif present is None:
60 | yield inputs[excerpt], targets[excerpt]
61 | else:
62 | yield inputs[excerpt], targets[excerpt], present[excerpt]
63 |
64 | def plot_reconstructions(x_test, reconstruction_func):
65 | decoded_imgs = reconstruction_func(x_test)
66 |
67 | indices = np.random.choice(x_test.shape[0], 64)
68 |
69 | n = x_test.shape[0] # how many digits we will display
70 |
71 | fig, axes = plt.subplots(8, 16, figsize=(16, 8),
72 | subplot_kw={'xticks': [], 'yticks': []}
73 | )
74 | fig.subplots_adjust(hspace=0.04, wspace=0.02)
75 |
76 | for ax, i in zip(axes[:, :8].flat, indices):
77 | ax.imshow(x_test[i].reshape((28, 28)), cmap='gray')
78 |
79 | for ax, i in zip(axes[:, 8:].flat, indices):
80 | ax.imshow(decoded_imgs[i].reshape((28, 28)), cmap='gray')
81 |
82 | plt.show()
83 |
84 |
85 | def imscatter(x, y, image, ax=None, zoom=1):
86 | if ax is None:
87 | ax = plt.gca()
88 | #im = OffsetImage(image.reshape((-1, 28, 28)), zoom=zoom)
89 | #x, y = np.atleast_1d(x, y)
90 | artists = []
91 | #assert len(x) == len(y) == len(image)
92 | n = len(x)
93 | for i in range(n):
94 | im = OffsetImage(image[i], zoom=zoom, cmap='gray')
95 | ab = AnnotationBbox(im, (x[i], y[i]), xycoords='data', frameon=False)
96 | artists.append(ax.add_artist(ab))
97 | ax.update_datalim(np.column_stack([x, y]))
98 | ax.autoscale()
99 | return artists
100 |
101 | def plot_hidden_space(x_test, encode_func, zoom=0.5):
102 | encoded = encode_func(x_test)
103 |
104 | fig, ax = plt.subplots(figsize=(11, 11))
105 | imscatter(encoded[:, 0], encoded[:, 1], x_test.reshape((-1, 28, 28)), zoom=zoom, ax=ax)
106 |
107 | ax.spines['left'].set_position('center')
108 | ax.spines['bottom'].set_position('center')
109 | ax.spines['right'].set_color('none')
110 | ax.spines['top'].set_color('none')
111 | ax.xaxis.set_ticks_position('bottom')
112 | ax.yaxis.set_ticks_position('left')
113 |
114 | plt.gray()
115 | plt.show()
--------------------------------------------------------------------------------
/sem3-attention/Attention_seminar (Start here).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Attention\n",
8 | "* Alexandr Panin, Arseniy Ashuha, you can text me ```ars.ashuha@gmail.com```,\n",
9 | "* Based on https://github.com/ebenolson/pydata2015\n",
10 | "\n",
11 | "\n",
12 | " Part I: Attention mechanism at toy problems \n",
13 | "\n",
14 | " \n",
15 | "\n",
16 | "In this seminar you will implement attention mechanism and apply it to a simple task of associative recall.\n",
17 | "\n",
18 | "# Install me:\n",
19 | "```(bash)\n",
20 | "sudo pip install --upgrade https://github.com/yandexdataschool/agentnet/archive/master.zip\n",
21 | "```\n"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import lasagne\n",
33 | "import numpy as np\n",
34 | "from lasagne.layers import *\n",
35 | "import matplotlib.pyplot as plt\n",
36 | "import theano,theano.tensor as T\n",
37 | "\n",
38 | "%matplotlib inline"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Problem description:\n",
46 | "\n",
47 | "You are given a sequence of pairs [key,value]. \n",
48 | "\n",
49 | "Both keys and values are one-hot encoded integers. \n",
50 | "\n",
51 | "The network should learn to generate values in order of ascension of keys.\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {
58 | "collapsed": true
59 | },
60 | "outputs": [],
61 | "source": [
62 | "CODE_SIZE = 10\n",
63 | "def generate_sample(min_length = 3, max_length = 10, code_size=CODE_SIZE):\n",
64 | " assert code_size >= max_length\n",
65 | " length = np.random.randint(min_length, max_length)\n",
66 | " \n",
67 | " keys = np.random.permutation(length)\n",
68 | " values = np.random.permutation(length)\n",
69 | " input_pairs = zip(keys,values)\n",
70 | " \n",
71 | " input_1hot = np.zeros([length+1,code_size*2])\n",
72 | " for i,(k,v) in enumerate(input_pairs):\n",
73 | " input_1hot[i+1][k] = 1\n",
74 | " input_1hot[i+1][code_size + v] = 1\n",
75 | " \n",
76 | " sorted_pairs = sorted(input_pairs,key=lambda (k,v):k)\n",
77 | " \n",
78 | " target_1hot = np.zeros([length+1,code_size*2])\n",
79 | " for i,(k,v) in enumerate(sorted_pairs):\n",
80 | " target_1hot[i+1][k] = 1\n",
81 | " target_1hot[i+1][code_size + v] = 1\n",
82 | " \n",
83 | " \n",
84 | " return input_1hot,target_1hot"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "collapsed": false
92 | },
93 | "outputs": [],
94 | "source": [
95 | "inp,out = generate_sample(max_length=5,code_size=5)\n",
96 | "print '-'*9 + \"KEY\" + '-'*9 + ' ' + '+'*9 + \"VAL\" + \"+\"*9\n",
97 | "print \"Input pairs:\\n\",inp\n",
98 | "print \"Target pairs:\\n\",out"
99 | ]
100 | },
101 | {
102 | "cell_type": "markdown",
103 | "metadata": {},
104 | "source": [
105 | "### Attention!\n",
106 | "\n",
107 | "We're now going to implement attention mechanism, or more specifically, _additive attention_ (a.k.a. Bahdanau's attention).\n",
108 | "\n",
109 | "We'll do so in two steps:\n",
110 | "\n",
111 | "* __AttentionWeights(encoder_seq,attn_query)__ - a layer that returns attention weights (aka probabilities of taking each value).\n",
112 | "* __AttentionOutput(encoder_seq,attn_weights)__ - a layer that averages inputs given probabilities from AttentionWeights.\n",
113 | "\n",
114 | "If you're not feeling familiar with this procedure, just follow the step-by-step instructions in code."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": false
122 | },
123 | "outputs": [],
124 | "source": [
125 | "from lasagne.init import Normal\n",
126 | "class AttentionWeights(MergeLayer):\n",
127 | " def __init__(self, encoder_seq, attn_query, num_units):\n",
128 | " MergeLayer.__init__(self, [encoder_seq, attn_query])\n",
129 | " \n",
130 | " enc_units = encoder_seq.output_shape[2]\n",
131 | " dec_units = attn_query.output_shape[1]\n",
132 | " \n",
133 | " self.W_enc = self.add_param(Normal(), (enc_units, num_units), name='enc_to_hid')\n",
134 | " self.W_query = self.add_param(Normal(), (dec_units, num_units), name='dec_to_hid')\n",
135 | " self.W_out = self.add_param(Normal(), (num_units, 1),name='hid_to_logit')\n",
136 | " \n",
137 | " def get_output_for(self, inputs):\n",
138 | " # the encoder_sequence shape = [batch, time,units]\n",
139 | " # the query shapeshape = [batch, units]\n",
140 | " encoder_sequence, query = inputs\n",
141 | " \n",
142 | " # Hidden layer activations, shape [batch,seq_len,hid_units]\n",
143 | " \n",
144 | " query_to_hid = query.dot(self.W_query)[:,None,:]\n",
145 | " \n",
146 | " enc_to_hid = \n",
147 | " \n",
148 | " hid = T.tanh()\n",
149 | " \n",
150 | " # Logits from hidden, [batch_size, seq_len]\n",
151 | " logits = \n",
152 | " \n",
153 | " assert logits.ndim ==2, \"Logits must have shape [batch,time] and be 2-dimensional.\"\\\n",
154 | " \"Current amount of dimensions:\"+str(logits.ndim)\n",
155 | " \n",
156 | " attn_weights = T.nnet.softmax(logits)\n",
157 | " \n",
158 | " return attn_weights\n",
159 | " \n",
160 | " def get_output_shape_for(self,input_shapes):\n",
161 | " enc_shape,query_shape = input_shapes\n",
162 | " return enc_shape[:-1]"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [],
172 | "source": [
173 | "class AttentionOutput(MergeLayer):\n",
174 | " def __init__(self, encoder_seq, attn_weights):\n",
175 | " MergeLayer.__init__(self,[encoder_seq,attn_weights])\n",
176 | " \n",
177 | " def get_output_for(self,inputs):\n",
178 | " # encoder_sequence shape = [batch,time,units]\n",
179 | " # attn_weights shape = [batch,time]\n",
180 | " encoder_sequence, attn_weights = inputs\n",
181 | " \n",
182 | " #Reshape attn_weights to make 'em 3-dimensional: [batch,time,1] - so you could multiply by encoder sequence\n",
183 | " attn_weights = attn_weights.reshape([attn_weights.shape[0],attn_weights.shape[1],1])\n",
184 | " \n",
185 | " #Compute attention response by summing encoder elements with weights along time axis (axis=1)\n",
186 | " attn_output = \n",
187 | " \n",
188 | " return attn_output\n",
189 | " \n",
190 | " def get_output_shape_for(self,input_shapes):\n",
191 | " enc_shape,query_shape = input_shapes\n",
192 | " return (enc_shape[0],enc_shape[-1])"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "metadata": {},
198 | "source": [
199 | "We now define a single step of recurrent neural network using attention"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {
206 | "collapsed": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "input_sequence = T.itensor3(\"Input tokens [batch,time,code]\")\n",
211 | "reference_answers = T.itensor3(\"Reference answers[batch,time,code]\")\n",
212 | "\n",
213 | "l_inputs = InputLayer((None,None,CODE_SIZE*2),input_sequence)\n",
214 | "l_prev_answers = InputLayer((None,None,CODE_SIZE*2),reference_answers[:,:-1])"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": null,
220 | "metadata": {
221 | "collapsed": false
222 | },
223 | "outputs": [],
224 | "source": [
225 | "from agentnet.memory import RNNCell\n",
226 | "class step:\n",
227 | " prev_output = InputLayer((None, CODE_SIZE*2), name='previous output')\n",
228 | " input_sequence = InputLayer((None, None, CODE_SIZE*2), name='input sequence for attention')\n",
229 | " prev_rnn = InputLayer((None, 64), name='last rnn state')\n",
230 | " \n",
231 | " #TODO your code here\n",
232 | " attention_weights = AttentionWeights(input_sequence, prev_rnn,32)\n",
233 | " attention_value = AttentionOutput(input_sequence, attention_weights)\n",
234 | " \n",
235 | " new_rnn = RNNCell(prev_rnn,concat([attention_value, prev_output]))\n",
236 | " \n",
237 | " output_probs = DenseLayer(\n",
238 | " concat([new_rnn,attention_value]),\n",
239 | " num_units=CODE_SIZE*2, nonlinearity=T.nnet.sigmoid)"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": null,
245 | "metadata": {
246 | "collapsed": false
247 | },
248 | "outputs": [],
249 | "source": [
250 | "from agentnet import Recurrence\n",
251 | "#This layer applies RNN to itself in a symbolic loop.\n",
252 | "#Please wait for DeepBayes' staff to explain how it works.\n",
253 | "\n",
254 | "rnn = Recurrence(\n",
255 | " input_sequences = {step.prev_output: l_prev_answers},\n",
256 | " input_nonsequences = {step.input_sequence: l_inputs},\n",
257 | " state_variables = {step.new_rnn: step.prev_rnn},\n",
258 | " tracked_outputs = [step.output_probs,step.attention_weights],\n",
259 | " unroll_scan=False,\n",
260 | ")"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {
267 | "collapsed": false
268 | },
269 | "outputs": [],
270 | "source": [
271 | "output_probs,attn_weights = get_output(\n",
272 | " [rnn[step.output_probs], rnn[step.attention_weights]])\n",
273 | "\n",
274 | "predict = theano.function(\n",
275 | " [input_sequence,reference_answers],\n",
276 | " [output_probs,attn_weights],\n",
277 | " allow_input_downcast=True)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {
284 | "collapsed": false,
285 | "scrolled": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "next_answers = reference_answers[:,1:]\n",
290 | "\n",
291 | "loss = -T.log(output_probs)*next_answers -T.log(1-output_probs)*(1-next_answers)\n",
292 | "loss = T.mean(loss)\n",
293 | "\n",
294 | "updates = \n",
295 | "\n",
296 | "train = theano.function([input_sequence, reference_answers], loss, updates=updates,allow_input_downcast=True)"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "### Training"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": null,
309 | "metadata": {
310 | "collapsed": false
311 | },
312 | "outputs": [],
313 | "source": [
314 | "from tqdm import tnrange\n",
315 | "from IPython.display import clear_output\n",
316 | "loss_history = []\n",
317 | "\n",
318 | "for i in tnrange(10000):\n",
319 | " bx,by = generate_sample()\n",
320 | " loss_history.append(train([bx],[by]))\n",
321 | " \n",
322 | " if i%500==0:\n",
323 | " clear_output(True)\n",
324 | " plt.plot(loss_history)\n",
325 | " plt.show()\n",
326 | " \n",
327 | " #draw attention map\n",
328 | " bx,by = generate_sample()\n",
329 | " probs,attentions = predict([bx],[by])\n",
330 | "\n",
331 | " input_kv = zip(bx[:,:CODE_SIZE].argmax(-1),bx[:,CODE_SIZE:].argmax(-1))\n",
332 | " target_kv = zip(by[:,:CODE_SIZE].argmax(-1),by[:,CODE_SIZE:].argmax(-1))\n",
333 | " plt.imshow(attentions[0])\n",
334 | " plt.xticks(*zip(*enumerate(map(str,input_kv))),rotation=45)\n",
335 | " plt.yticks(*zip(*enumerate(map(str,target_kv))),rotation=45)\n",
336 | " plt.show()"
337 | ]
338 | }
339 | ],
340 | "metadata": {
341 | "kernelspec": {
342 | "display_name": "Python 2",
343 | "language": "python",
344 | "name": "python2"
345 | },
346 | "language_info": {
347 | "codemirror_mode": {
348 | "name": "ipython",
349 | "version": 2
350 | },
351 | "file_extension": ".py",
352 | "mimetype": "text/x-python",
353 | "name": "python",
354 | "nbconvert_exporter": "python",
355 | "pygments_lexer": "ipython2",
356 | "version": "2.7.10"
357 | },
358 | "widgets": {
359 | "state": {
360 | "1efdd72be63d457dafc441ce841e39f5": {
361 | "views": [
362 | {
363 | "cell_index": 15
364 | }
365 | ]
366 | }
367 | },
368 | "version": "1.2.0"
369 | }
370 | },
371 | "nbformat": 4,
372 | "nbformat_minor": 2
373 | }
374 |
--------------------------------------------------------------------------------
/sem3-attention/Captioning_seminar.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": false
7 | },
8 | "source": [
9 | "@authors\n",
10 | "* Arseniy Ashuha, you can text me ```ars.ashuha@gmail.com```,\n",
11 | "* Based on https://github.com/ebenolson/pydata2015\n",
12 | "\n",
13 | " Part II: Attention mechanism @ Image Captioning \n",
14 | "\n",
15 | " \n",
16 | "\n",
17 | "In this seminar you'll be going through the image captioning pipeline.\n",
18 | "\n",
19 | "To begin with, let us download the dataset of image features from a pre-trained GoogleNet (see instructions in chat)"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Data preprocessing"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "# Load dataset\n",
38 | "import numpy as np\n",
39 | "\n",
40 | "captions = np.load(\"./data/train-data-captions.npy\")\n",
41 | "img_codes = np.load(\"./data/train-data-features.npy\").astype('float32')"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "collapsed": false
49 | },
50 | "outputs": [],
51 | "source": [
52 | "print (\"each image code is a 6x6 feature matrix from GoogleNet:\", img_codes.shape)\n",
53 | "print (img_codes[0,:10,0,0])\n",
54 | "print ('\\n\\n')\n",
55 | "print (\"for each image there are 5-7 descriptions, e.g.:\\n\")\n",
56 | "print ('\\n'.join(captions[0]))"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": false
64 | },
65 | "outputs": [],
66 | "source": [
67 | "#split descriptions into tokens\n",
68 | "for img_i in range(len(captions)):\n",
69 | " for caption_i in range(len(captions[img_i])):\n",
70 | " sentence = captions[img_i][caption_i] \n",
71 | " captions[img_i][caption_i] = [\"#START#\"]+sentence.split(' ')+[\"#END#\"]"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [],
81 | "source": [
82 | "# Build a Vocabulary\n",
83 | "from collections import Counter\n",
84 | "word_counts = Counter()\n",
85 | "for img_captions in captions:\n",
86 | " for caption in img_captions:\n",
87 | " word_counts.update(caption)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [],
97 | "source": [
98 | "vocab = ['#UNK#', '#START#', '#END#']\n",
99 | "vocab += [k for k, v in word_counts.items() if v >= 5]\n",
100 | "vocab = list(set(vocab))\n",
101 | "n_tokens = len(vocab)\n",
102 | "\n",
103 | "assert 12000 <= n_tokens <= 15000\n",
104 | "\n",
105 | "word_to_index = {w: i for i, w in enumerate(vocab)}"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 |     "We'll use this function to convert sentences into a network-readable matrix of token indices.\n",
113 | "\n",
114 | "When given several sentences of different length, it pads them with -1."
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": true
122 | },
123 | "outputs": [],
124 | "source": [
125 | "PAD_ix = -1\n",
126 | "UNK_ix = vocab.index('#UNK#')\n",
127 | "START_ix = vocab.index(\"#START#\")\n",
128 | "END_ix = vocab.index(\"#END#\")\n",
129 | "\n",
130 | "#good old as_matrix for the third time\n",
131 | "def as_matrix(sequences,max_len=None):\n",
132 | " max_len = max_len or max(map(len,sequences))\n",
133 | " \n",
134 | " matrix = np.zeros((len(sequences),max_len),dtype='int32')+PAD_ix\n",
135 | " for i,seq in enumerate(sequences):\n",
136 | " row_ix = [word_to_index.get(word,UNK_ix) for word in seq[:max_len]]\n",
137 | " matrix[i,:len(row_ix)] = row_ix\n",
138 | " \n",
139 | " return matrix\n",
140 | "\n",
141 | "def to_string(tokens_ix):\n",
142 | " assert len(np.shape(tokens_ix))==1,\"to_string works on one sequence at a time\"\n",
143 | " tokens_ix = list(tokens_ix)[1:]\n",
144 | " if END_ix in tokens_ix:\n",
145 | " tokens_ix = tokens_ix[:tokens_ix.index(END_ix)]\n",
146 | " return \" \".join([vocab[i] for i in tokens_ix])"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": null,
152 | "metadata": {
153 | "collapsed": false
154 | },
155 | "outputs": [],
156 | "source": [
157 | "#try it out on several descriptions of a random image\n",
158 | "as_matrix(captions[1337])"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "to_string(as_matrix(captions[1337])[0])"
170 | ]
171 | },
172 | {
173 | "cell_type": "markdown",
174 | "metadata": {},
175 | "source": [
176 | "### The neural network\n",
177 | "\n",
178 | "Since the image encoder CNN is already applied, the only remaining part is to write a sentence decoder.\n"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {
185 | "collapsed": true
186 | },
187 | "outputs": [],
188 | "source": [
189 | "import theano, theano.tensor as T\n",
190 | "import lasagne\n",
191 | "from lasagne.layers import *\n",
192 | "\n",
193 | "# network shapes. \n",
194 | "EMBEDDING_SIZE = 128 #Change at your will\n",
195 | "LSTM_SIZE = 256 #Change at your will\n",
196 | "ATTN_SIZE = 256 #Change at your will\n",
197 | "FEATURES,HEIGHT,WIDTH = img_codes.shape[1:]\n"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {
203 | "collapsed": false
204 | },
205 | "source": [
206 | "We will define a single LSTM step here. An LSTM step should\n",
207 | "* take previous cell/out and input\n",
208 | "* compute next cell/out and next token probabilities\n",
209 | "* use attention to work with image features"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "collapsed": true
217 | },
218 | "outputs": [],
219 | "source": [
220 | "#"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [],
230 | "source": [
231 | "from agentnet.resolver import ProbabilisticResolver\n",
232 | "from agentnet.memory import LSTMCell\n",
233 | "\n",
234 | "temperature = theano.shared(1.)\n",
235 | "class decoder:\n",
236 | " prev_word = InputLayer((None,),name='index of previous word')\n",
237 | " image_features = InputLayer((None,FEATURES,HEIGHT,WIDTH),name='img features')\n",
238 | "\n",
239 | " prev_cell = InputLayer((None,LSTM_SIZE),name='previous LSTM cell goes here')\n",
240 | " prev_out = InputLayer((None,LSTM_SIZE),name='previous LSTM output goes here')\n",
241 | " \n",
242 | " prev_word_emb = EmbeddingLayer(prev_word,len(vocab),EMBEDDING_SIZE)\n",
243 | " \n",
244 | " ###Attention part:\n",
245 | " # Please implement attention part of rnn architecture\n",
246 | " \n",
247 | " #First we reshape image into a sequence of image vectors\n",
248 | " image_features_seq = reshape(dimshuffle(image_features,[0,2,3,1]),[[0],-1,[3]])\n",
249 | " \n",
250 | " #Then we apply attention just as usual\n",
251 | " attn_probs = \n",
252 | " attn = \n",
253 | "\n",
254 | " lstm_input = concat([attn,prev_word_emb],axis=-1)\n",
255 | "\n",
256 | " new_cell,new_out = LSTMCell(prev_cell,prev_out,lstm_input)\n",
257 | " \n",
258 | " \n",
259 | " output_probs = DenseLayer(new_out,len(vocab),nonlinearity=T.nnet.softmax)\n",
260 | "\n",
261 | " \n",
262 | " output_probs_scaled = ExpressionLayer(output_probs,lambda p: p**temperature)\n",
263 | " output_tokens = ProbabilisticResolver(output_probs_scaled,assume_normalized=False)\n",
264 | " \n",
265 | " \n",
266 | " # recurrent state transition dict\n",
267 | " # on next step, {key} becomes {value}\n",
268 | " transition = {\n",
269 | " new_cell:prev_cell,\n",
270 | " new_out:prev_out\n",
271 | " }"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "### Training\n",
279 | "\n",
280 | "During training, we should feed our decoder RNN with reference captions from the dataset. Training then comes down to simple likelihood maximization problem.\n",
281 | "\n",
282 | "Deep learning people also know this as minimizing crossentropy."
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "collapsed": false
290 | },
291 | "outputs": [],
292 | "source": [
293 | "# Inputs for sentences\n",
294 | "sentences = T.imatrix(\"[batch_size x time] of word ids\")\n",
295 | "l_sentences = InputLayer((None,None),sentences)\n",
296 | "\n",
297 | "# Input layer for image features\n",
298 | "image_vectors = T.tensor4(\"image features [batch,channels,h,w]\")\n",
299 | "l_image_features = InputLayer((None,FEATURES,HEIGHT,WIDTH),image_vectors)\n"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "collapsed": false
307 | },
308 | "outputs": [],
309 | "source": [
310 | "from agentnet import Recurrence\n",
311 | "\n",
312 | "decoder_trainer = Recurrence(\n",
313 | " input_sequences={decoder.prev_word:l_sentences},\n",
314 | " input_nonsequences={decoder.image_features:l_image_features},\n",
315 | " state_variables=decoder.transition,\n",
316 | " tracked_outputs=[decoder.output_probs],\n",
317 | " unroll_scan = False,\n",
318 | ")"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": null,
324 | "metadata": {
325 | "collapsed": false
326 | },
327 | "outputs": [],
328 | "source": [
329 | "#get predictions and define loss\n",
330 | "next_token_probs = get_output(decoder_trainer[decoder.output_probs])\n",
331 | "\n",
332 | "next_token_probs = next_token_probs[:,:-1].reshape([-1,len(vocab)])\n",
333 | "next_tokens = sentences[:,1:].ravel()\n",
334 | "\n",
335 | "loss = T.nnet.categorical_crossentropy(next_token_probs,next_tokens)\n",
336 | "\n",
337 | "#apply mask\n",
338 | "mask = T.neq(next_tokens,PAD_ix)\n",
339 | "loss = T.sum(loss*mask)/T.sum(mask)"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "#trainable NN weights\n",
351 | "weights = get_all_params(decoder_trainer,trainable=True)\n",
352 | "updates = lasagne.updates.adam(loss,weights)"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {
359 | "collapsed": false
360 | },
361 | "outputs": [],
362 | "source": [
363 | "#compile a functions for training and evaluation\n",
364 | "#please note that your functions must accept image features as FIRST param and sentences as second one\n",
365 | "train_step = theano.function([image_vectors,sentences],loss,updates=updates,allow_input_downcast=True)\n",
366 | "val_step = theano.function([image_vectors,sentences],loss,allow_input_downcast=True)\n",
367 | "#for val_step use deterministic=True if you have any dropout/noise"
368 | ]
369 | },
370 | {
371 | "cell_type": "markdown",
372 | "metadata": {
373 | "collapsed": false
374 | },
375 | "source": [
376 | "# Training\n",
377 | "\n",
378 | "* You first have to implement a batch generator\n",
380 | "* Then the network will get trained the usual way"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {
386 | "collapsed": true
387 | },
388 | "outputs": [],
389 | "source": [
390 | "from random import choice\n",
391 | "\n",
392 | "def generate_batch(images,captions,batch_size,max_caption_len=None):\n",
393 | " \n",
394 | "    #sample random numbers for image/caption indices\n",
395 | " random_image_ix = np.random.randint(0,len(images),size=batch_size)\n",
396 | " \n",
397 | " #get images\n",
398 | " batch_images = images[random_image_ix]\n",
399 | " \n",
400 | " #5-7 captions for each image\n",
401 | " captions_for_batch_images = captions[random_image_ix]\n",
402 | " \n",
403 | " #pick 1 from 5-7 captions for each image\n",
404 | " batch_captions = list(map(choice,captions_for_batch_images))\n",
405 | " \n",
406 | " #convert to matrix\n",
407 | " batch_captions_ix = as_matrix(batch_captions,max_len=max_caption_len)\n",
408 | " \n",
409 | " return batch_images, batch_captions_ix"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": null,
415 | "metadata": {
416 | "collapsed": false
417 | },
418 | "outputs": [],
419 | "source": [
420 | "bx,by = generate_batch(img_codes,captions,3)\n",
421 | "bx[0,:10,0,0],by"
422 | ]
423 | },
424 | {
425 | "cell_type": "markdown",
426 | "metadata": {},
427 | "source": [
428 | "### Main loop\n",
429 | "* We recommend you to periodically evaluate the network using the next \"apply trained model\" block\n",
430 | "  * it's safe to interrupt training, run a few examples and start training again"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": null,
436 | "metadata": {
437 | "collapsed": true
438 | },
439 | "outputs": [],
440 | "source": [
441 | "batch_size=50 #adjust me\n",
442 | "n_epochs=100 #adjust me\n",
443 | "n_batches_per_epoch = 50 #adjust me\n",
444 | "n_validation_batches = 5 #how many batches are used for validation after each epoch\n"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": null,
450 | "metadata": {
451 | "collapsed": false,
452 | "scrolled": false
453 | },
454 | "outputs": [],
455 | "source": [
456 | "from tqdm import tqdm\n",
457 | "\n",
458 | "for epoch in range(n_epochs):\n",
459 | " \n",
460 | " train_loss=0\n",
461 | " for _ in tqdm(range(n_batches_per_epoch)):\n",
462 | " train_loss += train_step(*generate_batch(img_codes,captions,batch_size))\n",
463 | " train_loss /= n_batches_per_epoch\n",
464 | " \n",
465 | " \n",
466 | " print('Epoch: {}, train loss: {}'.format(epoch, train_loss))\n",
467 | "\n",
468 | "print(\"Finish :)\")"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {
474 | "collapsed": false
475 | },
476 | "source": [
477 | "### apply trained model"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": null,
483 | "metadata": {
484 | "collapsed": true
485 | },
486 | "outputs": [],
487 | "source": [
488 | "batch_size = theano.shared(np.int32(1))\n",
489 | "MAX_LENGTH = 20 #Change at your will"
490 | ]
491 | },
492 | {
493 | "cell_type": "code",
494 | "execution_count": null,
495 | "metadata": {
496 | "collapsed": false
497 | },
498 | "outputs": [],
499 | "source": [
500 | "#set up recurrent network that generates tokens and feeds them back to itself\n",
501 | "unroll_dict = dict(decoder.transition)\n",
502 | "unroll_dict[decoder.output_tokens] = decoder.prev_word #on next iter, output goes to input\n",
503 | "\n",
504 | "first_output = T.repeat(T.constant(START_ix,dtype='int32'),batch_size)\n",
505 | "init_dict = {\n",
506 | " decoder.output_tokens:InputLayer([None],first_output)\n",
507 | "}\n",
508 | "\n",
509 | "decoder_applier = Recurrence(\n",
510 | " input_nonsequences={decoder.image_features:l_image_features},\n",
511 | " state_variables=unroll_dict,\n",
512 | " state_init = init_dict,\n",
513 | " tracked_outputs=[decoder.output_probs,decoder.output_tokens],\n",
514 | " n_steps = MAX_LENGTH,\n",
515 | ")"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {
522 | "collapsed": false
523 | },
524 | "outputs": [],
525 | "source": [
526 | "generated_tokens = get_output(decoder_applier[decoder.output_tokens])\n",
527 | "\n",
528 | "generate = theano.function([image_vectors],generated_tokens,allow_input_downcast=True)"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": null,
534 | "metadata": {
535 | "collapsed": false
536 | },
537 | "outputs": [],
538 | "source": [
539 | "from pretrained_lenet import image_to_features\n",
540 | "import matplotlib.pyplot as plt\n",
541 | "%matplotlib inline\n",
542 | "\n",
543 | "img = plt.imread(\"./data/Dog-and-Cat.jpg\")\n",
544 | "plt.imshow(img)"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "metadata": {
551 | "collapsed": false
552 | },
553 | "outputs": [],
554 | "source": [
555 | "output_ix = generate([image_to_features(img)])[0]\n",
556 | "\n",
557 | "for _ in range(100):\n",
558 | " temperature.set_value(10)\n",
559 | " print to_string(output_ix)"
560 | ]
561 | },
562 | {
563 | "cell_type": "markdown",
564 | "metadata": {},
565 | "source": [
566 | "### Some tricks (for further research)\n",
567 | "\n",
568 | "* Initialize LSTM with some function of image features.\n",
569 | "\n",
570 | "* Try other attention functions\n",
571 | "\n",
572 | "* If you train large network, it is usually a good idea to make a 2-stage prediction\n",
573 | " 1. (large recurrent state) -> (bottleneck e.g. 256)\n",
574 | " 2. (bottleneck) -> (vocabulary size)\n",
575 | " * this way you won't need to store/train (large_recurrent_state x vocabulary size) matrix\n",
576 | " \n",
577 | "* Use [hierarchical softmax](https://gist.github.com/justheuristic/581853c6d6b87eae9669297c2fb1052d) or [byte pair encodings](https://github.com/rsennrich/subword-nmt)\n",
578 | "\n",
579 | "\n"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": null,
585 | "metadata": {
586 | "collapsed": true
587 | },
588 | "outputs": [],
589 | "source": []
590 | }
591 | ],
592 | "metadata": {
593 | "kernelspec": {
594 | "display_name": "Python 2",
595 | "language": "python",
596 | "name": "python2"
597 | },
598 | "language_info": {
599 | "codemirror_mode": {
600 | "name": "ipython",
601 | "version": 2
602 | },
603 | "file_extension": ".py",
604 | "mimetype": "text/x-python",
605 | "name": "python",
606 | "nbconvert_exporter": "python",
607 | "pygments_lexer": "ipython2",
608 | "version": "2.7.6"
609 | }
610 | },
611 | "nbformat": 4,
612 | "nbformat_minor": 0
613 | }
614 |
--------------------------------------------------------------------------------
/sem3-attention/data/Dog-and-Cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem3-attention/data/Dog-and-Cat.jpg
--------------------------------------------------------------------------------
/sem3-attention/pretrained_lenet.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import theano,theano.tensor as T
4 |
5 | from lasagne.layers import *
6 | from lasagne.layers import Conv2DLayer as ConvLayer
7 | from lasagne.layers import MaxPool2DLayer as PoolLayerDNN
8 | from lasagne.layers import MaxPool2DLayer as PoolLayer
9 | from lasagne.layers import LocalResponseNormalization2DLayer as LRNLayer
10 | from lasagne.nonlinearities import softmax, linear
11 |
12 |
def build_inception_module(name, input_layer, nfilters):
    """Construct one GoogLeNet inception module on top of *input_layer*.

    Parameters
    ----------
    name : str
        Prefix for the keys of the returned layer dictionary.
    input_layer : lasagne layer
        Layer whose output feeds every branch of the module.
    nfilters : sequence of 6 ints
        Filter counts, in the order
        (pool_proj, 1x1, 3x3_reduce, 3x3, 5x5_reduce, 5x5).

    Returns
    -------
    dict mapping '<name>/<branch>' to the corresponding lasagne layer;
    the depth-concatenation of all four branches is '<name>/output'.
    """
    n_pool_proj, n_1x1, n_3x3_reduce, n_3x3, n_5x5_reduce, n_5x5 = nfilters

    layers = {}

    # branch 1: 3x3 max-pooling followed by a 1x1 projection
    layers['pool'] = PoolLayerDNN(input_layer, pool_size=3, stride=1, pad=1)
    layers['pool_proj'] = ConvLayer(layers['pool'], n_pool_proj, 1)

    # branch 2: plain 1x1 convolution
    layers['1x1'] = ConvLayer(input_layer, n_1x1, 1)

    # branch 3: 1x1 bottleneck, then 3x3 convolution
    layers['3x3_reduce'] = ConvLayer(input_layer, n_3x3_reduce, 1)
    layers['3x3'] = ConvLayer(layers['3x3_reduce'], n_3x3, 3, pad=1)

    # branch 4: 1x1 bottleneck, then 5x5 convolution
    layers['5x5_reduce'] = ConvLayer(input_layer, n_5x5_reduce, 1)
    layers['5x5'] = ConvLayer(layers['5x5_reduce'], n_5x5, 5, pad=2)

    # concatenate the four branches along the channel axis
    layers['output'] = ConcatLayer([
        layers['1x1'],
        layers['3x3'],
        layers['5x5'],
        layers['pool_proj'],
    ])

    return dict(('{}/{}'.format(name, key), layer)
                for key, layer in layers.items())
35 |
36 |
def build_model():
    """Assemble the GoogLeNet (BVLC inception v1) lasagne graph.

    Returns
    -------
    dict mapping layer names to lasagne layers.  Notable keys:
    'input' (image batch, NCHW), 'inception_5b/output' (last feature
    map), 'prob' (1000-way softmax over ImageNet classes).
    """
    net = {}
    net['input'] = InputLayer((None, 3, None, None))
    net['conv1/7x7_s2'] = ConvLayer(net['input'], 64, 7, stride=2, pad=3)
    net['pool1/3x3_s2'] = PoolLayer(net['conv1/7x7_s2'],
                                    pool_size=3,
                                    stride=2,
                                    ignore_border=False)
    net['pool1/norm1'] = LRNLayer(net['pool1/3x3_s2'], alpha=0.00002, k=1)
    net['conv2/3x3_reduce'] = ConvLayer(net['pool1/norm1'], 64, 1)
    net['conv2/3x3'] = ConvLayer(net['conv2/3x3_reduce'], 192, 3, pad=1)
    net['conv2/norm2'] = LRNLayer(net['conv2/3x3'], alpha=0.00002, k=1)
    net['pool2/3x3_s2'] = PoolLayer(net['conv2/norm2'], pool_size=3, stride=2)

    # Inception stages; filter counts are
    # (pool_proj, 1x1, 3x3_reduce, 3x3, 5x5_reduce, 5x5) per module.
    stage3 = [('inception_3a', [32, 64, 96, 128, 16, 32]),
              ('inception_3b', [64, 128, 128, 192, 32, 96])]
    stage4 = [('inception_4a', [64, 192, 96, 208, 16, 48]),
              ('inception_4b', [64, 160, 112, 224, 24, 64]),
              ('inception_4c', [64, 128, 128, 256, 24, 64]),
              ('inception_4d', [64, 112, 144, 288, 32, 64]),
              ('inception_4e', [128, 256, 160, 320, 32, 128])]
    stage5 = [('inception_5a', [128, 256, 160, 320, 32, 128]),
              ('inception_5b', [128, 384, 192, 384, 48, 128])]

    # Chain the modules; a 3x3/stride-2 max-pool separates the stages.
    last = 'pool2/3x3_s2'
    for stage, pool_name in ((stage3, 'pool3/3x3_s2'),
                             (stage4, 'pool4/3x3_s2'),
                             (stage5, None)):
        for module_name, nfilters in stage:
            net.update(build_inception_module(module_name, net[last], nfilters))
            last = module_name + '/output'
        if pool_name is not None:
            net[pool_name] = PoolLayer(net[last], pool_size=3, stride=2)
            last = pool_name

    # Classification head: global average pool, linear classifier, softmax.
    net['pool5/7x7_s1'] = GlobalPoolLayer(net['inception_5b/output'])
    net['loss3/classifier'] = DenseLayer(net['pool5/7x7_s1'],
                                         num_units=1000,
                                         nonlinearity=linear)
    net['prob'] = NonlinearityLayer(net['loss3/classifier'],
                                    nonlinearity=softmax)
    return net
92 |
93 |
94 | import skimage.transform
95 | import numpy as np
96 | MEAN_VALUES = np.array([104, 117, 123]).reshape((3,1,1))
def preprocess(im):
    """Convert an image into a single-element GoogLeNet input batch.

    Pipeline: expand grayscale to RGB if needed, resize so the smallest
    side becomes 224 px (aspect ratio preserved), take a central
    224x224 crop, move channels first (HWC -> CHW), reorder RGB -> BGR,
    and subtract the per-channel mean.

    Parameters
    ----------
    im : np.ndarray
        Image of shape (h, w) or (h, w, 3); values assumed in [0, 255]
        (preserve_range=True keeps that scale through the resize).

    Returns
    -------
    np.ndarray of shape (1, 3, 224, 224), dtype float32.
    """
    if len(im.shape) == 2:
        # grayscale: replicate the single channel into three
        im = im[:, :, np.newaxis]
        im = np.repeat(im, 3, axis=2)
    # Resize so smallest dim = 224, preserving aspect ratio
    h, w, _ = im.shape
    if h < w:
        im = skimage.transform.resize(im, (224, w*224//h), preserve_range=True)
    else:
        im = skimage.transform.resize(im, (h*224//w, 224), preserve_range=True)

    # Central crop to 224x224
    h, w, _ = im.shape
    im = im[h//2-112:h//2+112, w//2-112:w//2+112]

    # NOTE: the original code also built an unused uint8 copy ('rawim',
    # a leftover from the lasagne recipe that returned it); removed.

    # Shuffle axes to c01 (channels first)
    im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1)

    # Convert to BGR (the pretrained Caffe weights expect BGR input)
    im = im[::-1, :, :]

    # Subtract the per-channel mean (MEAN_VALUES is in BGR order)
    im = im - MEAN_VALUES
    return im[np.newaxis].astype('float32')
122 |
# Build the GoogLeNet graph, load pretrained BVLC weights, and compile
# a feature-extraction function.
import pickle

lenet = build_model()
# pickle data must be read in binary mode ('rb'); a 'with' block also
# closes the handle instead of leaking it (the old code did neither).
with open('data/blvc_googlenet.pkl', 'rb') as _weights_file:
    lenet_weights = pickle.load(_weights_file)['param values']
set_all_param_values(lenet["prob"], lenet_weights)

# Compiled function: image batch (NCHW) -> inception_5b feature maps.
cnn_input_var = lenet['input'].input_var
cnn_feature_layer = lenet['inception_5b/output']
get_cnn_features = theano.function([cnn_input_var], get_output(cnn_feature_layer))

# Fitted SVD/PCA model used by image_to_features to compress CNN features.
# NOTE(review): unpickling runs arbitrary code — only load trusted files.
with open("./data/svd.pcl", 'rb') as _pca_file:
    pca = pickle.load(_pca_file)
134 |
135 |
def image_to_features(im):
    """Extract PCA-compressed GoogLeNet features for one RGB image.

    Parameters
    ----------
    im : np.ndarray of shape (h, w, 3)

    Returns
    -------
    np.ndarray of shape [n_components, H, W], where (H, W) is the
    spatial size of the inception_5b feature map.
    """
    assert im.ndim == 3 and im.shape[2] == 3, "You should provide an RGB image of shape [h,w,3]"
    batch = preprocess(im)
    # drop the batch axis: [1024, H, W]
    feature_map = get_cnn_features(batch)[0]
    height, width = feature_map.shape[-2:]
    # flatten spatial positions into rows: [H*W, 1024]
    flat_features = feature_map.transpose([1, 2, 0]).reshape([-1, 1024])
    compressed = pca.transform(flat_features)
    # restore the spatial layout, channels first
    return compressed.reshape([height, width, -1]).transpose([2, 0, 1])
144 |
--------------------------------------------------------------------------------
/sem4-GP/2_BayesOpt.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | }
9 | },
10 | "source": [
11 | "# Bayesian optimization\n",
12 | "\n",
13 | "* Mainly used for optimization of \"heavy\" functions (computationally complex, expensive to evaluate)\n",
14 | "* The objective function can be \"black box\"\n",
15 | "* Uses approximation of the objective function\n",
16 | "* Takes into account quality of the approximation"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "slideshow": {
23 | "slide_type": "slide"
24 | }
25 | },
26 | "source": [
27 | "#### Optimization procedure:\n",
28 | "1. Build approximation $\\hat{f}(x)$ of function $f(x)$\n",
29 | "2. Choose new point as an argmax of the criterion\n",
30 | "$$\n",
31 | "x_{new} = \\arg\\max\\limits_x a(x)\n",
32 | "$$\n",
33 | "3. Evaluate $f(x)$ at new point\n",
34 | "4. Update model $\\hat{f}(x)$ and go to step 2.\n"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {
40 | "slideshow": {
41 | "slide_type": "slide"
42 | }
43 | },
44 | "source": [
45 | "### Expected Improvement\n",
46 | "\n",
47 | "$$\n",
48 | "EI(x) = \\mathbb{E}_{p(\\hat{f})} \\left [\\max(0, y_{min} - \\hat{f}) \\right ]\n",
49 | "$$\n",
50 | "where $\\hat{y}, \\sigma$ are the mean and variance of the GP model at point $x$,\n",
51 | "$\\Phi(\\cdot)$ - cdf of standard normal distribution,\n",
52 | "$\\phi(\\cdot)$ - pdf of standard normal distribution.\n",
53 | "\n",
54 | "Usually logarithm of EI is used."
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "slideshow": {
61 | "slide_type": "slide"
62 | }
63 | },
64 | "source": [
65 | " "
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {
71 | "slideshow": {
72 | "slide_type": "slide"
73 | }
74 | },
75 | "source": [
76 | "### Optimization of criterion\n",
77 | "\n",
78 | "Any optimization algorithm could be used.\n",
79 | "\n",
80 | "In this seminar we will use multi-start with L-BFGS optimization algorithm\n",
81 | "\n",
82 | "Multi-start procedure:\n",
83 | "1. Generate initial set of points $x_1, \\ldots, x_n$. Calculate criterion at each point to obtain $(a(x_1), \\ldots, a(x_n))$.\n",
84 | "2. Choose $k$ points with smallest values of criterion.\n",
85 | "3. Using each point as an initial point run the optimization algorithm (L-BFGS) and obtain $k$ optimization results.\n",
86 | "4. From all optimization results choose the best one."
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {
92 | "slideshow": {
93 | "slide_type": "skip"
94 | }
95 | },
96 | "source": [
97 | "### L-BFGS \n",
98 | "\n",
99 | "It's a quasi-Newton method of optimization and it is based on second order Taylor expansion\n",
100 | "$$\n",
101 | "f(x_k + p) \\approx f(x_k) + \\nabla f^T(x_k) p + \\frac12 p^T \\mathbf{H}p\n",
102 | "$$\n",
103 | "$$\n",
104 | "p = -\\mathbf{H}^{-1}\\nabla f^T(x_k) \\approx -\\mathbf{B}_k^{-1} \\nabla f^T(x_k),\n",
105 | "$$\n",
106 | "where $\\mathbf{B}_k$ is an approximation of hessian $\\mathbf{H}$.\n",
107 | "\n",
108 | "Approximation $\\mathbf{B}_k$ is updated at every step by the following rule:\n",
109 | "$$\n",
110 | "\\mathbf{B}_{k + 1} = \\mathbf{B}_k - \\frac{\\mathbf{B}_k s_k s_k^T \\mathbf{B}_k}{s_k^T \\mathbf{B}_k s_k} + \\frac{y_k y_k^T}{y_k^T s_k},\n",
111 | "$$\n",
112 | "where $s_k = x_{k + 1} - x_k$, $y_k = \\nabla f(x_{k + 1}) - \\nabla f(x_k)$."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": true,
120 | "slideshow": {
121 | "slide_type": "skip"
122 | }
123 | },
124 | "outputs": [],
125 | "source": [
126 | "%matplotlib inline\n",
127 | "\n",
128 | "from __future__ import print_function\n",
129 | "\n",
130 | "import numpy as np\n",
131 | "from matplotlib import pyplot\n",
132 | "from mpl_toolkits.mplot3d import Axes3D\n",
133 | "from matplotlib import cm\n",
134 | "from scipy.optimize import minimize\n",
135 | "\n",
136 | "\n",
137 | "import GPy\n",
138 | "\n",
139 | "import bayes_opt\n",
140 | "\n",
141 | "\n",
142 | "def f(x):\n",
143 | " return (6 * x - 2)**2 * np.sin(12 * x - 4) \n",
144 | "\n",
145 | "def get_1d_data():\n",
146 | " np.random.seed(239)\n",
147 | " x_train = np.array([0.0, 0.58, 0.38, 0.95]).reshape(-1, 1)\n",
148 | " y_train = f(x_train)\n",
149 | " return x_train, y_train"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "collapsed": true,
157 | "slideshow": {
158 | "slide_type": "skip"
159 | }
160 | },
161 | "outputs": [],
162 | "source": [
163 | "x_train, y_train = get_1d_data()\n",
164 | "kernel = GPy.kern.RBF(1, variance=0.5, lengthscale=0.2)\n",
165 | "model = GPy.models.GPRegression(x_train, y_train, kernel)"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "collapsed": true,
173 | "slideshow": {
174 | "slide_type": "skip"
175 | }
176 | },
177 | "outputs": [],
178 | "source": [
179 | "x_grid = np.linspace(0, 1, 100).reshape(-1, 1)\n",
180 | "y_grid = f(x_grid)\n",
181 | "prediction, std = model.predict(x_grid)\n",
182 | "prediction = prediction.ravel()\n",
183 | "std = std.ravel()\n",
184 | "pyplot.figure(figsize=(8, 6))\n",
185 | "pyplot.plot(x_train, y_train, 'or', markersize=8, label='Training set')\n",
186 | "pyplot.plot(x_grid, prediction, '-k', linewidth=2, label='Approximation')\n",
187 | "pyplot.fill_between(x_grid.ravel(), prediction - 2 * std, prediction + 2 * std, alpha=0.3)\n",
188 | "pyplot.plot(x_grid, y_grid, '--b', label='True function')\n",
189 | "pyplot.ylim([-15, 20])\n",
190 | "pyplot.legend(loc='best')"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "### Task\n",
198 | "\n",
199 | "Derive expression for EI: express it in terms of $\\Phi(\\cdot)$ and $\\phi(\\cdot)$ - cdf and pdf of $\\mathcal{N}(0, 1)$."
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {
205 | "slideshow": {
206 | "slide_type": "slide"
207 | }
208 | },
209 | "source": [
210 | "### Task\n",
211 | "Implement multi-start optimization procedure"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {
218 | "collapsed": true,
219 | "slideshow": {
220 | "slide_type": "skip"
221 | }
222 | },
223 | "outputs": [],
224 | "source": [
225 | "def get_new_point(model, lb, ub, data=None, multistart=10, criterion='ei', k=1, random_state=None):\n",
226 | " \"\"\"\n",
227 | " Parameters:\n",
228 | " model - GP model of the objective function\n",
229 | " lb, ub - array-like, lower and upper bounds of x\n",
230 | " data - tuple(x_train, y_train)\n",
231 | " multistart - number of multistart runs\n",
232 | "        criterion - acquisition function, by default EI\n",
233 | " k - parameter of the LowerConfidenceBound function\n",
234 | " random_state - np.random.RandomState\n",
235 | " Returns\n",
236 | " tuple - argmin of the objective function and min value of the objective \n",
237 | " \"\"\"\n",
238 | " if random_state is None:\n",
239 | " random_state = np.random.RandomState()\n",
240 | "\n",
241 | " lb = np.array(lb).reshape(1, -1)\n",
242 | " ub = np.array(ub).reshape(1, -1)\n",
243 | " \n",
244 | " # 1. Generate inital X points (number of points == multistart) in [lb, ub]\n",
245 | " \n",
246 | " ######## Your code here ########\n",
247 | " x_random = \n",
248 | "\n",
249 | " \n",
250 | " ######## ########\n",
251 | " def objective(x):\n",
252 | " if x.ndim == 1:\n",
253 | " x = x.reshape(1, -1)\n",
254 | " mean_values, variance = model.predict(x)\n",
255 | " std_values = np.sqrt(variance)\n",
256 | " if criterion == 'ei':\n",
257 | " return -log_expected_improvement(mean_values, std_values, data[1].min())\n",
258 | " elif criterion == 'lcb':\n",
259 | " return lcb(mean_values, std_values, params)\n",
260 | " else:\n",
261 | " raise NotImplementedError('Criterion is not implemented!')\n",
262 | "\n",
263 | " criterion_value = objective(x_random)\n",
264 | " \n",
265 | " # 2. From each points from x_random run L-BFGS optimization algorithm, \n",
266 | " # choose the best result and return it\n",
267 | " # Use function minimize: minimize(objective, x_init, method='L-BFGS-B',\n",
268 | " # bounds=np.vstack((lb, ub)).T)\n",
269 | " # it returns object with fields 'fun' - optimum function value, 'x' - argmin.\n",
270 | "\n",
271 | " best_result = None\n",
272 | " best_value = np.inf\n",
273 | "\n",
274 | " ######## Your code here ########\n",
275 | " \n",
276 | " return best_result.x, best_result.fun"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {
283 | "collapsed": true,
284 | "slideshow": {
285 | "slide_type": "skip"
286 | }
287 | },
288 | "outputs": [],
289 | "source": [
290 | "# Check your code \n",
291 | "lb = [0]\n",
292 | "ub = [1]\n",
293 | "kernel = GPy.kern.RBF(1, variance=0.5, lengthscale=0.1)\n",
294 | "model = GPy.models.GPRegression(x_train, y_train, kernel)\n",
295 | "x_new, f_new = get_new_point(model, lb, ub, data=(x_train, y_train), random_state=np.random.RandomState(42))\n",
296 | "\n",
297 | "assert(np.isclose(x_new, 0.29985639))\n",
298 | "assert(np.isclose(f_new, 0.86480674))\n",
299 | "print('Correct!')"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "collapsed": true
307 | },
308 | "outputs": [],
309 | "source": [
310 | "def optimization_step(x_train, y_train, kernel, objective, lb=None, ub=None, criterion='ei', k=1, plot=False):\n",
311 | " model = GPy.models.GPRegression(x_train, y_train, kernel)\n",
312 | " model.optimize_restarts(num_restarts=10, verbose=False)\n",
313 | "\n",
314 | " x_new, criterion_value = get_new_point(model, data=(x_train, y_train), lb=lb, ub=ub, criterion=criterion, k=k)\n",
315 | " if plot:\n",
316 | " bayes_opt.plot1d(x_train, y_train, model, objective, x_new, criterion_value)\n",
317 | " pyplot.show()\n",
318 | "\n",
319 | " x_new = x_new.reshape(1, -1)\n",
320 | " x_train = np.vstack([x_train, x_new])\n",
321 | " y_train = np.vstack([y_train, np.asarray(objective(x_new)).reshape(1, -1)])\n",
322 | " return x_train, y_train, model"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {
328 | "slideshow": {
329 | "slide_type": "skip"
330 | }
331 | },
332 | "source": [
333 | "## 1D example"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": true,
341 | "scrolled": false,
342 | "slideshow": {
343 | "slide_type": "skip"
344 | }
345 | },
346 | "outputs": [],
347 | "source": [
348 | "x_train, y_train = get_1d_data()\n",
349 | "kernel = GPy.kern.RBF(1, variance=0.5, lengthscale=0.2)\n",
350 | "model = GPy.models.GPRegression(x_train, y_train, kernel)\n",
351 | "for i in range(6):\n",
352 | " x_train, y_train, model = bayes_opt.optimization_step(x_train, y_train, kernel, f, lb=[0], ub=[1], criterion='ei', plot=True)"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {
358 | "slideshow": {
359 | "slide_type": "skip"
360 | }
361 | },
362 | "source": [
363 | "## 2D demo"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {
370 | "collapsed": true,
371 | "scrolled": false,
372 | "slideshow": {
373 | "slide_type": "skip"
374 | }
375 | },
376 | "outputs": [],
377 | "source": [
378 | "budget = 30\n",
379 | "n_init = 10\n",
380 | "\n",
381 | "kernel = GPy.kern.RBF(2, variance=1, lengthscale=0.5, ARD=False)\n",
382 | "\n",
383 | "save_path = '2d_demo.mp4'\n",
384 | "bayes_opt.demo_2d(n_init, budget, kernel, save_path=save_path)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": null,
390 | "metadata": {
391 | "collapsed": true,
392 | "scrolled": false,
393 | "slideshow": {
394 | "slide_type": "skip"
395 | }
396 | },
397 | "outputs": [],
398 | "source": [
399 | "import io\n",
400 | "import base64\n",
401 | "from IPython.display import HTML\n",
402 | "\n",
403 | "video = io.open(save_path, 'r+b').read()\n",
404 | "encoded = base64.b64encode(video)\n",
405 | "HTML(data='''\n",
406 | " \n",
407 | " '''.format(encoded.decode('ascii')))"
408 | ]
409 | },
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {
413 | "collapsed": true,
414 | "slideshow": {
415 | "slide_type": "slide"
416 | }
417 | },
418 | "source": [
419 | "### Hyperparameters tuning\n",
420 | "\n",
421 | "* Almost all machine learning models have hyperparameters\n",
422 | "* Quality of the model depends on the hyperparameters\n",
423 | "* Quality estimation for one set of hyperparameters can take long time\n",
424 | "* => Bayesian optimization can be used for hyperparameters tuning."
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {
430 | "slideshow": {
431 | "slide_type": "slide"
432 | }
433 | },
434 | "source": [
435 | "#### Bayesian optimization for hyperparameter tuning\n",
436 | "\n",
437 | "Objective function to optimize\n",
438 | "* Takes hyperparameters as input\n",
439 | "* Builds a model (maybe several times in case of cross-validation)\n",
440 | "* Calculates and returns model quality"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": null,
446 | "metadata": {
447 | "collapsed": true,
448 | "slideshow": {
449 | "slide_type": "skip"
450 | }
451 | },
452 | "outputs": [],
453 | "source": [
454 | "from sklearn.ensemble import RandomForestRegressor\n",
455 | "from sklearn.model_selection import train_test_split, cross_val_score\n",
456 | "from sklearn.preprocessing import StandardScaler\n",
457 | "\n",
458 | "from IPython import display"
459 | ]
460 | },
461 | {
462 | "cell_type": "markdown",
463 | "metadata": {
464 | "slideshow": {
465 | "slide_type": "skip"
466 | }
467 | },
468 | "source": [
469 | "##### House pricing dataset\n",
470 | "\n",
471 | "In this task you need to predict House Sale Price. There are 25 numerical input features like lot area, overall condition rating, house quality, number of kitchens and so on (there were a lot of categorical variables which we removed in this example for simplicity).\n",
472 | "\n",
473 | "We are going to tune XGBoost parameters using Bayesian Optimization to obtain more accurate model."
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {
480 | "collapsed": true,
481 | "slideshow": {
482 | "slide_type": "skip"
483 | }
484 | },
485 | "outputs": [],
486 | "source": [
487 | "data = np.loadtxt('house_pricing.csv')\n",
488 | "\n",
489 | "X = data[:, :-1]\n",
490 | "y = data[:, -1:]\n",
491 | "\n",
492 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)"
493 | ]
494 | },
495 | {
496 | "cell_type": "markdown",
497 | "metadata": {
498 | "slideshow": {
499 | "slide_type": "skip"
500 | }
501 | },
502 | "source": [
503 | "We implement `model_error_cv()` function that will be our objective function. \n",
504 | "We are going to use RBF kernel in our Bayesian Optimization, the result of optimization will be continuous variables,\n",
505 | "so we need to preprocess parameters - cast integer parameters to int."
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": null,
511 | "metadata": {
512 | "collapsed": true,
513 | "slideshow": {
514 | "slide_type": "skip"
515 | }
516 | },
517 | "outputs": [],
518 | "source": [
519 | "def wrap_parameters(parameters, scaler=None):\n",
520 | " if scaler:\n",
521 | " parameters = scaler.transform(parameters)\n",
522 | " return parameters\n",
523 | "\n",
524 | "\n",
525 | "def unwrap_parameters(parameters, scaler=None):\n",
526 | " if scaler:\n",
527 | " parameters = scaler.inverse_transform(parameters)\n",
528 | " p = [int(parameters[0]), parameters[1], int(parameters[2]),\n",
529 | " max(0, min(parameters[3], 1))]\n",
530 | " return p\n",
531 | "\n",
532 | "\n",
533 | "def model_error_cv(parameters, X, y, scaler=None):\n",
534 | " errors = []\n",
535 | " for p in parameters:\n",
536 | " p = unwrap_parameters(p, scaler)\n",
537 | " model = xgboost.XGBRegressor(max_depth=p[0],\n",
538 | " learning_rate=p[1],\n",
539 | " n_estimators=p[2],\n",
540 | " subsample=p[3],\n",
541 | " )\n",
542 | "\n",
543 | " score = cross_val_score(model, X, y, cv=3).mean()\n",
544 | " errors.append(score)\n",
545 | " return np.array(errors).reshape(-1, 1)"
546 | ]
547 | },
548 | {
549 | "cell_type": "markdown",
550 | "metadata": {
551 | "slideshow": {
552 | "slide_type": "skip"
553 | }
554 | },
555 | "source": [
556 | "We scale the parameters using StandardScaler() from sklearn - it is nice to have all the parameters with unit variance and mean zero\n",
557 | "when using RBF kernel as it is easier to tune lengthscale parameters, because these parameters depend on the range of input variables."
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": null,
563 | "metadata": {
564 | "collapsed": true,
565 | "slideshow": {
566 | "slide_type": "skip"
567 | }
568 | },
569 | "outputs": [],
570 | "source": [
571 | "# xgboost params: max_depth, learning_rate, n_estimators, subsample\n",
572 | "lower_bound = np.array([1, 0.001, 100, 0.2])\n",
573 | "upper_bound = np.array([6, 0.1, 1000, 1])\n",
574 | "\n",
575 | "np.random.seed(42)\n",
576 | "n_init_points = 10\n",
577 | "initial_parameters = np.random.rand(n_init_points, len(lower_bound)) * (upper_bound - lower_bound) + lower_bound\n",
578 | "initial_errors = -model_error_cv(initial_parameters, X, y)\n",
579 | "\n",
580 | "scaler = StandardScaler()\n",
581 | "scaler.fit(initial_parameters)\n",
582 | "lower_bound = scaler.transform(lower_bound)\n",
583 | "upper_bound = scaler.transform(upper_bound)\n",
584 | "initial_parameters = wrap_parameters(initial_parameters, scaler)"
585 | ]
586 | },
587 | {
588 | "cell_type": "markdown",
589 | "metadata": {
590 | "slideshow": {
591 | "slide_type": "skip"
592 | }
593 | },
594 | "source": [
595 |     "It is also a nice idea to explicitly constrain the lengthscale parameter: it shouldn't be much larger, and it shouldn't be much smaller, than the typical distance between points in the training set."
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": null,
601 | "metadata": {
602 | "collapsed": true,
603 | "slideshow": {
604 | "slide_type": "skip"
605 | }
606 | },
607 | "outputs": [],
608 | "source": [
609 | "kernel = GPy.kern.RBF(len(lower_bound), lengthscale=(upper_bound - lower_bound).min() / n_init_points, ARD=False)\n",
610 | "gp_model = GPy.models.GPRegression(initial_parameters, initial_errors, kernel=kernel)\n",
611 | "gp_model.rbf.lengthscale.constrain_bounded(0.001, 10)\n",
612 | "gp_model.optimize()\n",
613 | "print(gp_model)\n",
614 | "print(gp_model.rbf.lengthscale)"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": null,
620 | "metadata": {
621 | "collapsed": true,
622 | "scrolled": false,
623 | "slideshow": {
624 | "slide_type": "-"
625 | }
626 | },
627 | "outputs": [],
628 | "source": [
629 | "budget = 40\n",
630 | "\n",
631 | "hyperparameters = initial_parameters\n",
632 | "errors = initial_errors\n",
633 | "error_history = [-initial_errors[:i].min() for i in range(1, n_init_points + 1)]\n",
634 | "objective = lambda x: -model_error_cv(x, X, y, scaler)\n",
635 | "for i in range(budget):\n",
636 | " hyperparameters, errors, gp_model = bayes_opt.optimization_step(hyperparameters, errors, kernel, objective,\n",
637 | " lb=lower_bound, ub=upper_bound)\n",
638 | " error_history.append(-errors.min())\n",
639 | " # Visualize\n",
640 | " display.clear_output(wait=True)\n",
641 | " pyplot.figure(figsize=(8, 6))\n",
642 | " \n",
643 | " pyplot.xlabel(\"#iteration\")\n",
644 | " pyplot.ylabel(\"R2\")\n",
645 | " pyplot.plot(error_history)\n",
646 | " pyplot.show()\n",
647 | " \n",
648 | " print(\"New parameters: {}, new error:\\t{}\\nbest parameters: {}, best error:\\t{}\".format(\n",
649 | " unwrap_parameters(hyperparameters[-1], scaler), -errors[-1, 0],\n",
650 | " unwrap_parameters(hyperparameters[errors.argmin()], scaler), -errors.min()))\n",
651 | " print(gp_model.rbf.lengthscale)"
652 | ]
653 | }
654 | ],
655 | "metadata": {
656 | "kernelspec": {
657 | "display_name": "Python 3",
658 | "language": "python",
659 | "name": "python3"
660 | },
661 | "language_info": {
662 | "codemirror_mode": {
663 | "name": "ipython",
664 | "version": 3
665 | },
666 | "file_extension": ".py",
667 | "mimetype": "text/x-python",
668 | "name": "python",
669 | "nbconvert_exporter": "python",
670 | "pygments_lexer": "ipython3",
671 | "version": "3.5.2"
672 | }
673 | },
674 | "nbformat": 4,
675 | "nbformat_minor": 2
676 | }
677 |
--------------------------------------------------------------------------------
/sem4-GP/2d_demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/2d_demo.mp4
--------------------------------------------------------------------------------
/sem4-GP/3_LargeScaleGP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "slideshow": {
7 | "slide_type": "slide"
8 | }
9 | },
10 | "source": [
11 | "### Large-scale GP\n",
12 | "\n",
13 | "Predictive mean and variance of GPR model:\n",
14 | "$$\n",
15 | "m(x_*) = \\mathbf{k}^T \\mathbf{K}_y^{-1} \\mathbf{y},\n",
16 | "$$\n",
17 | "$$\n",
18 | "\\sigma^2(x_*) = k(x_*, x_*) - \\mathbf{k}^T\\mathbf{K}_y^{-1}\\mathbf{k}\n",
19 | "$$\n",
20 | "\n",
21 | "**Issue**: the computational complexity is $\\mathcal{O}(N^3)$, where $N$ is the training size."
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {
27 | "slideshow": {
28 | "slide_type": "slide"
29 | }
30 | },
31 | "source": [
32 | "### Nystrom approximation\n",
33 | "\n",
34 | "Idea: introduce inducing points $(X_u, \\mathbf{u})$ which are used for low-rank approximation of covariance matrix:\n",
35 | "$$\n",
36 | "\\mathbf{K} \\approx \\mathbf{K}_{NM} \\mathbf{K}_{MM}^{-1} \\mathbf{K}_{MN}, \\quad \\mathbf{K}_{NM} = k(X, X_u), \\mathbf{K}_{MM} = k(X_u, X_u), \\mathbf{K}_{MN} = k(X_u, X)\n",
37 | "$$\n",
38 | "\n",
39 |     "Predictive distribution:\n",
40 | "$$\n",
41 | "f_* \\; | \\; x_*, X, \\mathbf{y} \\sim \\mathcal{N}\\left (m(x_*), \\; \\sigma^2(x_*)\\right ),\n",
42 | "$$\n",
43 | "$$\n",
44 | "m(x_*) = \\mathbf{k}^T \\left (\\mathbf{K}_{NM}\\mathbf{K}_{MM}^{-1} \\mathbf{K}_{MN} + \\sigma_n^2 I \\right )^{-1} \\mathbf{y}\n",
45 | "$$\n",
46 | "$$\n",
47 | "\\sigma^2(x_*) = k(x_*, x_*) - \\mathbf{k}^T\\left (\\mathbf{K}_{NM}\\mathbf{K}_{MM}^{-1}\\mathbf{K}_{MN} + \\sigma^2_n I \\right)^{-1} k(x_*, x_*)\n",
48 | "$$"
49 | ]
50 | },
51 | {
52 | "cell_type": "markdown",
53 | "metadata": {
54 | "slideshow": {
55 | "slide_type": "slide"
56 | }
57 | },
58 | "source": [
59 | "Using Woodbury matrix identity we can calculate the inverse more efficiently:\n",
60 | "$$\n",
61 | "\\left (\\sigma_n^2 I + \\mathbf{K}_{NM}\\mathbf{K}_{MM}^{-1} \\mathbf{K}_{MN} \\right)^{-1} = \\sigma_n^{-2} \\left (\n",
62 | "I - \\mathbf{K}_{NM} \\left (\\sigma_n^2 \\mathbf{K}_{MM} + \\mathbf{K}_{MN} \\mathbf{K}_{NM} \\right )^{-1} \\mathbf{K}_{MN}\n",
63 | "\\right )\n",
64 | "$$\n",
65 | "\n",
66 | "The computational complexity is $\\mathcal{O}(NM^2)$."
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": null,
72 | "metadata": {
73 | "collapsed": true,
74 | "slideshow": {
75 | "slide_type": "skip"
76 | }
77 | },
78 | "outputs": [],
79 | "source": [
80 | "from __future__ import print_function\n",
81 | "\n",
82 | "import pandas as pd\n",
83 | "import numpy as np\n",
84 | "import GPy\n",
85 | "from sklearn.model_selection import train_test_split\n",
86 | "from sklearn.metrics import mean_squared_error, r2_score\n",
87 | "from sklearn.preprocessing import StandardScaler\n",
88 | "from sklearn.model_selection import cross_val_predict\n",
89 | "from sklearn.pipeline import Pipeline\n",
90 | "from sklearn import svm\n",
91 | "\n",
92 | "from matplotlib import pyplot\n",
93 | "\n",
94 | "%matplotlib notebook"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {
100 | "slideshow": {
101 | "slide_type": "skip"
102 | }
103 | },
104 | "source": [
105 | "For convenience we wrap GPy model to have sklearn-like API to use it in `cross_val_predict()` function from sklearn\n",
106 | "\n",
107 | "Note, that in this implementation we generate random inducing inputs and fix them. However, inducing points can be optimized."
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "collapsed": true,
115 | "slideshow": {
116 | "slide_type": "skip"
117 | }
118 | },
119 | "outputs": [],
120 | "source": [
121 | "from sklearn.base import BaseEstimator\n",
122 | "\n",
123 | "class SparseGPModel(BaseEstimator):\n",
124 | " def __init__(self, kernel, num_inducing=100):\n",
125 | " self.kernel_ = kernel\n",
126 | " self.num_inducing = num_inducing\n",
127 | " \n",
128 | " def fit(self, X, y):\n",
129 | " idx = np.random.permutation(X.shape[0])\n",
130 | " Z = X[idx[:self.num_inducing]]\n",
131 | " self.model_ = GPy.models.SparseGPRegression(X, y, kernel=self.kernel_, Z=Z)\n",
132 | " self.model_.inducing_inputs.fix()\n",
133 | "\n",
134 | " self.model_.optimize(max_iters=100)\n",
135 | " \n",
136 | " def predict(self, X):\n",
137 | " prediction, _ = self.model_.predict(X)\n",
138 | " return prediction\n",
139 | " \n",
140 | " def score(self, X, y):\n",
141 | " prediction = self.predict(X)\n",
142 | " return r2_score(y, prediction)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {
148 | "slideshow": {
149 | "slide_type": "skip"
150 | }
151 | },
152 | "source": [
153 | "Let's load house pricing data again."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "collapsed": true,
161 | "slideshow": {
162 | "slide_type": "skip"
163 | }
164 | },
165 | "outputs": [],
166 | "source": [
167 | "data = np.loadtxt('house_pricing.csv')\n",
168 | "\n",
169 | "scaler = StandardScaler()\n",
170 | "\n",
171 | "X = scaler.fit_transform(data[:, :-1])\n",
172 | "y = data[:, -1:]\n",
173 | "y_log = np.log(y)\n",
174 | "\n",
175 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "collapsed": true,
183 | "scrolled": false,
184 | "slideshow": {
185 | "slide_type": "skip"
186 | }
187 | },
188 | "outputs": [],
189 | "source": [
190 | "%%time\n",
191 | "kernel = GPy.kern.RBF(X.shape[1])\n",
192 | "\n",
193 | "model = SparseGPModel(kernel, num_inducing=100)\n",
194 | "prediction = cross_val_predict(model, X, np.log1p(y), cv=3, n_jobs=1)"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "collapsed": true,
202 | "slideshow": {
203 | "slide_type": "skip"
204 | }
205 | },
206 | "outputs": [],
207 | "source": [
208 | "prediction = np.expm1(prediction)\n",
209 | "R2 = r2_score(y, prediction)\n",
210 | "print(R2)\n",
211 | "\n",
212 | "def scatter_plot(y_test, prediction):\n",
213 | " pyplot.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--k')\n",
214 | " pyplot.scatter(y_test, prediction)\n",
215 | " pyplot.xlabel('Actual value')\n",
216 | " pyplot.ylabel('Predicted value')\n",
217 | " pyplot.show()\n",
218 | " \n",
219 | "scatter_plot(y, prediction)"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "slideshow": {
226 | "slide_type": "skip"
227 | }
228 | },
229 | "source": [
230 | "### Task\n",
231 | "\n",
232 | "For different number of inducing points (100, 200, 300, 500) build GP model and plot figure of how model accuracy and building time changes."
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {
239 | "collapsed": true,
240 | "slideshow": {
241 | "slide_type": "skip"
242 | }
243 | },
244 | "outputs": [],
245 | "source": [
246 | "######## Your code here ########\n",
247 | "import time\n",
248 | "\n",
249 | "n_inducing = [100, 200, 300, 500]\n",
250 | "errors = []\n",
251 | "times = []\n"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {
257 | "slideshow": {
258 | "slide_type": "skip"
259 | }
260 | },
261 | "source": [
262 | "Plot figures"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {
269 | "collapsed": true,
270 | "scrolled": false,
271 | "slideshow": {
272 | "slide_type": "skip"
273 | }
274 | },
275 | "outputs": [],
276 | "source": [
277 | "figure, ax = pyplot.subplots(1, 2, figsize=(6, 3))\n",
278 | "ax[0].plot(n_inducing, errors, '.', label='R2')\n",
279 | "ax[0].plot(n_inducing, errors, '-', label='R2')\n",
280 | "ax[1].plot(n_inducing, times, '.', label='Training time')\n",
281 | "ax[1].plot(n_inducing, times, '-', label='Training time')\n",
282 | "figure.tight_layout()\n",
283 | "pyplot.show()"
284 | ]
285 | },
286 | {
287 | "cell_type": "markdown",
288 | "metadata": {
289 | "slideshow": {
290 | "slide_type": "slide"
291 | }
292 | },
293 | "source": [
294 | "### Random Fourier Features\n",
295 | "\n",
296 | "Idea: shift-invariant kernel can be represented as\n",
297 | "$$\n",
298 | "k(x, y) = k(x - y) = \\int p(w) e^{jw^T(x - y)} dw\n",
299 | "$$\n",
300 | "\n",
301 | "Let's calculate integral approximately by Monte Carlo\n",
302 | "$$\n",
303 | "k(x, y) \\approx \\frac{1}{M} \\sum_{i=1}^M \\phi_i^T(x) \\phi_i(y),\n",
304 | "$$\n",
305 | "where $\\phi_i(x) = \\sqrt{2}\\cos(w^Tx + b)$, $w \\sim p(w), b \\sim Uniform([0, 2\\pi])$.\n",
306 | "\n",
307 | "This means that the covariance matrix is approximated by $\\mathbf{K} = \\Phi \\Phi^T$, where $\\Phi = \\|\\boldsymbol{\\phi}(x_i)\\|_{i = 1}^N, \\quad \\boldsymbol{\\phi}(x) = (\\phi_1(x), \\ldots, \\phi_M(x))$"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {
313 | "slideshow": {
314 | "slide_type": "slide"
315 | }
316 | },
317 | "source": [
318 | "So, go back from functional space view to weight-space view:\n",
319 | "$$\n",
320 | "y = \\beta^T\\phi(x) + \\varepsilon, \\quad \\beta \\sim \\mathcal{N}(0, \\; \\Sigma), \\quad \\varepsilon \\sim \\mathcal{N}(0, \\; \\sigma_n^2)\n",
321 | "$$\n",
322 | "The predictive distribution in this case:\n",
323 | "$$\n",
324 | "f_* \\; | \\; x_*, X, \\mathbf{y} = \\mathcal{N}\\left (\\frac{1}{\\sigma_n^2}\\boldsymbol{\\phi}(x_*)^TA^{-1}\\Phi^T \\mathbf{y},\\;\n",
325 | "\\boldsymbol{\\phi}(x_*)^T A^{-1}\\boldsymbol{\\phi}(x_*)\n",
326 | "\\right ), \\quad A = \\sigma_n^{-2}\\Phi^T \\Phi + \\Sigma^{-1}\n",
327 | "$$\n",
328 | "The computational complexity is $\\mathcal{O}(NM^2)$."
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {},
334 | "source": [
335 | "### Task\n",
336 | "\n",
337 | "Implement generation of RFF"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "metadata": {
344 | "collapsed": true,
345 | "slideshow": {
346 | "slide_type": "skip"
347 | }
348 | },
349 | "outputs": [],
350 | "source": [
351 | "from sklearn.base import BaseEstimator\n",
352 | "from sklearn.exceptions import NotFittedError\n",
353 | "from scipy.stats import cauchy, laplace\n",
354 | "from sklearn.metrics.pairwise import rbf_kernel, laplacian_kernel\n",
355 | "\n",
356 | "\n",
357 | "class RFF(BaseEstimator):\n",
358 | " def __init__(self, gamma=1, n_components=50, kernel=\"rbf\"):\n",
359 | " self.gamma = gamma\n",
360 | " self.kernel = kernel\n",
361 | " # Number of features (Monte Carlo samples)\n",
362 | " self.n_components = n_components\n",
363 | " self.fitted = False\n",
364 | " \n",
365 | " def fit(self, X, y=None):\n",
366 | " \"\"\" Generates MonteCarlo random samples \"\"\"\n",
367 | " d = X.shape[1]\n",
368 | " \n",
369 |     "        ######## Your code here ########\n",
370 | " #Generate D iid samples from p(w)\n",
371 | " \n",
372 | " if self.kernel == \"rbf\": # for RBF kernel p(w) ~ exp(-gamma * w^2)\n",
373 | " self.w = \n",
374 | " elif self.kernel == \"laplace\": # for Laplace distribution p(w) ~ Cauchy(gamma)\n",
375 | " self.w = \n",
376 | " \n",
377 | " #Generate D iid samples from Uniform(0,2*pi) \n",
378 | " self.u = \n",
379 | " self.fitted = True\n",
380 | " return self\n",
381 | " \n",
382 | " def transform(self, X):\n",
383 | " \"\"\" Transforms the data X (n_samples, n_features) to the new map space Z(X) (n_samples, n_components)\"\"\"\n",
384 | " if not self.fitted:\n",
385 | " raise NotFittedError(\"RBF_MonteCarlo must be fitted beform computing the feature map Z\")\n",
386 | " \n",
387 |     "        ######## Your code here ########\n",
388 | " #Compute feature map Z(x):\n",
389 | " Z = \n",
390 | " return Z\n",
391 | " \n",
392 | " def compute_kernel(self, X):\n",
393 | " \"\"\" Computes the approximated kernel matrix K \"\"\"\n",
394 | " if not self.fitted:\n",
395 | " raise NotFittedError(\"RBF_MonteCarlo must be fitted beform computing the kernel matrix\")\n",
396 | " Z = self.transform(X)\n",
397 | " K = Z.dot(Z.T)\n",
398 | " return K"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "metadata": {},
404 | "source": [
405 | "Now, generate 100 random points from [0, 1]^d, calculate exact kernel matrix for RBF and Exponential kernels,\n",
406 | "calculate their approximations using RFF and check that they are close."
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "metadata": {
413 | "collapsed": true
414 | },
415 | "outputs": [],
416 | "source": [
417 | "######## Your code here ########\n",
418 | "dim = 4\n",
419 | "rbf = GPy.kern.RBF(dim, lengthscale=1 / np.sqrt(2)).K\n",
420 | "\n",
421 | "exponential = GPy.kern.Exponential(dim).K\n",
422 | "\n",
423 | "np.random.seed(42)\n",
424 | "x = np.random.rand(100, dim)\n",
425 | "\n",
426 | "######## Your code here ########\n",
427 | "# 1. Calculate exact kernel matrix for RBF kernel and Exponential kernels\n",
428 | "# 2. Calculate approximations using RFF\n",
429 | "# 3. Calculate approximation error ||K_exact - K_approx|| / ||K_exact|| and check whether the norm is small\n"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {
435 | "slideshow": {
436 | "slide_type": "skip"
437 | }
438 | },
439 | "source": [
440 | "### Task\n",
441 | "\n",
442 |     "For different numbers of random Fourier features (100, 200, 300, 500) build a GP model using RFF and plot how model accuracy and building time change."
443 | ]
444 | },
445 | {
446 | "cell_type": "code",
447 | "execution_count": null,
448 | "metadata": {
449 | "collapsed": true,
450 | "slideshow": {
451 | "slide_type": "skip"
452 | }
453 | },
454 | "outputs": [],
455 | "source": [
456 | "######## Your code here ########\n",
457 | "# Hint: use Pipeline from sklearn\n",
458 | "\n",
459 | "n_inducing = [100, 200, 300, 500]\n",
460 | "errors = []\n",
461 | "times = []\n"
462 | ]
463 | },
464 | {
465 | "cell_type": "markdown",
466 | "metadata": {
467 | "slideshow": {
468 | "slide_type": "skip"
469 | }
470 | },
471 | "source": [
472 | "Plot figures"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": null,
478 | "metadata": {
479 | "collapsed": true,
480 | "slideshow": {
481 | "slide_type": "skip"
482 | }
483 | },
484 | "outputs": [],
485 | "source": [
486 | "figure, ax = pyplot.subplots(1, 2, figsize=(6, 3))\n",
487 | "ax[0].plot(n_inducing, errors, '.', label='R2')\n",
488 | "ax[0].plot(n_inducing, errors, '-', label='R2')\n",
489 | "ax[0].legend(loc='best')\n",
490 | "ax[1].plot(n_inducing, times, '.', label='Training time')\n",
491 | "ax[1].plot(n_inducing, times, '-', label='Training time')\n",
492 | "ax[1].legend(loc='best')\n",
493 | "figure.tight_layout()\n",
494 | "\n",
495 | "pyplot.show()"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {
502 | "collapsed": true
503 | },
504 | "outputs": [],
505 | "source": []
506 | }
507 | ],
508 | "metadata": {
509 | "kernelspec": {
510 | "display_name": "Python 3",
511 | "language": "python",
512 | "name": "python3"
513 | },
514 | "language_info": {
515 | "codemirror_mode": {
516 | "name": "ipython",
517 | "version": 3
518 | },
519 | "file_extension": ".py",
520 | "mimetype": "text/x-python",
521 | "name": "python",
522 | "nbconvert_exporter": "python",
523 | "pygments_lexer": "ipython3",
524 | "version": "3.5.2"
525 | }
526 | },
527 | "nbformat": 4,
528 | "nbformat_minor": 2
529 | }
530 |
--------------------------------------------------------------------------------
/sem4-GP/EI_vs_logEI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/EI_vs_logEI.png
--------------------------------------------------------------------------------
/sem4-GP/airline.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/airline.mat
--------------------------------------------------------------------------------
/sem4-GP/airline_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/airline_result.png
--------------------------------------------------------------------------------
/sem4-GP/bayes_opt.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from scipy.stats import norm
4 | from scipy.optimize import minimize
5 |
6 | from matplotlib import pyplot
7 |
8 | import GPy
9 |
10 |
def lower_confidence_bound(mean_values, std_values, coefficient=2):
    """LCB acquisition: predictive mean minus `coefficient` standard deviations.

    Both inputs are flattened, so any (N, 1) GPy prediction shapes are accepted;
    the result is a 1-D array. Lower values are more promising (minimization).
    """
    means = np.ravel(mean_values)
    deviations = np.ravel(std_values)
    return means - coefficient * deviations
14 |
def log_expected_improvement(mean_values, variance_values, opt_value):
    """Logarithm of the Expected Improvement acquisition (minimization).

    Computed on the log scale for numerical stability:
        log EI = log(sigma) + log phi(Z) + log(1 + Z * Phi(Z) / phi(Z)),
    where Z = (opt_value - mu - eps) / sigma. A small exploration margin
    eps = 0.05 / N is subtracted from the improvement. Points whose
    predictive std is below 1e-6 get Z = 0.
    """
    mu = mean_values.ravel()
    eps = 0.05 / len(mu)
    improvement = (opt_value - mu - eps).ravel()
    sigma = (variance_values ** 0.5).ravel()

    # Avoid dividing by (numerically) zero std: leave Z at 0 there.
    z = np.zeros_like(improvement)
    stable = np.where(sigma > 1e-6)[0]
    z[stable] = improvement[stable] / sigma[stable]

    log_ei = np.log(sigma) + norm.logpdf(z) + np.log(1 + z * np.exp(norm.logcdf(z) - norm.logpdf(z)))
    return log_ei
28 |
29 |
def expected_improvement(mean_values, std_values, opt_values):
    """Expected Improvement acquisition (minimization).

    EI(x) = d * Phi(d / sigma) + sigma * phi(d / sigma), where
    d = opt - mu is the predicted improvement over the best observed value
    (only the first element of `opt_values` is used, matching the original API).

    Bug fix: the original divided by `std_values` unguarded, producing
    NaN/inf where the predictive std is zero (e.g. at training points).
    At sigma == 0 the EI limit is max(d, 0), which is returned instead.
    """
    improvement = (opt_values.ravel()[0] - mean_values).ravel()
    std_values = std_values.ravel()

    # Substitute 1.0 where std == 0 so the division never warns; those
    # entries are overwritten with the analytic zero-std limit below.
    safe_std = np.where(std_values > 0, std_values, 1.0)
    z = improvement / safe_std
    ei = improvement * norm.cdf(z) + std_values * norm.pdf(z)
    return np.where(std_values > 0, ei, np.maximum(improvement, 0.0))
35 |
36 |
def get_new_point(model, lb, ub, data=None, multistart=10, criterion='ei', k=1, random_state=None):
    """Maximize the acquisition function over the box [lb, ub].

    Starts L-BFGS-B from `multistart` uniformly random points and returns
    the best (x, criterion_value) found.

    Parameters
    ----------
    model : GP model with a `predict(x) -> (mean, variance)` method.
    lb, ub : array-like lower/upper bounds of the search box.
    data : tuple (x_train, y_train); y_train.min() is the incumbent for EI.
    multistart : number of random restarts for the local optimizer.
    criterion : 'ei' (negated log-EI is minimized) or 'lcb'.
    k : exploration coefficient for the LCB criterion.
    random_state : optional np.random.RandomState for reproducibility.
    """
    if random_state is None:
        random_state = np.random.RandomState()

    lb = np.array(lb).reshape(1, -1)
    ub = np.array(ub).reshape(1, -1)
    x_random = random_state.uniform(size=(multistart, lb.ravel().shape[0]))
    x_random *= ub - lb
    x_random += lb

    def objective(x):
        # minimize() passes 1-D points; model.predict expects 2-D.
        if x.ndim == 1:
            x = x.reshape(1, -1)
        mean_values, variance = model.predict(x)
        if criterion == 'ei':
            return -log_expected_improvement(mean_values, variance, data[1].min())
        elif criterion == 'lcb':
            # Bug fix: `std_values` was undefined here, so criterion='lcb'
            # raised NameError. The LCB needs the predictive std.
            return lower_confidence_bound(mean_values, np.sqrt(variance), k)
        else:
            raise NotImplementedError('Criterion is not implemented!')

    best_result = None
    best_value = np.inf
    for x_init in x_random:
        optimization_result = minimize(objective, x_init, method='L-BFGS-B', bounds=np.vstack((lb, ub)).T)

        if optimization_result.fun < best_value:
            best_result = optimization_result
            # `fun` may be a length-1 array (our objective returns arrays).
            best_value = np.atleast_1d(best_result.fun)[0]
    return best_result.x, best_result.fun
69 |
70 |
def optimization_step(x_train, y_train, kernel, objective, lb=None, ub=None, criterion='ei', k=1, plot=False):
    """Perform a single Bayesian-optimization iteration.

    Fits a GP regression model to the current data (with restarts), picks
    the next point by maximizing the acquisition criterion, evaluates the
    objective there, and returns the augmented (x, y) arrays and the model.
    """
    gp = GPy.models.GPRegression(x_train, y_train, kernel)
    gp.optimize_restarts(num_restarts=10, verbose=False)

    next_point, acquisition_value = get_new_point(
        gp, data=(x_train, y_train), lb=lb, ub=ub, criterion=criterion, k=k)

    if plot:
        plot1d(x_train, y_train, gp, objective, next_point, acquisition_value)
        pyplot.show()

    next_point = next_point.reshape(1, -1)
    new_value = np.asarray(objective(next_point)).reshape(1, -1)
    return np.vstack([x_train, next_point]), np.vstack([y_train, new_value]), gp
84 |
85 |
def plot1d(x_train, y_train, model, objective, x_new, criterion_value):
    """Visualize one 1-D BO step on [0, 1]: training data, the true function,
    the GP mean with a 2-sigma band, and the newly selected point."""
    grid = np.linspace(0, 1, 100).reshape(-1, 1)
    true_values = objective(grid)

    mean, variance = model.predict(grid)
    mean = mean.ravel()
    sigma = np.sqrt(variance).ravel()

    pyplot.figure(figsize=(8, 6))
    pyplot.plot(x_train, y_train, 'or', markersize=8, label='Training set')
    pyplot.plot(grid, true_values, '--b', linewidth=2, label='True function')
    pyplot.plot(grid, mean, '-k', linewidth=2, label='Approximation')
    # 95% (2-sigma) predictive band around the GP mean.
    pyplot.fill_between(grid.ravel(), mean - 2 * sigma, mean + 2 * sigma, alpha=0.3)
    pyplot.plot(x_new, objective(x_new), 'og', markersize=10, label='New point')
    pyplot.ylim([-15, 20])
    pyplot.legend(loc='best')
103 |
104 |
def plot2d(objective, x_train, y_train, model):
    """Plot four heatmaps on a 50x50 grid over [-1, 1]^2: GP mean, exact
    objective values, predictive std, and log-EI, with training points
    (red dots) and the most recent point (green diamond) overlaid.
    """
    grid_size = 50
    x = np.meshgrid(np.linspace(-1, 1, grid_size), np.linspace(-1, 1, grid_size))
    x = np.hstack((x[0].reshape(-1, 1), x[1].reshape(-1, 1)))
    y = objective(x)

    prediction, variance = model.predict(x)
    std = np.sqrt(variance).ravel()

    # Map training points from [-1, 1]^2 into heatmap pixel coordinates.
    x_train = (x_train + 1) * grid_size / 2
    # Bug fix: log_expected_improvement expects the predictive *variance*
    # (it applies ** 0.5 internally); the original passed std, effectively
    # using sqrt(std) and distorting the log-EI panel.
    log_EI = np.exp(log_expected_improvement(prediction, variance, y_train.min()))

    values = [prediction, y, std, log_EI]
    names = ['Predicted values', 'Exact values', 'Predicted std', 'log EI']

    figure, axes = pyplot.subplots(nrows=2, ncols=2, figsize=(6, 6))

    for i, ax in enumerate(axes.ravel()):
        if i < 3:
            # Mean / exact / std panels share a fixed [0, 1] color scale.
            ax.imshow(values[i].reshape(grid_size, grid_size), vmin=0, vmax=1, alpha=0.8)
        else:
            ax.imshow(values[i].reshape(grid_size, grid_size), alpha=0.8)
        ax.scatter(x_train[:-1, 0], x_train[:-1, 1], c='r', s=20)
        ax.scatter(x_train[-1, 0], x_train[-1, 1], marker='d', edgecolor='k', c='g', s=180)
        ax.set_xlim([-0.5, grid_size + 0.5])
        ax.set_ylim([-0.5, grid_size + 0.5])
        ax.axis('off')
        ax.set_title(names[i])

    figure.tight_layout()
135 |
136 |
def demo_2d(n_init, budget, kernel, save_path='./library/2d_demo.mp4'):
    """Render a 2-D Bayesian-optimization demo as an mp4 animation.

    Starts from `n_init` random points on [-1, 1]^2, runs `budget` BO
    iterations on a fixed test function, and saves per-iteration heatmaps
    (GP mean, exact values, log-EI) plus a convergence plot to `save_path`.
    Requires ffmpeg for the matplotlib 'ffmpeg_file' writer.
    """
    # Globals are needed so the animation callback below can rebind them.
    global x_train, y_train, model

    def f2d(x):
        # Fixed 2-D test objective; returns a column vector of values.
        t = np.sum((x + 0.6)**2, axis=1) - 0.3
        y = np.sin(t)**2 / np.tanh(t**2 + 0.4)
        return y.reshape(-1, 1)

    lb = [-1, -1]
    ub = [1, 1]
    np.random.seed(42)
    x_train = np.random.rand(n_init, 2) * 2 - 1
    y_train = f2d(x_train)

    # Initial GP fit on the random design.
    model = GPy.models.GPRegression(x_train, y_train, kernel)
    model.optimize()

    # Set up formatting for the movie files
    import matplotlib.animation as animation
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    Writer = animation.writers['ffmpeg_file']
    writer = Writer(fps=1, metadata=dict(artist='Yermek Kapushev'))

    # Evaluation grid for the heatmaps: grid_size x grid_size over [-1, 1]^2.
    grid_size = 50
    x = np.meshgrid(np.linspace(-1, 1, grid_size), np.linspace(-1, 1, grid_size))
    x = np.hstack((x[0].reshape(-1, 1), x[1].reshape(-1, 1)))
    y = f2d(x)


    def get_model_values(model, x, x_train):
        # Values shown in the three heatmap panels for the current model.
        prediction, variance = model.predict(x)
        std = np.sqrt(variance).ravel()

        # NOTE(review): log_expected_improvement takes *variance* as its
        # second argument (it applies ** 0.5 internally), but std is passed
        # here — the log-EI panel is computed with sqrt(std); confirm intent.
        log_EI = np.exp(log_expected_improvement(prediction, std, y_train.min()))

        values = [prediction, y, log_EI]
        return values


    values = get_model_values(model, x, x_train)
    history = [y_train.min()]  # best observed value per iteration
    names = ['Predicted values', 'Exact values', 'log EI']

    # Set up initial canvas
    figure, axes = pyplot.subplots(nrows=2, ncols=2, figsize=(6, 6))
    heatmaps = []
    scatters = []
    new_point_scatters = []
    for i, ax in enumerate(axes.ravel()[:-1]):
        heatmaps.append(ax.matshow(values[i].reshape(grid_size, grid_size), alpha=0.8))
        # Map training points from [-1, 1]^2 to heatmap pixel coordinates.
        x_scatter = (x_train + 1) * grid_size / 2
        scatters.append(ax.scatter(x_scatter[:-1, 0], x_scatter[:-1, 1], c='r', s=20))
        new_point_scatters.append(ax.scatter(x_scatter[-1, 0], x_scatter[-1, 1], marker='d', edgecolor='k',
                                             c='g', s=180))

        divider = make_axes_locatable(ax)
        cax = divider.append_axes("right", size="5%", pad=0.05)
        figure.colorbar(heatmaps[-1], cax=cax)
        ax.set_xlim([-0.5, grid_size + 0.5])
        ax.set_ylim([-0.5, grid_size + 0.5])
        ax.axis('off')
        ax.set_title(names[i])

    # Fourth panel: convergence of the best observed value.
    convergence_plot = axes.ravel()[-1].plot([y_train.shape[0]], [y_train.min()], '-')
    axes.ravel()[-1].set_xlabel('iteration')
    axes.ravel()[-1].set_ylabel(r'$y_{min}$')
    axes.ravel()[-1].set_xlim([n_init - 1, n_init + budget])
    axes.ravel()[-1].set_ylim([0, 0.0073])
    figure.tight_layout()


    # Define function that updates figure
    def update_fig(iteration):
        # Rebinds the module-level training data and model each frame.
        global x_train, y_train, model
        # global y_train
        # global model

        # Frame 0 just shows the initial state without a BO step.
        if iteration == 0:
            return heatmaps + scatters + new_point_scatters + convergence_plot

        # Refit the GP on all data gathered so far, reusing the same kernel.
        model = GPy.models.GPRegression(x_train, y_train, model.kern)
        model.optimize()

        # One BO step: pick the acquisition maximizer and evaluate it.
        x_new, criterion = get_new_point(model, lb, ub, data=(x_train, y_train), multistart=10, random_state=None)
        x_new = x_new.reshape(1, -1)
        x_train = np.vstack([x_train, x_new])
        y_train = np.vstack([y_train, f2d(x_new)])
        history.append(y_train.min())

        values = get_model_values(model, x, x_train)

        for i, val in enumerate(values):
            heatmaps[i].set_array(val.reshape(grid_size, -1))
            x_scatter = (x_train + 1) * grid_size / 2
            scatters[i].set_offsets(x_scatter[:-1])
            new_point_scatters[i].set_offsets(x_scatter[-1:])

            # adjust colorbar for std and log EI plot
            vmin = val.min()
            vmax = val.max()
            heatmaps[i].set_clim(vmin, vmax)

        convergence_plot[0].set_data(range(n_init, y_train.shape[0] + 1), history)

        return heatmaps + scatters + new_point_scatters + convergence_plot



    anim = animation.FuncAnimation(figure, update_fig,
                                   blit=False,
                                   repeat=False,
                                   frames=budget)
    anim.save(save_path, writer=writer)
--------------------------------------------------------------------------------