├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── sem2-classify&generate ├── 1_my_first_nn_lasagne ├── 1_my_first_nn_lsagne.ipynb ├── 2_ae_complete.ipynb ├── 2_ae_with_gaps.ipynb ├── 3_vae_complete.ipynb ├── 3_vae_with_gaps.ipynb ├── 4_ss_vae.ipynb ├── mnist.py └── utils.py ├── sem3-attention ├── Attention_seminar (Start here).ipynb ├── Captioning_seminar.ipynb ├── attention_part1_solution.ipynb ├── data │ ├── Dog-and-Cat.jpg │ └── svd.pcl └── pretrained_lenet.py ├── sem4-GP ├── 1_GP_basics.ipynb ├── 1_GP_basics_filled.ipynb ├── 2_BayesOpt.ipynb ├── 2_BayesOpt_filled.ipynb ├── 2d_demo.mp4 ├── 3_LargeScaleGP.ipynb ├── 3_LargeScaleGP_filled.ipynb ├── EI_vs_logEI.png ├── airline.mat ├── airline_result.png ├── bayes_opt.py └── house_pricing.csv └── sem5-gan └── seminar.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # deepbayes2017 -------------------------------------------------------------------------------- /sem2-classify&generate/1_my_first_nn_lasagne: -------------------------------------------------------------------------------- 1 | Список мест с критическими ошибками в первом ноутбуке: 2 | - Размер фильтров первого сверточного слоя 3 | - Процедуры инициализации параметров сети 4 | - Параметр слоя dropout 5 | - Нелинейность на выходе классификатора 6 | - Accuracy в качестве функции потерь при обучении 7 | - Momentum и learning_rate у оптимизатора -------------------------------------------------------------------------------- /sem2-classify&generate/1_my_first_nn_lsagne.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Theano

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "!pip install numpy matplotlib \n", 19 | "!pip install --upgrade https://github.com/Theano/Theano/archive/master.zip\n", 20 | "!pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Разминка" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "import theano\n", 39 | "import theano.tensor as T\n", 40 | "\n", 41 | "%pylab inline" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "#### будущий параметр функции -- символьная переменная" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "N = T.scalar('a dimension', dtype='float32')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "#### рецепт получения квадрата -- операции над символьными переменными" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "result = T.power(N, 2)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#### theano.grad(cost, wrt)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "grad_result = theano.grad(result, N) " 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "#### компиляция функции \"получения квадрата\"" 103 | ] 104 | }, 105 | { 106 | 
"cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "sq_function = theano.function(inputs=[N], outputs=result)\n", 114 | "gr_function = theano.function(inputs=[N], outputs=grad_result)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "#### применение функции" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "# Заводим np.array x\n", 133 | "xv = np.arange(-10, 10)\n", 134 | "\n", 135 | "# Применяем функцию к каждому x\n", 136 | "val = map(float, [sq_function(x) for x in xv])\n", 137 | "\n", 138 | "# Посчитаем градиент в каждой точке\n", 139 | "grad = map(float, [gr_function(x) for x in xv])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "### Что мы увидим если нарисуем функцию и градиент?" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "pylab.plot(xv, val, label='x*x')\n", 158 | "pylab.plot(xv, grad, label='d x*x / dx')\n", 159 | "pylab.legend()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "

Lasagne

\n", 167 | "\n", 168 | "* lasagne - это библиотека для написания нейронок произвольной формы на theano\n", 169 | "* В качестве демо-задачи выберем то же распознавание чисел, но на большем масштабе задачи, картинки 28x28, 10 цифр" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "from mnist import load_dataset\n", 181 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()\n", 182 | "\n", 183 | "print 'X размера', X_train.shape, 'y размера', y_train.shape" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "collapsed": false 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "fig, axes = plt.subplots(nrows=1, ncols=7, figsize=(20, 20))\n", 195 | "\n", 196 | "for i, ax in enumerate(axes):\n", 197 | " ax.imshow(X_train[i, 0], cmap='gray')" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Давайте посмотрим на DenseLayer в lasagne\n", 205 | "- http://lasagne.readthedocs.io/en/latest/modules/layers/dense.html\n", 206 | "- https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/dense.py#L16-L124 \n", 207 | "- Весь содержательный код тут https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/dense.py#L121 " 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "import lasagne\n", 219 | "from lasagne import init\n", 220 | "from theano import tensor as T\n", 221 | "from lasagne.nonlinearities import softmax\n", 222 | "\n", 223 | "X, y = T.tensor4('X'), T.vector('y', 'int32')" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "Так задаётся архитектура нейронки" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 
"execution_count": null, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "#входной слой (вспомогательный)\n", 242 | "net = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=X)\n", 243 | "\n", 244 | "net = lasagne.layers.Conv2DLayer(net, 15, 28, pad='valid', W=init.Constant()) # сверточный слой\n", 245 | "net = lasagne.layers.Conv2DLayer(net, 10, 2, pad='full', W=init.Constant()) # сверточный слой\n", 246 | "\n", 247 | "net = lasagne.layers.DenseLayer(net, num_units=500) # полносвязный слой\n", 248 | "net = lasagne.layers.DropoutLayer(net, 1.0) # регуляризатор\n", 249 | "net = lasagne.layers.DenseLayer(net, num_units=200) # полносвязный слой\n", 250 | "\n", 251 | "net = lasagne.layers.DenseLayer(net, num_units=10) # полносвязный слой" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "#предсказание нейронки (theano-преобразование)\n", 263 | "y_predicted = lasagne.layers.get_output(net)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": false 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "#все веса нейронки (shared-переменные)\n", 275 | "all_weights = lasagne.layers.get_all_params(net)\n", 276 | "print all_weights" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "collapsed": true 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "#функция ошибки и точности будет прямо внутри\n", 288 | "loss = lasagne.objectives.categorical_accuracy(y_predicted, y).mean()\n", 289 | "accuracy = lasagne.objectives.categorical_accuracy(y_predicted, y).mean()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "#сразу посчитать 
словарь обновлённых значений с шагом по градиенту, как раньше\n", 301 | "updates = lasagne.updates.momentum(loss, all_weights, learning_rate=1.0, momentum=1.5)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "#функция, делает updates и возвращает значение функции потерь и точности\n", 313 | "train_fun = theano.function([X, y], [loss, accuracy], updates=updates)\n", 314 | "accuracy_fun = theano.function([X, y], accuracy) # точность без обновления весов, для теста" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "# Процесс обучения" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": { 328 | "collapsed": false, 329 | "scrolled": false 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "import time \n", 334 | "from mnist import iterate_minibatches\n", 335 | "\n", 336 | "num_epochs = 5 #количество проходов по данным\n", 337 | "batch_size = 50 #размер мини-батча\n", 338 | "\n", 339 | "for epoch in range(num_epochs):\n", 340 | " train_err, train_acc, train_batches, start_time = 0, 0, 0, time.time()\n", 341 | " for inputs, targets in iterate_minibatches(X_train, y_train, batch_size):\n", 342 | " train_err_batch, train_acc_batch = train_fun(inputs, targets)\n", 343 | " train_err += train_err_batch\n", 344 | " train_acc += train_acc_batch\n", 345 | " train_batches += 1\n", 346 | "\n", 347 | " val_acc, val_batches = 0, 0\n", 348 | " for inputs, targets in iterate_minibatches(X_test, y_test, batch_size):\n", 349 | " val_acc += accuracy_fun(inputs, targets)\n", 350 | " val_batches += 1\n", 351 | "\n", 352 | " \n", 353 | " print \"Epoch %s of %s took %.3f s\" % (epoch + 1, num_epochs, time.time() - start_time)\n", 354 | " print \" train loss:\\t %.3f\" % (train_err / train_batches)\n", 355 | " print \" train acc:\\t %.3f\" % (train_acc * 100 
/ train_batches), '%'\n", 356 | " print \" test acc:\\t %.3f\" % (val_acc * 100 / val_batches), '%'\n", 357 | " print" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "test_acc = 0\n", 369 | "test_batches = 0\n", 370 | "for batch in iterate_minibatches(X_test, y_test, 500):\n", 371 | " inputs, targets = batch\n", 372 | " acc = accuracy_fun(inputs, targets)\n", 373 | " test_acc += acc\n", 374 | " test_batches += 1\n", 375 | "print(\"Final results: \\n test accuracy:\\t\\t{:.2f} %\".format(test_acc / test_batches * 100))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "# Ансамблирование с DropOut" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "collapsed": true 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "#предсказание нейронки (theano-преобразование)\n", 394 | "y_predicted = T.mean([lasagne.layers.get_output(net, deterministic=False) for i in range(10)], axis=0)\n", 395 | "accuracy = lasagne.objectives.categorical_accuracy(y_predicted, y).mean()\n", 396 | "accuracy_fun = theano.function([X, y], accuracy) # точность без обновления весов, для теста" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "test_acc = 0\n", 408 | "test_batches = 0\n", 409 | "for batch in iterate_minibatches(X_test, y_test, 500):\n", 410 | " inputs, targets = batch\n", 411 | " acc = accuracy_fun(inputs, targets)\n", 412 | " test_acc += acc\n", 413 | " test_batches += 1\n", 414 | "print(\"Final results: \\n test accuracy:\\t\\t{:.2f} %\".format(test_acc / test_batches * 100))" 415 | ] 416 | } 417 | ], 418 | "metadata": { 419 | "anaconda-cloud": {}, 420 | "kernelspec": { 421 | "display_name": "Python 2", 422 | 
"language": "python", 423 | "name": "python2" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 2 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython2", 435 | "version": "2.7.10" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 1 440 | } 441 | -------------------------------------------------------------------------------- /sem2-classify&generate/2_ae_complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Автокодировщик" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "collapsed": false, 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import time\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import theano\n", 27 | "import theano.tensor as T\n", 28 | "import lasagne\n", 29 | "\n", 30 | "import matplotlib.pylab as plt\n", 31 | "from utils import load_dataset, iterate_minibatches\n", 32 | "%matplotlib inline\n", 33 | "\n", 34 | "BATCH_SIZE = 20\n", 35 | "HIDDEN_DIM = 2\n", 36 | "\n", 37 | "num_epochs = 128\n", 38 | "\n", 39 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "deletable": true, 46 | "editable": true 47 | }, 48 | "source": [ 49 | "## Обучение модели\n", 50 | "\n", 51 | "tl;dr: Автокодировщик может быть использован для построения маломерных признаков данных без разметки.\n", 52 | "\n", 53 | "В процессе обучения строится пара отображений $E: \\mathbb R^D \\rightarrow R^d$ (кодировщик) и $D: \\mathbb R^d \\rightarrow R^D$ (декодировщик), чья композиция приближает тождественное отображение:\n", 
54 | "\n", 55 | "$$ D(E(x)) \\approx x $$" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true, 63 | "deletable": true, 64 | "editable": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "# Определим кодировщик и декодировщик с помощью пары полносвязных нейронных сетей\n", 69 | "\n", 70 | "def ae_encoder(input_var):\n", 71 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var)\n", 72 | " ######################################################################################\n", 73 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую HIDDEN_DIM-мерный код #\n", 74 | " # Какие функции активации можно поставить на выход сети? #\n", 75 | " ######################################################################################\n", 76 | " l_hid1 = lasagne.layers.DenseLayer(\n", 77 | " l_in, num_units=128,\n", 78 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 79 | " W=lasagne.init.GlorotUniform(),\n", 80 | " name='e_hid1')\n", 81 | " l_hid2 = lasagne.layers.DenseLayer(\n", 82 | " l_hid1, num_units=64,\n", 83 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 84 | " name='e_hid2')\n", 85 | " l_out = lasagne.layers.DenseLayer(\n", 86 | " l_hid2, num_units=HIDDEN_DIM,\n", 87 | " nonlinearity=None,\n", 88 | " name='e_out')\n", 89 | " return l_out\n", 90 | "\n", 91 | "\n", 92 | "def ae_decoder(input_var):\n", 93 | " l_in = lasagne.layers.InputLayer(shape=(None, HIDDEN_DIM), input_var=input_var)\n", 94 | " ##################################################################################################\n", 95 | " # Реализуйте некоторую несложную архитектуру декодировщика, возвращающую батч объектов размера (1, 28, 28) #\n", 96 | " ##################################################################################################\n", 97 | " l_hid1 = lasagne.layers.DenseLayer(\n", 98 | " l_in, num_units=64,\n", 99 | " 
nonlinearity=lasagne.nonlinearities.rectify,\n", 100 | " W=lasagne.init.GlorotUniform(),\n", 101 | " name='d_hid1')\n", 102 | " l_hid2 = lasagne.layers.DenseLayer(\n", 103 | " l_hid1, num_units=128,\n", 104 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 105 | " name='d_hid2')\n", 106 | " l_out = lasagne.layers.DenseLayer(\n", 107 | " l_hid2, num_units=28 * 28,\n", 108 | " nonlinearity=lasagne.nonlinearities.sigmoid,\n", 109 | " name='d_out')\n", 110 | " l_out = lasagne.layers.reshape(l_out, shape=(-1, 1, 28, 28))\n", 111 | " return l_out" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true, 119 | "deletable": true, 120 | "editable": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "# Инициализируем сеть\n", 125 | "input_x = T.tensor4('input_x')\n", 126 | " \n", 127 | "encoder = ae_encoder(input_x)\n", 128 | "decoder = ae_decoder(\n", 129 | " lasagne.layers.get_output(encoder)\n", 130 | ")" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": { 136 | "deletable": true, 137 | "editable": true 138 | }, 139 | "source": [ 140 | "Для обучения автокодировщика будем использовать среднеквадратичную ошибку\n", 141 | "\n", 142 | "$$ L(X) = \\frac{1}{N}\\sum_{i=1}^{N} \\sum_{j=1}^{28^2} \\left( D(E(x_i))_j - x_{i,j} \\right)^2 = \\frac{1}{N}\\sum_{i=1}^{N} (D(E(x_i)) - x_i)^T (D(E(x_i)) - x_i) $$" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "collapsed": false, 150 | "deletable": true, 151 | "editable": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "#####################################################################################\n", 156 | "# Определите операцию для вычисления функции потерь, а также создайте список параметров модели #\n", 157 | "# для передачи в оптимизатор #\n", 158 | "loss = lasagne.objectives.squared_error(\n", 159 | " lasagne.layers.get_output(decoder), input_x\n", 
160 | ").sum(axis=(1, 2, 3)).mean()\n", 161 | "params = lasagne.layers.get_all_params([encoder, decoder])\n", 162 | "#####################################################################################\n", 163 | "\n", 164 | "updates = lasagne.updates.adam(loss, params)\n", 165 | " \n", 166 | "train = theano.function(\n", 167 | " [input_x],\n", 168 | " loss,\n", 169 | " updates=updates\n", 170 | ")\n", 171 | "test_loss = theano.function(\n", 172 | " [input_x],\n", 173 | " loss\n", 174 | ")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "deletable": true, 181 | "editable": true 182 | }, 183 | "source": [ 184 | "Обучение, как и во многих других случаях, выполяется с помощью стохастического градиентного спуска" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": false, 192 | "deletable": true, 193 | "editable": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "for epoch in range(num_epochs):\n", 198 | " train_err = 0\n", 199 | " train_batches = 0\n", 200 | " start_time = time.time()\n", 201 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n", 202 | " train_err += train(batch)\n", 203 | " train_batches += 1\n", 204 | " \n", 205 | " test_err = 0\n", 206 | " test_batches = 0\n", 207 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n", 208 | " test_err += test_loss(batch)\n", 209 | " test_batches += 1\n", 210 | " \n", 211 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n", 212 | " epoch + 1, num_epochs, time.time() - start_time))\n", 213 | " print(\"Train error {}\".format(train_err/train_batches))\n", 214 | " print(\"Test error {}\".format(test_err/test_batches))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "deletable": true, 221 | "editable": true 222 | }, 223 | "source": [ 224 | "## Визуализация\n", 225 | "\n", 226 | "Модель с двумерными скрытыми переменными легко 
визуализировать. Определим две функции: одну для построения пропущенных через автокодировщик изображений, вторую для вычисления скрытых представлений по изображению" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false, 234 | "deletable": true, 235 | "editable": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "from utils import plot_reconstructions, plot_hidden_space\n", 240 | "\n", 241 | "reconstruct = theano.function(\n", 242 | " [input_x],\n", 243 | " lasagne.layers.get_output(decoder)\n", 244 | ")\n", 245 | "\n", 246 | "encode = theano.function(\n", 247 | " [input_x],\n", 248 | " lasagne.layers.get_output(encoder)\n", 249 | ")" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "deletable": true, 256 | "editable": true 257 | }, 258 | "source": [ 259 | "Примеры изображений, пропущенных через автокодировщик: " 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": false, 267 | "deletable": true, 268 | "editable": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "plot_reconstructions(X_test, reconstruct)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "deletable": true, 279 | "editable": true 280 | }, 281 | "source": [ 282 | "Визуализация признакового пространства. Насколько пространство простое? Везде ли оно плотно? Как выбрать точку в этом пространстве, которая будет соответствовать коду какого-то объекта?" 
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false, 290 | "deletable": true, 291 | "editable": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "plot_hidden_space(X_test[:1000], encode)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "deletable": true, 302 | "editable": true 303 | }, 304 | "source": [ 305 | "Попробуйте погенерировать изображения по паре координат" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": true, 313 | "deletable": true, 314 | "editable": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "input_z = T.matrix('input_z')\n", 319 | "\n", 320 | "decode_a_code = theano.function(\n", 321 | " [input_z],\n", 322 | " lasagne.layers.get_output(decoder, input_z),\n", 323 | ")\n", 324 | "\n", 325 | "def generate_from_code(x, y):\n", 326 | " img = decode_a_code([[x, y]]).reshape((28, 28))\n", 327 | " plt.imshow(img, 'gray')" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "collapsed": false, 335 | "deletable": true, 336 | "editable": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "generate_from_code(50., 20.)" 341 | ] 342 | } 343 | ], 344 | "metadata": { 345 | "kernelspec": { 346 | "display_name": "Python 3", 347 | "language": "python", 348 | "name": "python3" 349 | }, 350 | "language_info": { 351 | "codemirror_mode": { 352 | "name": "ipython", 353 | "version": 3 354 | }, 355 | "file_extension": ".py", 356 | "mimetype": "text/x-python", 357 | "name": "python", 358 | "nbconvert_exporter": "python", 359 | "pygments_lexer": "ipython3", 360 | "version": "3.5.1" 361 | } 362 | }, 363 | "nbformat": 4, 364 | "nbformat_minor": 2 365 | } 366 | -------------------------------------------------------------------------------- /sem2-classify&generate/2_ae_with_gaps.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Автокодировщик" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "collapsed": false, 18 | "deletable": true, 19 | "editable": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import time\n", 24 | "\n", 25 | "import numpy as np\n", 26 | "import theano\n", 27 | "import theano.tensor as T\n", 28 | "import lasagne\n", 29 | "\n", 30 | "import matplotlib.pylab as plt\n", 31 | "from utils import load_dataset, iterate_minibatches\n", 32 | "%matplotlib inline\n", 33 | "\n", 34 | "BATCH_SIZE = 20\n", 35 | "HIDDEN_DIM = 2\n", 36 | "\n", 37 | "num_epochs = 40\n", 38 | "\n", 39 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "deletable": true, 46 | "editable": true 47 | }, 48 | "source": [ 49 | "## Обучение модели\n", 50 | "\n", 51 | "tl;dr: Автокодировщик может быть использован для построения маломерных признаков данных без разметки.\n", 52 | "\n", 53 | "В процессе обучения строится пара отображений $E: \\mathbb R^D \\rightarrow R^d$ (кодировщик) и $D: \\mathbb R^d \\rightarrow R^D$ (декодировщик), чья композиция приближает тождественное отображение:\n", 54 | "\n", 55 | "$$ D(E(x)) \\approx x $$" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true, 63 | "deletable": true, 64 | "editable": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "# Определим кодировщик и декодировщик с помощью пары полносвязных нейронных сетей\n", 69 | "\n", 70 | "def ae_encoder(input_var):\n", 71 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input_var)\n", 72 | " 
######################################################################################\n", 73 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую HIDDEN_DIM-мерный код #\n", 74 | " # Какие функции активации можно поставить на выход сети? #\n", 75 | " ######################################################################################\n", 76 | " return l_out\n", 77 | "\n", 78 | "\n", 79 | "def ae_decoder(input_var):\n", 80 | " l_in = lasagne.layers.InputLayer(shape=(None, HIDDEN_DIM), input_var=input_var)\n", 81 | " ##################################################################################################\n", 82 | " # Реализуйте некоторую несложную архитектуру декодировщика, возвращающую батч объектов размера (1, 28, 28) #\n", 83 | " ##################################################################################################\n", 84 | " return l_out" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true, 92 | "deletable": true, 93 | "editable": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Инициализируем сеть\n", 98 | "input_x = T.tensor4('input_x')\n", 99 | " \n", 100 | "encoder = ae_encoder(input_x)\n", 101 | "decoder = ae_decoder(\n", 102 | " lasagne.layers.get_output(encoder)\n", 103 | ")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "deletable": true, 110 | "editable": true 111 | }, 112 | "source": [ 113 | "Для обучения автокодировщика будем использовать среднеквадратичную ошибку\n", 114 | "\n", 115 | "$$ L(X) = \\frac{1}{N}\\sum_{i=1}^{N} \\sum_{j=1}^{28^2} \\left( D(E(x_i))_j - x_{i,j} \\right)^2 = \\frac{1}{N}\\sum_{i=1}^{N} (D(E(x_i)) - x_i)^T (D(E(x_i)) - x_i) $$" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false, 123 | "deletable": true, 124 | "editable": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | 
"#####################################################################################\n", 129 | "# Определите операцию для вычисления функции потерь, а также создайте список параметров модели #\n", 130 | "# для передачи в оптимизатор #\n", 131 | "loss = None\n", 132 | "params = None\n", 133 | "#####################################################################################\n", 134 | "\n", 135 | "updates = lasagne.updates.adam(loss, params)\n", 136 | " \n", 137 | "train = theano.function(\n", 138 | " [input_x],\n", 139 | " loss,\n", 140 | " updates=updates\n", 141 | ")\n", 142 | "test_loss = theano.function(\n", 143 | " [input_x],\n", 144 | " loss\n", 145 | ")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "deletable": true, 152 | "editable": true 153 | }, 154 | "source": [ 155 | "Обучение, как и во многих других случаях, выполяется с помощью стохастического градиентного спуска" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": false, 163 | "deletable": true, 164 | "editable": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "for epoch in range(num_epochs):\n", 169 | " train_err = 0\n", 170 | " train_batches = 0\n", 171 | " start_time = time.time()\n", 172 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n", 173 | " train_err += train(batch)\n", 174 | " train_batches += 1\n", 175 | " \n", 176 | " test_err = 0\n", 177 | " test_batches = 0\n", 178 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n", 179 | " test_err += test_loss(batch)\n", 180 | " test_batches += 1\n", 181 | " \n", 182 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n", 183 | " epoch + 1, num_epochs, time.time() - start_time))\n", 184 | " print(\"Train error {}\".format(train_err/train_batches))\n", 185 | " print(\"Test error {}\".format(test_err/test_batches))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 
191 | "deletable": true, 192 | "editable": true 193 | }, 194 | "source": [ 195 | "## Визуализация\n", 196 | "\n", 197 | "Модель с двумерными скрытыми переменными легко визуализировать. Определим две функции: одну для построения пропущенных через автокодировщик изображений, вторую для вычисления скрытых представлений по изображению" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false, 205 | "deletable": true, 206 | "editable": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "from utils import plot_reconstructions, plot_hidden_space\n", 211 | "\n", 212 | "reconstruct = theano.function(\n", 213 | " [input_x],\n", 214 | " lasagne.layers.get_output(decoder)\n", 215 | ")\n", 216 | "\n", 217 | "encode = theano.function(\n", 218 | " [input_x],\n", 219 | " lasagne.layers.get_output(encoder)\n", 220 | ")" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "deletable": true, 227 | "editable": true 228 | }, 229 | "source": [ 230 | "Примеры изображений, пропущенных через автокодировщик: " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": false, 238 | "deletable": true, 239 | "editable": true 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "plot_reconstructions(X_test, reconstruct)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": { 249 | "deletable": true, 250 | "editable": true 251 | }, 252 | "source": [ 253 | "Визуализация признакового пространства. Насколько пространство простое? Везде ли оно плотно? Как выбрать точку в этом пространстве, которая будет соответствовать коду какого-то объекта?" 
254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "collapsed": false, 261 | "deletable": true, 262 | "editable": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "plot_hidden_space(X_test[:1000], encode)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": { 272 | "deletable": true, 273 | "editable": true 274 | }, 275 | "source": [ 276 | "Попробуйте погенерировать изображения по паре координат" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "collapsed": true, 284 | "deletable": true, 285 | "editable": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "input_z = T.matrix('input_z')\n", 290 | "\n", 291 | "decode_a_code = theano.function(\n", 292 | " [input_z],\n", 293 | " lasagne.layers.get_output(decoder, input_z),\n", 294 | ")\n", 295 | "\n", 296 | "def generate_from_code(x, y):\n", 297 | " img = decode_a_code([[x, y]]).reshape((28, 28))\n", 298 | " plt.imshow(img, 'gray')" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": false, 306 | "deletable": true, 307 | "editable": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "generate_from_code(50., 20.)" 312 | ] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "Python 3", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.5.1" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 2 336 | } 337 | -------------------------------------------------------------------------------- /sem2-classify&generate/3_vae_complete.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Вариационный автокодировщик\n", 11 | "\n", 12 | "tl;dr: Вместо тождественного отображения вариационны автокодировщик выучивает вероятностую модель данных. Стохастическия вычисления и априорное распределение кодов дополнительно регуляризуют модель." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": false, 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import time\n", 26 | "\n", 27 | "import numpy as np\n", 28 | "import theano\n", 29 | "import theano.tensor as T\n", 30 | "import lasagne\n", 31 | "\n", 32 | "import matplotlib.pylab as plt\n", 33 | "%matplotlib inline\n", 34 | "\n", 35 | "from utils import load_dataset, iterate_minibatches\n", 36 | "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", 37 | "\n", 38 | "BATCH_SIZE = 20\n", 39 | "HIDDEN_DIM = 2\n", 40 | "\n", 41 | "num_epochs = 128\n", 42 | "\n", 43 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "deletable": true, 50 | "editable": true 51 | }, 52 | "source": [ 53 | "## Кратко о вариационных автокодировщиках\n", 54 | "\n", 55 | "Рассмотрим вариационный автокодировщик для бинарных наблюдений. 
Вариационный автокодировщик состоит из генеративной модели наблюдений\n", 56 | "\n", 57 | "\\begin{align}\n", 58 | "& p(x, z | \\theta) = p(x | z, \\theta) p(z) \\\\\n", 59 | "& p(x | z, \\theta) = \\prod_{i = 1}^D p_i(z, \\theta)^{x_i} (1 - p_i(z, \\theta))^{1 - x_i} \\\\\n", 60 | "& p(z) = \\mathcal N(z | 0, I)\n", 61 | "\\end{align}\n", 62 | "\n", 63 | "и приближенного апостериорного распределения\n", 64 | "\n", 65 | "\\begin{equation}\n", 66 | "q(z | x, \\phi) = \\mathcal N(z | \\mu(x, \\phi), \\operatorname{diag}(\\sigma^2(x, \\phi)))\n", 67 | "\\end{equation}\n", 68 | "\n", 69 | "Для краткости все выкладки приводятся для одного наблюдения $x$, параметры распределений по возможности опускаются. Для набора данных при обучении используется среднее значение нижней оценки. Цель обучения - максимизировать нижнюю оценку на обоснованность\n", 70 | "\n", 71 | "$$ \\mathcal L(x, \\theta, \\phi) = \\mathbb E_{q(z | x, \\phi)} p(x | z, \\theta) - \\operatorname{KL}(q(z | x, \\phi) || p(z )) = \\mathbb E_{q(z | x, \\phi)} \\log \\frac{p(x | z, \\phi)p(z)}{q(z | x, \\theta)} \\rightarrow \\max_{\\theta, \\phi} $$\n", 72 | "\n", 73 | "Как было рассказано на лекции, на практике нижняя оценка приближается оценкой \n", 74 | "\n", 75 | "\\begin{align*}\n", 76 | "&\\frac{1}{K} \\sum_{k=1}^K \\log \\frac{p(x | z_k)p(z_k)}{q(z_k | x)} \\\\\n", 77 | "& \\\\\n", 78 | "&z_k = \\mu(x, \\phi) + \\sigma^2(x, \\phi)^T \\varepsilon_k \\\\\n", 79 | "&\\varepsilon_k \\sim \\mathcal N(0, I), iid\n", 80 | "\\end{align*}\n", 81 | "\n", 82 | "с K=1, а затем максимизируется с помощью градиентного подъема.\n", 83 | "\n", 84 | "## Как это реализовать?\n", 85 | "\n", 86 | "Для вычисления приведенной выше нижней оценки необходимо уметь:\n", 87 | "1. Вычислять логарифм плотности всех распределений ($p(x | z)$, $p(z)$, $q(z | x)$)\n", 88 | "2. 
Сэмплировать из $q(z | x)$\n", 89 | "\n", 90 | "Следуя практике *tensorflow.distributions*, мы реализуем распределения как два класса с методами *log_prob()* и *sample()*" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true, 98 | "deletable": true, 99 | "editable": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "class BinaryVector():\n", 104 | " def __init__(self, logits, rng=None):\n", 105 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n", 106 | " self.logits = logits\n", 107 | "\n", 108 | " def log_prob(self, x):\n", 109 | " # возвращает вектор вероятностей для каждого объекта в батче\n", 110 | " pixelwise_log_probs = (\n", 111 | " x * (self.logits - T.nnet.softplus(self.logits))\n", 112 | " - (1 - x) * T.nnet.softplus(self.logits)\n", 113 | " )\n", 114 | " return T.sum(pixelwise_log_probs, axis=(1, 2, 3))\n", 115 | " \n", 116 | " def sample(self):\n", 117 | " shape = self.logits.shape\n", 118 | " return T.nnet.sigmoid(self.logits) >= self.rng.uniform(shape)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true, 126 | "deletable": true, 127 | "editable": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "class MultivariateNormalDiag():\n", 132 | " def __init__(self, loc=None, scale=None, rng=None):\n", 133 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n", 134 | " self.loc= loc\n", 135 | " self.scale = scale\n", 136 | " \n", 137 | " def log_prob(self, z):\n", 138 | " normalization_constant = (\n", 139 | " - 0.5 * np.log(2 * np.pi)\n", 140 | " - T.log(self.scale)\n", 141 | " )\n", 142 | " square_term = -0.5 * ((z - self.loc) / self.scale) ** 2\n", 143 | " log_prob_vec = normalization_constant + square_term\n", 144 | " return T.sum(log_prob_vec, axis=1)\n", 145 | " \n", 146 | " def sample(self):\n", 147 | " 
######################################################################\n", 148 | " # Сэмплирование из q(z | x) - ключевой момент в вариационном автокоидровщике #\n", 149 | " # Пользуясь методом self.rng.normal() реализуйте её самостоятельно #\n", 150 | " ######################################################################\n", 151 | " shape = self.loc.shape\n", 152 | " z = (self.loc + self.scale * self.rng.normal(shape))\n", 153 | " return z" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "deletable": true, 160 | "editable": true 161 | }, 162 | "source": [ 163 | "Для параметров распределений построим две сети. Обратите внимание, что кодировщик теперь возвращает и код, и параметр масштаба" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true, 171 | "deletable": true, 172 | "editable": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "def vae_encoder_mlp(input_x):\n", 177 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),\n", 178 | " input_var=input_x)\n", 179 | " ######################################################################################\n", 180 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую вектор среднего и вектор #\n", 181 | " # стандартных отклонений. Их размерность должны быть HIDDEN_DIM. Какие функции активаций ну-#\n", 182 | " # жно использовать? 
#\n", 183 | " ######################################################################################\n", 184 | " l_hid1 = lasagne.layers.DenseLayer(\n", 185 | " l_in, num_units=128,\n", 186 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 187 | " W=lasagne.init.GlorotUniform(),\n", 188 | " name='e_hid1')\n", 189 | " l_hid2 = lasagne.layers.DenseLayer(\n", 190 | " l_hid1, num_units=64,\n", 191 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 192 | " name='e_hid2')\n", 193 | " l_out_loc = lasagne.layers.DenseLayer(\n", 194 | " l_hid2, num_units=2,\n", 195 | " nonlinearity=None,\n", 196 | " name='e_mean')\n", 197 | " l_out_scale = lasagne.layers.DenseLayer(\n", 198 | " l_hid2, num_units=2,\n", 199 | " nonlinearity=lasagne.nonlinearities.softplus,\n", 200 | " name='e_scale')\n", 201 | " return l_out_loc, l_out_scale\n", 202 | "\n", 203 | "def vae_decoder_mlp(input_z):\n", 204 | " l_in = lasagne.layers.InputLayer(shape=(None, 2),\n", 205 | " input_var=input_z)\n", 206 | " l_hid1 = lasagne.layers.DenseLayer(\n", 207 | " l_in, num_units=64,\n", 208 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 209 | " W=lasagne.init.GlorotUniform(),\n", 210 | " name='d_hid1')\n", 211 | " l_hid2 = lasagne.layers.DenseLayer(\n", 212 | " l_hid1, num_units=128,\n", 213 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 214 | " name='d_hid2')\n", 215 | " l_out = lasagne.layers.DenseLayer(\n", 216 | " l_hid2, num_units=28 ** 2,\n", 217 | " nonlinearity=None,\n", 218 | " name='d_out')\n", 219 | " l_out = lasagne.layers.ReshapeLayer(l_out, shape=(-1, 1, 28, 28))\n", 220 | " return l_out" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "deletable": true, 227 | "editable": true 228 | }, 229 | "source": [ 230 | "## Строим граф вычислений \n", 231 | "\n", 232 | "Входы и модель вывода $q(z | x)$" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false, 240 | "deletable": true, 241 
| "editable": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "input_x = T.tensor4('inputs')\n", 246 | "#####################################################\n", 247 | "# Определите encoder_mean, encoder scale, затем #\n", 248 | "# определите объект для апостериорного распределения qz_x #\n", 249 | "####################################################\n", 250 | "encoder_mean, encoder_scale = vae_encoder_mlp(input_x)\n", 251 | "qz_x = MultivariateNormalDiag(\n", 252 | " lasagne.layers.get_output(encoder_mean), \n", 253 | " lasagne.layers.get_output(encoder_scale)\n", 254 | ")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "deletable": true, 261 | "editable": true 262 | }, 263 | "source": [ 264 | "Генеративная модель $p(x, z)$" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": true, 272 | "deletable": true, 273 | "editable": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "###################################################################\n", 278 | "# Определите параметр p(x | z) decoder_logits, затем #\n", 279 | "# определите объекты pz распределения p(z) и px_z распределения p(x | z) #\n", 280 | "###################################################################\n", 281 | "z = qz_x.sample()\n", 282 | "decoder_logits = vae_decoder_mlp(z)\n", 283 | "pz = MultivariateNormalDiag(T.zeros((BATCH_SIZE, HIDDEN_DIM)),\n", 284 | " T.ones((BATCH_SIZE, HIDDEN_DIM)))\n", 285 | "px_z = BinaryVector(\n", 286 | " lasagne.layers.get_output(decoder_logits)\n", 287 | ")" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": { 293 | "deletable": true, 294 | "editable": true 295 | }, 296 | "source": [ 297 | "ELBO и правила для обновления весов" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "collapsed": true, 305 | "deletable": true, 306 | "editable": true 307 | }, 308 | 
"outputs": [], 309 | "source": [ 310 | "########################################################################################\n", 311 | "# Пользуясь методами px_z, p_z, qz_x определите функцию потерь для вариационного автокодировщика #\n", 312 | "# При обучении значение функции потерь должно принимать значения порядка -100 (от -150 и выше) #\n", 313 | "# Создайте список параметров сети для передачи в оптимизатор #\n", 314 | "# Что использовать в качестве функции потерь? #\n", 315 | "elbo = T.mean(px_z.log_prob(input_x)\n", 316 | " + pz.log_prob(z)\n", 317 | " - qz_x.log_prob(z))\n", 318 | "params = lasagne.layers.get_all_params([encoder_mean,\n", 319 | " encoder_scale,\n", 320 | " decoder_logits])\n", 321 | "loss = -elbo\n", 322 | "########################################################################################\n", 323 | "updates = lasagne.updates.adam(loss, params)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": { 329 | "deletable": true, 330 | "editable": true 331 | }, 332 | "source": [ 333 | "Определяем функции" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": false, 341 | "deletable": true, 342 | "editable": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "train = theano.function(\n", 347 | " [input_x],\n", 348 | " elbo,\n", 349 | " updates=updates\n", 350 | ")\n", 351 | "\n", 352 | "elbo_at_test = theano.function(\n", 353 | " [input_x],\n", 354 | " elbo\n", 355 | ")" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "deletable": true, 362 | "editable": true 363 | }, 364 | "source": [ 365 | "И обучаем модель" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": false, 373 | "deletable": true, 374 | "editable": true 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "for epoch in range(num_epochs):\n", 379 | " train_elbo = 0\n", 380 | " 
train_batches = 0\n", 381 | " start_time = time.time()\n", 382 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n", 383 | " \"\"\"\n", 384 | " Обратите внимание, что тут предложенна вероятностная модель для бинарных данных.\n", 385 | " MNIST содержит черно-белые изображения с градациями серого.\n", 386 | " На практике при обучении автокодировщика получают бинарные данные, всякий раз положив случайно значение пикселя равным 0 или 1\n", 387 | " в зависимости от интенсивности пикселя в объекте из данных.\n", 388 | " Такой прием называется динамическая бинаризация, он эффективно расширяет обучающую выборку и приводит к лучшим значениям \n", 389 | " правдоподобия обученных моделей.\n", 390 | " \"\"\"\n", 391 | " batch = np.random.rand(*batch.shape) <= batch\n", 392 | " train_elbo += train(batch)\n", 393 | " train_batches += 1\n", 394 | " \n", 395 | " test_elbo = 0\n", 396 | " test_batches = 0\n", 397 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n", 398 | " batch = np.random.rand(*batch.shape) <= batch\n", 399 | " test_elbo += elbo_at_test(batch)\n", 400 | " test_batches += 1\n", 401 | " \n", 402 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n", 403 | " epoch + 1, num_epochs, time.time() - start_time))\n", 404 | " print(\"Train error {}\".format(train_elbo/train_batches))\n", 405 | " print(\"Test error {}\".format(test_elbo/test_batches))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": { 411 | "deletable": true, 412 | "editable": true 413 | }, 414 | "source": [ 415 | "## Что получается? 
Визуализации" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "collapsed": false, 423 | "deletable": true, 424 | "editable": true 425 | }, 426 | "outputs": [], 427 | "source": [ 428 | "from utils import plot_reconstructions, plot_hidden_space\n", 429 | "\n", 430 | "reconstruct = theano.function(\n", 431 | " [input_x],\n", 432 | " T.nnet.sigmoid(lasagne.layers.get_output(decoder_logits))\n", 433 | ")\n", 434 | "\n", 435 | "encode = theano.function(\n", 436 | " [input_x],\n", 437 | " qz_x.sample(),\n", 438 | ")" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": { 444 | "deletable": true, 445 | "editable": true 446 | }, 447 | "source": [ 448 | "Визуализируем среднее распределения $p(x | z)$" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": false, 456 | "deletable": true, 457 | "editable": true 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "plot_reconstructions(X_test, reconstruct)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "deletable": true, 468 | "editable": true 469 | }, 470 | "source": [ 471 | "Чем отличается пространство представлений автокоидровщика от пространства представлений вариационного автокоидровщика? Почему возникло различие?" 
472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": { 478 | "collapsed": false, 479 | "deletable": true, 480 | "editable": true 481 | }, 482 | "outputs": [], 483 | "source": [ 484 | "plot_hidden_space(X_test[:1000], encode)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "collapsed": false, 492 | "deletable": true, 493 | "editable": true 494 | }, 495 | "outputs": [], 496 | "source": [ 497 | "# рисуем по 25 сэмплов кода для каждого объекта\n", 498 | "x_test_repeated = np.repeat(X_test[:25], repeats=25, axis=0)\n", 499 | "plot_hidden_space(x_test_repeated, encode)" 500 | ] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 3", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.5.1" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 2 524 | } 525 | -------------------------------------------------------------------------------- /sem2-classify&generate/3_vae_with_gaps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Вариационный автокодировщик\n", 11 | "\n", 12 | "tl;dr: Вместо тождественного отображения вариационны автокодировщик выучивает вероятностую модель данных. Стохастическия вычисления и априорное распределение кодов дополнительно регуляризуют модель." 
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": false, 20 | "deletable": true, 21 | "editable": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import time\n", 26 | "\n", 27 | "import numpy as np\n", 28 | "import theano\n", 29 | "import theano.tensor as T\n", 30 | "import lasagne\n", 31 | "\n", 32 | "import matplotlib.pylab as plt\n", 33 | "%matplotlib inline\n", 34 | "\n", 35 | "from utils import load_dataset, iterate_minibatches\n", 36 | "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams\n", 37 | "\n", 38 | "BATCH_SIZE = 20\n", 39 | "HIDDEN_DIM = 2\n", 40 | "\n", 41 | "num_epochs = 10\n", 42 | "\n", 43 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "deletable": true, 50 | "editable": true 51 | }, 52 | "source": [ 53 | "## Кратко о вариационных автокодировщиках\n", 54 | "\n", 55 | "Рассмотрим вариационный автокодировщик для бинарных наблюдений. Вариационный автокодировщик состоит из генеративной модели наблюдений\n", 56 | "\n", 57 | "\\begin{align}\n", 58 | "& p(x, z | \\theta) = p(x | z, \\theta) p(z) \\\\\n", 59 | "& p(x | z, \\theta) = \\prod_{i = 1}^D p_i(z, \\theta)^{x_i} (1 - p_i(z, \\theta))^{1 - x_i} \\\\\n", 60 | "& p(z) = \\mathcal N(z | 0, I)\n", 61 | "\\end{align}\n", 62 | "\n", 63 | "и приближенного апостериорного распределения\n", 64 | "\n", 65 | "\\begin{equation}\n", 66 | "q(z | x, \\phi) = \\mathcal N(z | \\mu(x, \\phi), \\operatorname{diag}(\\sigma^2(x, \\phi)))\n", 67 | "\\end{equation}\n", 68 | "\n", 69 | "Для краткости все выкладки приводятся для одного наблюдения $x$, параметры распределений по возможности опускаются. Для набора данных при обучении используется среднее значение нижней оценки. 
Цель обучения - максимизировать нижнюю оценку на обоснованность\n", 70 | "\n", 71 | "$$ \\mathcal L(x, \\theta, \\phi) = \\mathbb E_{q(z | x, \\phi)} p(x | z, \\theta) - \\operatorname{KL}(q(z | x, \\phi) || p(z )) = \\mathbb E_{q(z | x, \\phi)} \\log \\frac{p(x | z, \\phi)p(z)}{q(z | x, \\theta)} \\rightarrow \\max_{\\theta, \\phi} $$\n", 72 | "\n", 73 | "Как было рассказано на лекции, на практике нижняя оценка приближается оценкой \n", 74 | "\n", 75 | "\\begin{align*}\n", 76 | "&\\frac{1}{K} \\sum_{k=1}^K \\log \\frac{p(x | z_k)p(z_k)}{q(z_k | x)} \\\\\n", 77 | "& \\\\\n", 78 | "&z_k = \\mu(x, \\phi) + \\sigma^2(x, \\phi)^T \\varepsilon_k \\\\\n", 79 | "&\\varepsilon_k \\sim \\mathcal N(0, I), iid\n", 80 | "\\end{align*}\n", 81 | "\n", 82 | "с K=1, а затем максимизируется с помощью градиентного подъема.\n", 83 | "\n", 84 | "## Как это реализовать?\n", 85 | "\n", 86 | "Для вычисления приведенной выше нижней оценки необходимо уметь:\n", 87 | "1. Вычислять логарифм плотности всех распределений ($p(x | z)$, $p(z)$, $q(z | x)$)\n", 88 | "2. 
Сэмплировать из $q(z | x)$\n", 89 | "\n", 90 | "Следуя практике *tensorflow.distributions*, мы реализуем распределения как два класса с методами *log_prob()* и *sample()*" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true, 98 | "deletable": true, 99 | "editable": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "class BinaryVector():\n", 104 | " def __init__(self, logits, rng=None):\n", 105 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n", 106 | " self.logits = logits\n", 107 | "\n", 108 | " def log_prob(self, x):\n", 109 | " # возвращает вектор вероятностей для каждого объекта в батче\n", 110 | " pixelwise_log_probs = (\n", 111 | " x * (self.logits - T.nnet.softplus(self.logits))\n", 112 | " - (1 - x) * T.nnet.softplus(self.logits)\n", 113 | " )\n", 114 | " return T.sum(pixelwise_log_probs, axis=(1, 2, 3))\n", 115 | " \n", 116 | " def sample(self):\n", 117 | " shape = self.logits.shape\n", 118 | " return T.nnet.sigmoid(self.logits) >= self.rng.uniform(shape)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true, 126 | "deletable": true, 127 | "editable": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "class MultivariateNormalDiag():\n", 132 | " def __init__(self, loc=None, scale=None, rng=None):\n", 133 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n", 134 | " self.loc= loc\n", 135 | " self.scale = scale\n", 136 | " \n", 137 | " def log_prob(self, z):\n", 138 | " normalization_constant = (\n", 139 | " - 0.5 * np.log(2 * np.pi)\n", 140 | " - T.log(self.scale)\n", 141 | " )\n", 142 | " square_term = -0.5 * ((z - self.loc) / self.scale) ** 2\n", 143 | " log_prob_vec = normalization_constant + square_term\n", 144 | " return T.sum(log_prob_vec, axis=1)\n", 145 | " \n", 146 | " def sample(self):\n", 147 | " 
######################################################################\n", 148 | " # Сэмплирование из q(z | x) - ключевой момент в вариационном автокоидровщике #\n", 149 | " # Пользуясь методом self.rng.normal() реализуйте её самостоятельно #\n", 150 | " ######################################################################\n", 151 | " return z" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": { 157 | "deletable": true, 158 | "editable": true 159 | }, 160 | "source": [ 161 | "Для параметров распределений построим две сети. Обратите внимание, что кодировщик теперь возвращает и код, и параметр масштаба" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": true, 169 | "deletable": true, 170 | "editable": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "def vae_encoder_mlp(input_x):\n", 175 | " l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28),\n", 176 | " input_var=input_x)\n", 177 | " ######################################################################################\n", 178 | " # Реализуйте некоторую несложную архитектуру кодировщика, возвращающую вектор среднего и вектор #\n", 179 | " # стандартных отклонений. Их размерность должны быть HIDDEN_DIM. Какие функции активаций ну-#\n", 180 | " # жно использовать? 
#\n", 181 | " ######################################################################################\n", 182 | " return l_out_loc, l_out_scale\n", 183 | "\n", 184 | "def vae_decoder_mlp(input_z):\n", 185 | " l_in = lasagne.layers.InputLayer(shape=(None, 2),\n", 186 | " input_var=input_z)\n", 187 | " l_hid1 = lasagne.layers.DenseLayer(\n", 188 | " l_in, num_units=64,\n", 189 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 190 | " W=lasagne.init.GlorotUniform(),\n", 191 | " name='d_hid1')\n", 192 | " l_hid2 = lasagne.layers.DenseLayer(\n", 193 | " l_hid1, num_units=128,\n", 194 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 195 | " name='d_hid2')\n", 196 | " l_out = lasagne.layers.DenseLayer(\n", 197 | " l_hid2, num_units=28 ** 2,\n", 198 | " nonlinearity=None,\n", 199 | " name='d_out')\n", 200 | " l_out = lasagne.layers.ReshapeLayer(l_out, shape=(-1, 1, 28, 28))\n", 201 | " return l_out" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "deletable": true, 208 | "editable": true 209 | }, 210 | "source": [ 211 | "## Строим граф вычислений \n", 212 | "\n", 213 | "Входы и модель вывода $q(z | x)$" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "collapsed": false, 221 | "deletable": true, 222 | "editable": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "input_x = T.tensor4('inputs')\n", 227 | "#####################################################\n", 228 | "# Определите encoder_mean, encoder scale, затем #\n", 229 | "# определите объект для апостериорного распределения qz_x #\n", 230 | "####################################################\n", 231 | "\n", 232 | "encoder_mean, encoder_scale = # ... \n", 233 | "qz_x = # MultivariateNormalDiag ... 
" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "deletable": true, 240 | "editable": true 241 | }, 242 | "source": [ 243 | "Генеративная модель $p(x, z)$" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true, 251 | "deletable": true, 252 | "editable": true 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "###################################################################\n", 257 | "# Определите параметр p(x | z) decoder_logits, затем #\n", 258 | "# определите объекты pz распределения p(z) и px_z распределения p(x | z) #\n", 259 | "###################################################################\n", 260 | "\n", 261 | "decoder_logits = # vae_decoder_mlp \n", 262 | "pz = # MultivariateNormalDiag ...\n", 263 | "px_z = # BinaryVector ..." 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "deletable": true, 270 | "editable": true 271 | }, 272 | "source": [ 273 | "ELBO и правила для обновления весов" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": true, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "########################################################################################\n", 287 | "# Пользуясь методами px_z, p_z, qz_x определите функцию потерь для вариационного автокодировщика #\n", 288 | "# При обучении значение функции потерь должно принимать значения порядка -100 (от -150 и выше) #\n", 289 | "# Создайте список параметров сети для передачи в оптимизатор #\n", 290 | "# Что использовать в качестве функции потерь? 
#\n", 291 | "elbo = None\n", 292 | "params = None\n", 293 | "loss = None\n", 294 | "########################################################################################\n", 295 | "updates = lasagne.updates.adam(loss, params)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "deletable": true, 302 | "editable": true 303 | }, 304 | "source": [ 305 | "Определяем функции" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": false, 313 | "deletable": true, 314 | "editable": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "train = theano.function(\n", 319 | " [input_x],\n", 320 | " elbo,\n", 321 | " updates=updates\n", 322 | ")\n", 323 | "\n", 324 | "elbo_at_test = theano.function(\n", 325 | " [input_x],\n", 326 | " elbo\n", 327 | ")" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "deletable": true, 334 | "editable": true 335 | }, 336 | "source": [ 337 | "И обучаем модель" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false, 345 | "deletable": true, 346 | "editable": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "for epoch in range(num_epochs):\n", 351 | " train_elbo = 0\n", 352 | " train_batches = 0\n", 353 | " start_time = time.time()\n", 354 | " for batch in iterate_minibatches(X_train, batchsize=BATCH_SIZE):\n", 355 | " \"\"\"\n", 356 | " Обратите внимание, что тут предложенна вероятностная модель для бинарных данных.\n", 357 | " MNIST содержит черно-белые изображения с градациями серого.\n", 358 | " На практике при обучении автокодировщика получают бинарные данные, всякий раз положив случайно значение пикселя равным 0 или 1\n", 359 | " в зависимости от интенсивности пикселя в объекте из данных.\n", 360 | " Такой прием называется динамическая бинаризация, он эффективно расширяет обучающую выборку и приводит к лучшим значениям \n", 
361 | " правдоподобия обученных моделей.\n", 362 | " \"\"\"\n", 363 | " batch = np.random.rand(*batch.shape) <= batch\n", 364 | " train_elbo += train(batch)\n", 365 | " train_batches += 1\n", 366 | " \n", 367 | " test_elbo = 0\n", 368 | " test_batches = 0\n", 369 | " for batch in iterate_minibatches(X_test, batchsize=BATCH_SIZE):\n", 370 | " batch = np.random.rand(*batch.shape) <= batch\n", 371 | " test_elbo += elbo_at_test(batch)\n", 372 | " test_batches += 1\n", 373 | " \n", 374 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n", 375 | " epoch + 1, num_epochs, time.time() - start_time))\n", 376 | " print(\"Train error {}\".format(train_elbo/train_batches))\n", 377 | " print(\"Test error {}\".format(test_elbo/test_batches))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "deletable": true, 384 | "editable": true 385 | }, 386 | "source": [ 387 | "## Что получается? Визуализации" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": { 394 | "collapsed": false, 395 | "deletable": true, 396 | "editable": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "from utils import plot_reconstructions, plot_hidden_space\n", 401 | "\n", 402 | "reconstruct = theano.function(\n", 403 | " [input_x],\n", 404 | " T.nnet.sigmoid(lasagne.layers.get_output(decoder_logits))\n", 405 | ")\n", 406 | "\n", 407 | "encode = theano.function(\n", 408 | " [input_x],\n", 409 | " qz_x.sample(),\n", 410 | ")" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Визуализируем среднее распределения $p(x | z)$" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false, 425 | "deletable": true, 426 | "editable": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "plot_reconstructions(X_test, reconstruct)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 
| "source": [ 437 | "Чем отличается пространство представлений автокоидровщика от пространства представлений вариационного автокоидровщика? Почему возникло различие?" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "collapsed": false, 445 | "deletable": true, 446 | "editable": true 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "plot_hidden_space(X_test[:1000], encode)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": false, 458 | "deletable": true, 459 | "editable": true 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "# рисуем по 25 сэмплов кода для каждого объекта\n", 464 | "x_test_repeated = np.repeat(X_test[:25], repeats=25, axis=0)\n", 465 | "plot_hidden_space(x_test_repeated, encode)" 466 | ] 467 | } 468 | ], 469 | "metadata": { 470 | "kernelspec": { 471 | "display_name": "Python 2", 472 | "language": "python", 473 | "name": "python2" 474 | }, 475 | "language_info": { 476 | "codemirror_mode": { 477 | "name": "ipython", 478 | "version": 2 479 | }, 480 | "file_extension": ".py", 481 | "mimetype": "text/x-python", 482 | "name": "python", 483 | "nbconvert_exporter": "python", 484 | "pygments_lexer": "ipython2", 485 | "version": "2.7.10" 486 | } 487 | }, 488 | "nbformat": 4, 489 | "nbformat_minor": 2 490 | } 491 | -------------------------------------------------------------------------------- /sem2-classify&generate/4_ss_vae.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Обучение на частично размеченной выборке*\n", 11 | "\n", 12 | "Дополнительные материалы к семинару. 
По мотивам статьи [\"Semi-supervised Learning with\n", 13 | "Deep Generative Models\"](https://arxiv.org/pdf/1406.5298.pdf)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": true, 21 | "deletable": true, 22 | "editable": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import sys\n", 27 | "import os\n", 28 | "import time\n", 29 | "\n", 30 | "import numpy as np\n", 31 | "import theano\n", 32 | "import theano.tensor as T\n", 33 | "import lasagne\n", 34 | "import matplotlib.pylab as plt\n", 35 | "%matplotlib inline\n", 36 | "\n", 37 | "from utils import load_dataset, iterate_minibatches\n", 38 | "from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "source": [ 48 | "Для этого задания мы повысим размерность скрытых компонент, а также случайным образом \"выбросим\" приблизительно 95% меток классов." 
49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false, 56 | "deletable": true, 57 | "editable": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "BATCH_SIZE = 20\n", 62 | "HIDDEN_DIM = 16\n", 63 | "NUMBER_OF_DIGITS = 10\n", 64 | "\n", 65 | "num_epochs = 40\n", 66 | "\n", 67 | "X_train, y_train, X_val, y_val, X_test, y_test = load_dataset()\n", 68 | "present = np.random.rand(X_train.shape[0]) < 0.05" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "source": [ 78 | "Классы для распределений" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": true, 86 | "deletable": true, 87 | "editable": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "class BinaryVector():\n", 92 | " def __init__(self, logits, rng=None):\n", 93 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n", 94 | " self.logits = logits\n", 95 | "\n", 96 | " def log_prob(self, x):\n", 97 | " pixelwise_log_probs = (\n", 98 | " x * (self.logits - T.nnet.softplus(self.logits))\n", 99 | " - (1 - x) * T.nnet.softplus(self.logits)\n", 100 | " )\n", 101 | " return T.sum(pixelwise_log_probs, axis=(1, 2, 3))\n", 102 | " \n", 103 | " def sample(self):\n", 104 | " shape = self.logits.shape\n", 105 | " return T.nnet.sigmoid(self.logits) >= self.rng.uniform(shape)\n", 106 | "\n", 107 | "class MultivariateNormalDiag():\n", 108 | " def __init__(self, loc=None, scale=None, rng=None):\n", 109 | " self.rng = rng if rng else RandomStreams(lasagne.random.get_rng().randint(1,2147462579))\n", 110 | " self.loc= loc\n", 111 | " self.scale = scale\n", 112 | " \n", 113 | " def log_prob(self, z):\n", 114 | " normalization_constant = (\n", 115 | " - 0.5 * np.log(2 * np.pi)\n", 116 | " - T.log(self.scale)\n", 117 | " )\n", 118 | " square_term = -0.5 * ((z - self.loc) / 
self.scale) ** 2\n", 119 | " log_prob_vec = normalization_constant + square_term\n", 120 | " return T.sum(log_prob_vec, axis=1)\n", 121 | " \n", 122 | " def sample(self):\n", 123 | " shape = self.loc.shape\n", 124 | " z = (self.loc + self.scale * self.rng.normal(shape))\n", 125 | " return z" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "deletable": true, 132 | "editable": true 133 | }, 134 | "source": [ 135 | "## Вероятностная модель данных" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "deletable": true, 142 | "editable": true 143 | }, 144 | "source": [ 145 | "В отличие от вариационного автокодировщика, генеративная модель теперь будет также включать и метки классов $y$:\n", 146 | "\n", 147 | "\\begin{align*}\n", 148 | "& p(x, y, z) = p(x | y, z) p(z) p(y) \\\\\n", 149 | "& p(y) = Cat(y | \\pi), \\pi = (1/10, \\dots, 1/10) \\\\\n", 150 | "& p(z) = \\mathcal N(z | 0, I) \\\\\n", 151 | "& p(x | y, z) = \\prod_{i=1}^D p_i(y, z)^{x_i} (1 - p_i(y, z))^{1 - x_i}\n", 152 | "\\end{align*}\n", 153 | "\n", 154 | "При обучении вариационного автокодировщика максимизируется маргинальное правдоподобие $\\log p(x)$ (нижняя оценка на него, если быть точным), а в данном случае мы будем максимизировать $\\log p(x,y)$ для объектов с метками и $\\log p(x)$ для объектов без метки. 
Обозначим за $P$ индексы объектов обучающей выборки с метками класса.\n", 155 | "\n", 156 | "Построим нижнюю оценку для\n", 157 | "\n", 158 | "\\begin{equation}\n", 159 | "L(X, y) = \\sum_{i \\notin P} \\log p(x_i) + \\sum_{i \\in P} \\log p(x_i, y_i)\n", 160 | "\\end{equation}\n", 161 | "\n", 162 | "Для этого определим следующее вариационное приближение:\n", 163 | "\n", 164 | "\\begin{align*}\n", 165 | "& q(y, z | x) = q(y | x) q(z | y, x)\\\\\n", 166 | "& \\\\\n", 167 | "& q(y | x) = Cat(y | \\pi(x))\\\\\n", 168 | "& q(z | y, x) = \\mathcal N(z | \\mu_\\phi(x, y), \\operatorname{diag}\\sigma^2(y, x))\n", 169 | "\\end{align*}\n", 170 | "\n", 171 | "### Оценка для $i \\in P$\n", 172 | "\n", 173 | "Случай похож на модель для вариационного автокодировщика\n", 174 | "\n", 175 | "\\begin{equation}\n", 176 | "\\log p(x, y) = \\log \\mathbb E_{p(z)} p(x, y | z) \\geq \\mathbb E_{q(z | y, x)} \\log \\frac{p(x, y|z) p(z)}{q(z | y, x)}\n", 177 | "\\end{equation}\n", 178 | "\n", 179 | "### Оценка $i \\notin P$\n", 180 | "\n", 181 | "\\begin{equation}\n", 182 | "\\log p(x) = \\log \\mathbb E_{p(y)} \\mathbb E_{p(z | y)} \\log p(x| z, y)\\geq \\mathbb E_{q(y | x)} \\mathbb E_{q(z | y, x)} \\log \\frac{p(x, y, z)}{q(z | y, x) q(y | x)}\n", 183 | "\\end{equation}\n", 184 | "\n", 185 | "### Целевая функия\n", 186 | "\n", 187 | "\\begin{equation}\n", 188 | "\\mathcal L(X, y) = \\sum_{i \\in P} \\mathbb E_{q(z_i | y_i, x_i)} \\log \\frac{p(x_i, y_i, z_i)}{q(z_i | y_i, x_i)} + \\sum_{i \\notin P} \\mathbb E_{q(y_i | x_i)} \\mathbb E_{q(z_i | y_i, x_i)} \\log \\frac{p(x_i, y_i, z_i)}{q(z_i | y_i, x_i) q(y_i | x_i)}\n", 189 | "\\end{equation}\n", 190 | "\n", 191 | "Оценку для математического ожидания по $z$ будет получать с помощью *reparametrization trick*.\n", 192 | "Пользуясь малым количеством классов, математическое ожидание по $y$ будем вычислять явно.\n", 193 | "\n", 194 | "# Как заставить модель все-таки обучаться?\n", 195 | "\n", 196 | "Максимизация нижней оценки на 
обоснованность на практике может не приводить к построению хорошей модели вывода $q(y | x)$.
248 | " l_hid1 = lasagne.layers.DenseLayer(\n", 249 | " l_in, num_units=256,\n", 250 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 251 | " W=lasagne.init.GlorotUniform(),\n", 252 | " name='e_hid')\n", 253 | " l_out_loc = lasagne.layers.DenseLayer(\n", 254 | " l_hid1, num_units=HIDDEN_DIM,\n", 255 | " nonlinearity=None,\n", 256 | " name='e_mean')\n", 257 | " l_out_scale = lasagne.layers.DenseLayer(\n", 258 | " l_hid1, num_units=HIDDEN_DIM,\n", 259 | " nonlinearity=lasagne.nonlinearities.softplus,\n", 260 | " name='e_scale')\n", 261 | " \n", 262 | " return l_out_loc, l_out_scale\n", 263 | " \n", 264 | " \n", 265 | "def vae_decoder_cond(input_zy):\n", 266 | " l_in = lasagne.layers.InputLayer(shape=(None, HIDDEN_DIM + NUMBER_OF_DIGITS),\n", 267 | " input_var=input_zy)\n", 268 | " l_hid1 = lasagne.layers.DenseLayer(\n", 269 | " l_in, num_units=256,\n", 270 | " nonlinearity=lasagne.nonlinearities.rectify,\n", 271 | " W=lasagne.init.GlorotUniform(),\n", 272 | " name='d_hid1')\n", 273 | " l_out = lasagne.layers.DenseLayer(\n", 274 | " l_hid1, num_units=28 * 28,\n", 275 | " nonlinearity=None,\n", 276 | " name='d_out')\n", 277 | " l_out = lasagne.layers.ReshapeLayer(l_out, shape=(-1, 1, 28, 28))\n", 278 | " return l_out" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "deletable": true, 285 | "editable": true 286 | }, 287 | "source": [ 288 | "При обучении мы будем вычислять выходы нейросети на всех возможных значениях $y$, все они нужны в 95% случаев для подсчета нижней оценки на обоснованность. 
Для этого здесь написаны две вспомогательные функции:
"markdown", 352 | "metadata": { 353 | "deletable": true, 354 | "editable": true 355 | }, 356 | "source": [ 357 | "Генеративная модель" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": false, 365 | "deletable": true, 366 | "editable": true 367 | }, 368 | "outputs": [], 369 | "source": [ 370 | "decoder_logits = vae_decoder_cond(input_zy)\n", 371 | "pz = MultivariateNormalDiag(T.zeros((NUMBER_OF_DIGITS * BATCH_SIZE, HIDDEN_DIM)),\n", 372 | " T.ones((NUMBER_OF_DIGITS * BATCH_SIZE, HIDDEN_DIM)))\n", 373 | "# здесь мы не стали реализовывать отдельный класс\n", 374 | "p_y = -np.log(NUMBER_OF_DIGITS * np.ones([BATCH_SIZE * NUMBER_OF_DIGITS]))\n", 375 | "\n", 376 | "px_zy = BinaryVector(\n", 377 | " lasagne.layers.get_output(decoder_logits)\n", 378 | ")\n", 379 | "\n", 380 | "classifier = classifier_mlp(input_x)\n", 381 | "qy_x_probs = lasagne.layers.get_output(classifier)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": { 387 | "deletable": true, 388 | "editable": true 389 | }, 390 | "source": [ 391 | "Функция потерь" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": { 398 | "collapsed": false, 399 | "deletable": true, 400 | "editable": true 401 | }, 402 | "outputs": [], 403 | "source": [ 404 | "alpha = 5.\n", 405 | "\n", 406 | "elbo_vec = (+ p_y\n", 407 | " + px_zy.log_prob(T.repeat(input_x, repeats=NUMBER_OF_DIGITS, axis=0))\n", 408 | " + pz.log_prob(qz_xy.sample())\n", 409 | " - qz_xy.log_prob(qz_xy.sample()))\n", 410 | "elbo_vec = T.reshape(elbo_vec, newshape=(BATCH_SIZE, NUMBER_OF_DIGITS))\n", 411 | "\n", 412 | "elbo_vec = (\n", 413 | " input_p * T.sum(elbo_vec * lasagne.utils.one_hot(input_y, m=NUMBER_OF_DIGITS), axis=1)\n", 414 | " + (1 - input_p) * T.sum(qy_x_probs * elbo_vec - qy_x_probs * T.log(qy_x_probs), axis=1)\n", 415 | ")\n", 416 | "\n", 417 | "loss = T.mean(elbo_vec - alpha * input_p * 
lasagne.objectives.categorical_crossentropy(qy_x_probs, input_y))" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": { 424 | "collapsed": false, 425 | "deletable": true, 426 | "editable": true 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "params = lasagne.layers.get_all_params(\n", 431 | " [encoder_mean, encoder_scale, decoder_logits, classifier]\n", 432 | ")\n", 433 | "updates = lasagne.updates.adam(-loss, params)\n", 434 | "\n", 435 | "train_fn = theano.function([input_x, input_y, input_p], loss, updates=updates)\n", 436 | "accuracy = theano.function(\n", 437 | " [input_x, input_y],\n", 438 | " T.mean(T.eq(T.argmax(qy_x_probs, axis=1), input_y), dtype=theano.config.floatX)\n", 439 | ")" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": { 446 | "collapsed": false, 447 | "deletable": true, 448 | "editable": true 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "for epoch in range(num_epochs):\n", 453 | " # In each epoch, we do a full pass over the training data:\n", 454 | " train_err = 0\n", 455 | " train_batches = 0\n", 456 | " start_time = time.time()\n", 457 | " for batch in iterate_minibatches(X_train, y_train, present=present, batchsize=BATCH_SIZE):\n", 458 | " inputs, targets, batch_present = batch\n", 459 | " inputs = np.random.rand(*inputs.shape) < inputs\n", 460 | " train_err += train_fn(inputs, targets, batch_present)\n", 461 | " train_batches += 1\n", 462 | " \n", 463 | " test_accuracy = 0\n", 464 | " test_batches = 0\n", 465 | " for batch in iterate_minibatches(X_test, y_test, batchsize=BATCH_SIZE, shuffle=False):\n", 466 | " inputs, targets = batch\n", 467 | " inputs = np.random.rand(*inputs.shape) < inputs\n", 468 | " test_accuracy += accuracy(inputs, targets)\n", 469 | " test_batches += 1\n", 470 | " \n", 471 | " print(\"Epoch {} of {} took {:.3f}s\".format(\n", 472 | " epoch + 1, num_epochs, time.time() - start_time))\n", 473 | " 
print(\"Train elbo {}\".format(train_err/train_batches))\n", 474 | " print(\"Test accuracy {}\".format(test_accuracy/test_batches))" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": { 480 | "deletable": true, 481 | "editable": true 482 | }, 483 | "source": [ 484 | "# Задание*\n", 485 | "\n", 486 | "Ниже приведен код, генерирующий случайные цифры из заданного класса.\n", 487 | "\n", 488 | "Эксперементируя с архитектурами сети и параметрами модели, попробуйте обучить модель, для которой успешно выполняется это сэмплирование (см. эксперементальные результаты статьи https://arxiv.org/pdf/1406.5298.pdf)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": false, 496 | "deletable": true, 497 | "editable": true 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "input_z = T.matrix('input_z')\n", 502 | "\n", 503 | "decode_a_code = theano.function(\n", 504 | " [input_z],\n", 505 | " lasagne.layers.get_output(decoder_logits, input_z),\n", 506 | ")" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": { 513 | "collapsed": false, 514 | "deletable": true, 515 | "editable": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "digit_to_draw = 4\n", 520 | "\n", 521 | "z_samples = np.random.randn(64, HIDDEN_DIM)\n", 522 | "y_samples = np.zeros((64, NUMBER_OF_DIGITS))\n", 523 | "y_samples[:, digit_to_draw] = 1\n", 524 | "zy_samples = np.concatenate([z_samples, y_samples], axis=1)\n", 525 | "\n", 526 | "decoded_images = decode_a_code(zy_samples)\n", 527 | "\n", 528 | "fig, axes = plt.subplots(8, 8, figsize=(8, 8),\n", 529 | " subplot_kw={'xticks': [], 'yticks': []}\n", 530 | ")\n", 531 | "fig.subplots_adjust(hspace=0.04, wspace=0.02)\n", 532 | "\n", 533 | "for ax, i in zip(axes.flat, range(64)):\n", 534 | " ax.imshow(decoded_images[i].reshape((28, 28)), cmap='gray')" 535 | ] 536 | } 537 | ], 538 | "metadata": { 539 | "kernelspec": { 
540 | "display_name": "Python 3", 541 | "language": "python", 542 | "name": "python3" 543 | }, 544 | "language_info": { 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 3 548 | }, 549 | "file_extension": ".py", 550 | "mimetype": "text/x-python", 551 | "name": "python", 552 | "nbconvert_exporter": "python", 553 | "pygments_lexer": "ipython3", 554 | "version": "3.5.1" 555 | } 556 | }, 557 | "nbformat": 4, 558 | "nbformat_minor": 2 559 | } 560 | -------------------------------------------------------------------------------- /sem2-classify&generate/mnist.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | 7 | 8 | def iterate_minibatches(inputs, targets, batchsize, shuffle=False): 9 | assert len(inputs) == len(targets) 10 | if shuffle: 11 | indices = np.arange(len(inputs)) 12 | np.random.shuffle(indices) 13 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize): 14 | if shuffle: 15 | excerpt = indices[start_idx:start_idx + batchsize] 16 | else: 17 | excerpt = slice(start_idx, start_idx + batchsize) 18 | yield inputs[excerpt], targets[excerpt] 19 | 20 | 21 | __doc__="""taken from https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py""" 22 | 23 | def load_dataset(): 24 | # We first define a download function, supporting both Python 2 and 3. 25 | if sys.version_info[0] == 2: 26 | from urllib import urlretrieve 27 | else: 28 | from urllib.request import urlretrieve 29 | 30 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 31 | print("Downloading %s" % filename) 32 | urlretrieve(source + filename, filename) 33 | 34 | # We then define functions for loading MNIST images and labels. 35 | # For convenience, they also download the requested files if needed. 
36 | import gzip 37 | 38 | def load_mnist_images(filename): 39 | if not os.path.exists(filename): 40 | download(filename) 41 | # Read the inputs in Yann LeCun's binary format. 42 | with gzip.open(filename, 'rb') as f: 43 | data = np.frombuffer(f.read(), np.uint8, offset=16) 44 | # The inputs are vectors now, we reshape them to monochrome 2D images, 45 | # following the shape convention: (examples, channels, rows, columns) 46 | data = data.reshape(-1, 1, 28, 28) 47 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 48 | # (Actually to range [0, 255/256], for compatibility to the version 49 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 50 | return data / np.float32(256) 51 | 52 | def load_mnist_labels(filename): 53 | if not os.path.exists(filename): 54 | download(filename) 55 | # Read the labels in Yann LeCun's binary format. 56 | with gzip.open(filename, 'rb') as f: 57 | data = np.frombuffer(f.read(), np.uint8, offset=8) 58 | # The labels are vectors of integers now, that's exactly what we want. 59 | return data 60 | 61 | # We can now download and read the training and test set images and labels. 62 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 63 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 64 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 65 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 66 | 67 | # We reserve the last 10000 training examples for validation. 68 | X_train, X_val = X_train[:-10000], X_train[-10000:] 69 | y_train, y_val = y_train[:-10000], y_train[-10000:] 70 | 71 | # We just return all the arrays in order, as expected in main(). 72 | # (It doesn't matter how we do this as long as we can read them again.) 
73 | return X_train, y_train, X_val, y_val, X_test, y_test 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /sem2-classify&generate/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | import numpy as np 5 | 6 | import matplotlib.pylab as plt 7 | from matplotlib.offsetbox import OffsetImage, AnnotationBbox 8 | 9 | def load_dataset(): 10 | if sys.version_info[0] == 2: 11 | from urllib import urlretrieve 12 | else: 13 | from urllib.request import urlretrieve 14 | 15 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 16 | print("Downloading %s" % filename) 17 | urlretrieve(source + filename, filename) 18 | 19 | import gzip 20 | 21 | def load_mnist_images(filename): 22 | if not os.path.exists(filename): 23 | download(filename) 24 | with gzip.open(filename, 'rb') as f: 25 | data = np.frombuffer(f.read(), np.uint8, offset=16) 26 | data = data.reshape(-1, 1, 28, 28) 27 | return data / np.float32(256) 28 | 29 | def load_mnist_labels(filename): 30 | if not os.path.exists(filename): 31 | download(filename) 32 | with gzip.open(filename, 'rb') as f: 33 | data = np.frombuffer(f.read(), np.uint8, offset=8) 34 | return data 35 | 36 | X = {} 37 | y = {} 38 | 39 | X_train= load_mnist_images('train-images-idx3-ubyte.gz') 40 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 41 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 42 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 43 | 44 | X_train, X_val = X_train[:-10000], X_train[-10000:] 45 | y_train, y_val = y_train[:-10000], y_train[-10000:] 46 | return X_train, y_train, X_val, y_val, X_test, y_test 47 | 48 | def iterate_minibatches(inputs, targets=None, batchsize=20, present=None, shuffle=True): 49 | if shuffle: 50 | indices = np.arange(len(inputs)) 51 | np.random.shuffle(indices) 52 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize): 53 | 
if shuffle: 54 | excerpt = indices[start_idx:start_idx + batchsize] 55 | else: 56 | excerpt = slice(start_idx, start_idx + batchsize) 57 | if targets is None: 58 | yield inputs[excerpt] 59 | elif present is None: 60 | yield inputs[excerpt], targets[excerpt] 61 | else: 62 | yield inputs[excerpt], targets[excerpt], present[excerpt] 63 | 64 | def plot_reconstructions(x_test, reconstruction_func): 65 | decoded_imgs = reconstruction_func(x_test) 66 | 67 | indices = np.random.choice(x_test.shape[0], 64) 68 | 69 | n = x_test.shape[0] # how many digits we will display 70 | 71 | fig, axes = plt.subplots(8, 16, figsize=(16, 8), 72 | subplot_kw={'xticks': [], 'yticks': []} 73 | ) 74 | fig.subplots_adjust(hspace=0.04, wspace=0.02) 75 | 76 | for ax, i in zip(axes[:, :8].flat, indices): 77 | ax.imshow(x_test[i].reshape((28, 28)), cmap='gray') 78 | 79 | for ax, i in zip(axes[:, 8:].flat, indices): 80 | ax.imshow(decoded_imgs[i].reshape((28, 28)), cmap='gray') 81 | 82 | plt.show() 83 | 84 | 85 | def imscatter(x, y, image, ax=None, zoom=1): 86 | if ax is None: 87 | ax = plt.gca() 88 | #im = OffsetImage(image.reshape((-1, 28, 28)), zoom=zoom) 89 | #x, y = np.atleast_1d(x, y) 90 | artists = [] 91 | #assert len(x) == len(y) == len(image) 92 | n = len(x) 93 | for i in range(n): 94 | im = OffsetImage(image[i], zoom=zoom, cmap='gray') 95 | ab = AnnotationBbox(im, (x[i], y[i]), xycoords='data', frameon=False) 96 | artists.append(ax.add_artist(ab)) 97 | ax.update_datalim(np.column_stack([x, y])) 98 | ax.autoscale() 99 | return artists 100 | 101 | def plot_hidden_space(x_test, encode_func, zoom=0.5): 102 | encoded = encode_func(x_test) 103 | 104 | fig, ax = plt.subplots(figsize=(11, 11)) 105 | imscatter(encoded[:, 0], encoded[:, 1], x_test.reshape((-1, 28, 28)), zoom=zoom, ax=ax) 106 | 107 | ax.spines['left'].set_position('center') 108 | ax.spines['bottom'].set_position('center') 109 | ax.spines['right'].set_color('none') 110 | ax.spines['top'].set_color('none') 111 | 
ax.xaxis.set_ticks_position('bottom') 112 | ax.yaxis.set_ticks_position('left') 113 | 114 | plt.gray() 115 | plt.show() -------------------------------------------------------------------------------- /sem3-attention/Attention_seminar (Start here).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Attention\n", 8 | "* Alexandr Panin, Arseniy Ashuha, you can text me ```ars.ashuha@gmail.com```,\n", 9 | "* Based on https://github.com/ebenolson/pydata2015\n", 10 | "\n", 11 | "\n", 12 | "

Part I: Attention mechanism at toy problems

\n", 13 | "\n", 14 | "\n", 15 | "\n", 16 | "In this seminar you will implement attention mechanism and apply it to a simple task of associative recall.\n", 17 | "\n", 18 | "# Install me:\n", 19 | "```(bash)\n", 20 | "sudo pip install --upgrade https://github.com/yandexdataschool/agentnet/archive/master.zip\n", 21 | "```\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import lasagne\n", 33 | "import numpy as np\n", 34 | "from lasagne.layers import *\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import theano,theano.tensor as T\n", 37 | "\n", 38 | "%matplotlib inline" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Problem description:\n", 46 | "\n", 47 | "You are given a sequence of pairs [key,value]. \n", 48 | "\n", 49 | "Both keys and values are one-hot encoded integers. \n", 50 | "\n", 51 | "The network should learn to generate values in order of ascension of keys.\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "CODE_SIZE = 10\n", 63 | "def generate_sample(min_length = 3, max_length = 10, code_size=CODE_SIZE):\n", 64 | " assert code_size >= max_length\n", 65 | " length = np.random.randint(min_length, max_length)\n", 66 | " \n", 67 | " keys = np.random.permutation(length)\n", 68 | " values = np.random.permutation(length)\n", 69 | " input_pairs = zip(keys,values)\n", 70 | " \n", 71 | " input_1hot = np.zeros([length+1,code_size*2])\n", 72 | " for i,(k,v) in enumerate(input_pairs):\n", 73 | " input_1hot[i+1][k] = 1\n", 74 | " input_1hot[i+1][code_size + v] = 1\n", 75 | " \n", 76 | " sorted_pairs = sorted(input_pairs,key=lambda (k,v):k)\n", 77 | " \n", 78 | " target_1hot = np.zeros([length+1,code_size*2])\n", 79 | " for i,(k,v) in enumerate(sorted_pairs):\n", 80 | 
" target_1hot[i+1][k] = 1\n", 81 | " target_1hot[i+1][code_size + v] = 1\n", 82 | " \n", 83 | " \n", 84 | " return input_1hot,target_1hot" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "inp,out = generate_sample(max_length=5,code_size=5)\n", 96 | "print '-'*9 + \"KEY\" + '-'*9 + ' ' + '+'*9 + \"VAL\" + \"+\"*9\n", 97 | "print \"Input pairs:\\n\",inp\n", 98 | "print \"Target pairs:\\n\",out" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "### Attention!\n", 106 | "\n", 107 | "We're now going to implement attention mechanism, or more specifically, _additive attention_ (a.k.a. Bahdanau's attention).\n", 108 | "\n", 109 | "We'll do so in two steps:\n", 110 | "\n", 111 | "* __AttentionWeights(encoder_seq,attn_query)__ - a layer that returns attention weights (aka probabilities of taking each value).\n", 112 | "* __AttentionOutput(encoder_seq,attn_weights)__ - a layer that averages inputs given probabilities from AttentionWeights.\n", 113 | "\n", 114 | "If you're not feeling familiar with this procedure, just follow the step-by-step instructions in code." 
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from lasagne.init import Normal\n", 126 | "class AttentionWeights(MergeLayer):\n", 127 | " def __init__(self, encoder_seq, attn_query, num_units):\n", 128 | " MergeLayer.__init__(self, [encoder_seq, attn_query])\n", 129 | " \n", 130 | " enc_units = encoder_seq.output_shape[2]\n", 131 | " dec_units = attn_query.output_shape[1]\n", 132 | " \n", 133 | " self.W_enc = self.add_param(Normal(), (enc_units, num_units), name='enc_to_hid')\n", 134 | " self.W_query = self.add_param(Normal(), (dec_units, num_units), name='dec_to_hid')\n", 135 | " self.W_out = self.add_param(Normal(), (num_units, 1),name='hid_to_logit')\n", 136 | " \n", 137 | " def get_output_for(self, inputs):\n", 138 | " # the encoder_sequence shape = [batch, time,units]\n", 139 | " # the query shapeshape = [batch, units]\n", 140 | " encoder_sequence, query = inputs\n", 141 | " \n", 142 | " # Hidden layer activations, shape [batch,seq_len,hid_units]\n", 143 | " \n", 144 | " query_to_hid = query.dot(self.W_query)[:,None,:]\n", 145 | " \n", 146 | " enc_to_hid = \n", 147 | " \n", 148 | " hid = T.tanh()\n", 149 | " \n", 150 | " # Logits from hidden, [batch_size, seq_len]\n", 151 | " logits = \n", 152 | " \n", 153 | " assert logits.ndim ==2, \"Logits must have shape [batch,time] and be 2-dimensional.\"\\\n", 154 | " \"Current amount of dimensions:\"+str(logits.ndim)\n", 155 | " \n", 156 | " attn_weights = T.nnet.softmax(logits)\n", 157 | " \n", 158 | " return attn_weights\n", 159 | " \n", 160 | " def get_output_shape_for(self,input_shapes):\n", 161 | " enc_shape,query_shape = input_shapes\n", 162 | " return enc_shape[:-1]" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "class 
AttentionOutput(MergeLayer):\n", 174 | " def __init__(self, encoder_seq, attn_weights):\n", 175 | " MergeLayer.__init__(self,[encoder_seq,attn_weights])\n", 176 | " \n", 177 | " def get_output_for(self,inputs):\n", 178 | " # encoder_sequence shape = [batch,time,units]\n", 179 | " # attn_weights shape = [batch,time]\n", 180 | " encoder_sequence, attn_weights = inputs\n", 181 | " \n", 182 | " #Reshape attn_weights to make 'em 3-dimensional: [batch,time,1] - so you could multiply by encoder sequence\n", 183 | " attn_weights = attn_weights.reshape([attn_weights.shape[0],attn_weights.shape[1],1])\n", 184 | " \n", 185 | " #Compute attention response by summing encoder elements with weights along time axis (axis=1)\n", 186 | " attn_output = \n", 187 | " \n", 188 | " return attn_output\n", 189 | " \n", 190 | " def get_output_shape_for(self,input_shapes):\n", 191 | " enc_shape,query_shape = input_shapes\n", 192 | " return (enc_shape[0],enc_shape[-1])" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "We now define a single step of recurrent neural network using attention" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "input_sequence = T.itensor3(\"Input tokens [batch,time,code]\")\n", 211 | "reference_answers = T.itensor3(\"Reference answers[batch,time,code]\")\n", 212 | "\n", 213 | "l_inputs = InputLayer((None,None,CODE_SIZE*2),input_sequence)\n", 214 | "l_prev_answers = InputLayer((None,None,CODE_SIZE*2),reference_answers[:,:-1])" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "from agentnet.memory import RNNCell\n", 226 | "class step:\n", 227 | " prev_output = InputLayer((None, CODE_SIZE*2), name='previous output')\n", 228 | " input_sequence = 
InputLayer((None, None, CODE_SIZE*2), name='input sequence for attention')\n", 229 | " prev_rnn = InputLayer((None, 64), name='last rnn state')\n", 230 | " \n", 231 | " #TODO your code here\n", 232 | " attention_weights = AttentionWeights(input_sequence, prev_rnn,32)\n", 233 | " attention_value = AttentionOutput(input_sequence, attention_weights)\n", 234 | " \n", 235 | " new_rnn = RNNCell(prev_rnn,concat([attention_value, prev_output]))\n", 236 | " \n", 237 | " output_probs = DenseLayer(\n", 238 | " concat([new_rnn,attention_value]),\n", 239 | " num_units=CODE_SIZE*2, nonlinearity=T.nnet.sigmoid)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "from agentnet import Recurrence\n", 251 | "#This layer applies RNN to itself in a symbolic loop.\n", 252 | "#Please wait for DeepBayes' staff to explain how it works.\n", 253 | "\n", 254 | "rnn = Recurrence(\n", 255 | " input_sequences = {step.prev_output: l_prev_answers},\n", 256 | " input_nonsequences = {step.input_sequence: l_inputs},\n", 257 | " state_variables = {step.new_rnn: step.prev_rnn},\n", 258 | " tracked_outputs = [step.output_probs,step.attention_weights],\n", 259 | " unroll_scan=False,\n", 260 | ")" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "output_probs,attn_weights = get_output(\n", 272 | " [rnn[step.output_probs], rnn[step.attention_weights]])\n", 273 | "\n", 274 | "predict = theano.function(\n", 275 | " [input_sequence,reference_answers],\n", 276 | " [output_probs,attn_weights],\n", 277 | " allow_input_downcast=True)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false, 285 | "scrolled": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "next_answers = 
reference_answers[:,1:]\n", 290 | "\n", 291 | "loss = -T.log(output_probs)*next_answers -T.log(1-output_probs)*(1-next_answers)\n", 292 | "loss = T.mean(loss)\n", 293 | "\n", 294 | "updates = \n", 295 | "\n", 296 | "train = theano.function([input_sequence, reference_answers], loss, updates=updates,allow_input_downcast=True)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "### Training" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "from tqdm import tnrange\n", 315 | "from IPython.display import clear_output\n", 316 | "loss_history = []\n", 317 | "\n", 318 | "for i in tnrange(10000):\n", 319 | " bx,by = generate_sample()\n", 320 | " loss_history.append(train([bx],[by]))\n", 321 | " \n", 322 | " if i%500==0:\n", 323 | " clear_output(True)\n", 324 | " plt.plot(loss_history)\n", 325 | " plt.show()\n", 326 | " \n", 327 | " #draw attention map\n", 328 | " bx,by = generate_sample()\n", 329 | " probs,attentions = predict([bx],[by])\n", 330 | "\n", 331 | " input_kv = zip(bx[:,:CODE_SIZE].argmax(-1),bx[:,CODE_SIZE:].argmax(-1))\n", 332 | " target_kv = zip(by[:,:CODE_SIZE].argmax(-1),by[:,CODE_SIZE:].argmax(-1))\n", 333 | " plt.imshow(attentions[0])\n", 334 | " plt.xticks(*zip(*enumerate(map(str,input_kv))),rotation=45)\n", 335 | " plt.yticks(*zip(*enumerate(map(str,target_kv))),rotation=45)\n", 336 | " plt.show()" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 2", 343 | "language": "python", 344 | "name": "python2" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 2 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython2", 356 | "version": "2.7.10" 357 | }, 358 | "widgets": 
{ 359 | "state": { 360 | "1efdd72be63d457dafc441ce841e39f5": { 361 | "views": [ 362 | { 363 | "cell_index": 15 364 | } 365 | ] 366 | } 367 | }, 368 | "version": "1.2.0" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 2 373 | } 374 | -------------------------------------------------------------------------------- /sem3-attention/Captioning_seminar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false 7 | }, 8 | "source": [ 9 | "@authors\n", 10 | "* Arseniy Ashuha, you can text me ```ars.ashuha@gmail.com```,\n", 11 | "* Based on https://github.com/ebenolson/pydata2015\n", 12 | "\n", 13 | "

Part II: Attention mechanism @ Image Captioning

\n", 14 | "\n", 15 | "\n", 16 | "\n", 17 | "In this seminar you'll be going through the image captioning pipeline.\n", 18 | "\n", 19 | "To begin with, let us download the dataset of image features from a pre-trained GoogleNet (see instructions in chat)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Data preprocessing" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "# Load dataset\n", 38 | "import numpy as np\n", 39 | "\n", 40 | "captions = np.load(\"./data/train-data-captions.npy\")\n", 41 | "img_codes = np.load(\"./data/train-data-features.npy\").astype('float32')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "print (\"each image code is a 6x6 feature matrix from GoogleNet:\", img_codes.shape)\n", 53 | "print (img_codes[0,:10,0,0])\n", 54 | "print ('\\n\\n')\n", 55 | "print (\"for each image there are 5-7 descriptions, e.g.:\\n\")\n", 56 | "print ('\\n'.join(captions[0]))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "#split descriptions into tokens\n", 68 | "for img_i in range(len(captions)):\n", 69 | " for caption_i in range(len(captions[img_i])):\n", 70 | " sentence = captions[img_i][caption_i] \n", 71 | " captions[img_i][caption_i] = [\"#START#\"]+sentence.split(' ')+[\"#END#\"]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "# Build a Vocabulary\n", 83 | "from collections import Counter\n", 84 | "word_counts = Counter()\n", 85 | "for img_captions in captions:\n", 86 | " for caption in img_captions:\n", 87 | " 
word_counts.update(caption)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "vocab = ['#UNK#', '#START#', '#END#']\n", 99 | "vocab += [k for k, v in word_counts.items() if v >= 5]\n", 100 | "vocab = list(set(vocab))\n", 101 | "n_tokens = len(vocab)\n", 102 | "\n", 103 | "assert 12000 <= n_tokens <= 15000\n", 104 | "\n", 105 | "word_to_index = {w: i for i, w in enumerate(vocab)}" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "We'll use this function to convert sentences into a network-readible matrix of token indices.\n", 113 | "\n", 114 | "When given several sentences of different length, it pads them with -1." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "PAD_ix = -1\n", 126 | "UNK_ix = vocab.index('#UNK#')\n", 127 | "START_ix = vocab.index(\"#START#\")\n", 128 | "END_ix = vocab.index(\"#END#\")\n", 129 | "\n", 130 | "#good old as_matrix for the third time\n", 131 | "def as_matrix(sequences,max_len=None):\n", 132 | " max_len = max_len or max(map(len,sequences))\n", 133 | " \n", 134 | " matrix = np.zeros((len(sequences),max_len),dtype='int32')+PAD_ix\n", 135 | " for i,seq in enumerate(sequences):\n", 136 | " row_ix = [word_to_index.get(word,UNK_ix) for word in seq[:max_len]]\n", 137 | " matrix[i,:len(row_ix)] = row_ix\n", 138 | " \n", 139 | " return matrix\n", 140 | "\n", 141 | "def to_string(tokens_ix):\n", 142 | " assert len(np.shape(tokens_ix))==1,\"to_string works on one sequence at a time\"\n", 143 | " tokens_ix = list(tokens_ix)[1:]\n", 144 | " if END_ix in tokens_ix:\n", 145 | " tokens_ix = tokens_ix[:tokens_ix.index(END_ix)]\n", 146 | " return \" \".join([vocab[i] for i in tokens_ix])" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | 
"execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "#try it out on several descriptions of a random image\n", 158 | "as_matrix(captions[1337])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "to_string(as_matrix(captions[1337])[0])" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### The neural network\n", 177 | "\n", 178 | "Since the image encoder CNN is already applied, the only remaining part is to write a sentence decoder.\n" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "import theano, theano.tensor as T\n", 190 | "import lasagne\n", 191 | "from lasagne.layers import *\n", 192 | "\n", 193 | "# network shapes. \n", 194 | "EMBEDDING_SIZE = 128 #Change at your will\n", 195 | "LSTM_SIZE = 256 #Change at your will\n", 196 | "ATTN_SIZE = 256 #Change at your will\n", 197 | "FEATURES,HEIGHT,WIDTH = img_codes.shape[1:]\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "source": [ 206 | "We will define a single LSTM step here. 
An LSTM step should\n", 207 | "* take previous cell/out and input\n", 208 | "* compute next cell/out and next token probabilities\n", 209 | "* use attention to work with image features" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "#" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "from agentnet.resolver import ProbabilisticResolver\n", 232 | "from agentnet.memory import LSTMCell\n", 233 | "\n", 234 | "temperature = theano.shared(1.)\n", 235 | "class decoder:\n", 236 | " prev_word = InputLayer((None,),name='index of previous word')\n", 237 | " image_features = InputLayer((None,FEATURES,HEIGHT,WIDTH),name='img features')\n", 238 | "\n", 239 | " prev_cell = InputLayer((None,LSTM_SIZE),name='previous LSTM cell goes here')\n", 240 | " prev_out = InputLayer((None,LSTM_SIZE),name='previous LSTM output goes here')\n", 241 | " \n", 242 | " prev_word_emb = EmbeddingLayer(prev_word,len(vocab),EMBEDDING_SIZE)\n", 243 | " \n", 244 | " ###Attention part:\n", 245 | " # Please implement attention part of rnn architecture\n", 246 | " \n", 247 | " #First we reshape image into a sequence of image vectors\n", 248 | " image_features_seq = reshape(dimshuffle(image_features,[0,2,3,1]),[[0],-1,[3]])\n", 249 | " \n", 250 | " #Then we apply attention just as usual\n", 251 | " attn_probs = \n", 252 | " attn = \n", 253 | "\n", 254 | " lstm_input = concat([attn,prev_word_emb],axis=-1)\n", 255 | "\n", 256 | " new_cell,new_out = LSTMCell(prev_cell,prev_out,lstm_input)\n", 257 | " \n", 258 | " \n", 259 | " output_probs = DenseLayer(new_out,len(vocab),nonlinearity=T.nnet.softmax)\n", 260 | "\n", 261 | " \n", 262 | " output_probs_scaled = ExpressionLayer(output_probs,lambda p: p**temperature)\n", 263 | " output_tokens = 
ProbabilisticResolver(output_probs_scaled,assume_normalized=False)\n", 264 | " \n", 265 | " \n", 266 | " # recurrent state transition dict\n", 267 | " # on next step, {key} becomes {value}\n", 268 | " transition = {\n", 269 | " new_cell:prev_cell,\n", 270 | " new_out:prev_out\n", 271 | " }" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Training\n", 279 | "\n", 280 | "During training, we should feed our decoder RNN with reference captions from the dataset. Training then comes down to simple likelihood maximization problem.\n", 281 | "\n", 282 | "Deep learning people also know this as minimizing crossentropy." 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "# Inputs for sentences\n", 294 | "sentences = T.imatrix(\"[batch_size x time] of word ids\")\n", 295 | "l_sentences = InputLayer((None,None),sentences)\n", 296 | "\n", 297 | "# Input layer for image features\n", 298 | "image_vectors = T.tensor4(\"image features [batch,channels,h,w]\")\n", 299 | "l_image_features = InputLayer((None,FEATURES,HEIGHT,WIDTH),image_vectors)\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": false 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "from agentnet import Recurrence\n", 311 | "\n", 312 | "decoder_trainer = Recurrence(\n", 313 | " input_sequences={decoder.prev_word:l_sentences},\n", 314 | " input_nonsequences={decoder.image_features:l_image_features},\n", 315 | " state_variables=decoder.transition,\n", 316 | " tracked_outputs=[decoder.output_probs],\n", 317 | " unroll_scan = False,\n", 318 | ")" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "#get predictions and define 
loss\n", 330 | "next_token_probs = get_output(decoder_trainer[decoder.output_probs])\n", 331 | "\n", 332 | "next_token_probs = next_token_probs[:,:-1].reshape([-1,len(vocab)])\n", 333 | "next_tokens = sentences[:,1:].ravel()\n", 334 | "\n", 335 | "loss = T.nnet.categorical_crossentropy(next_token_probs,next_tokens)\n", 336 | "\n", 337 | "#apply mask\n", 338 | "mask = T.neq(next_tokens,PAD_ix)\n", 339 | "loss = T.sum(loss*mask)/T.sum(mask)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "#trainable NN weights\n", 351 | "weights = get_all_params(decoder_trainer,trainable=True)\n", 352 | "updates = lasagne.updates.adam(loss,weights)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": { 359 | "collapsed": false 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "#compile a functions for training and evaluation\n", 364 | "#please not that your functions must accept image features as FIRST param and sentences as second one\n", 365 | "train_step = theano.function([image_vectors,sentences],loss,updates=updates,allow_input_downcast=True)\n", 366 | "val_step = theano.function([image_vectors,sentences],loss,allow_input_downcast=True)\n", 367 | "#for val_step use deterministic=True if you have any dropout/noize" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "source": [ 376 | "# Training\n", 377 | "\n", 378 | "* You first have to implement a batch generator\n", 379 | "* Than the network will get trained the usual way" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "collapsed": true 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "from random import choice\n", 391 | "\n", 392 | "def generate_batch(images,captions,batch_size,max_caption_len=None):\n", 393 | " 
\n", 394 | " #sample random numbers for image/caption indicies\n", 395 | " random_image_ix = np.random.randint(0,len(images),size=batch_size)\n", 396 | " \n", 397 | " #get images\n", 398 | " batch_images = images[random_image_ix]\n", 399 | " \n", 400 | " #5-7 captions for each image\n", 401 | " captions_for_batch_images = captions[random_image_ix]\n", 402 | " \n", 403 | " #pick 1 from 5-7 captions for each image\n", 404 | " batch_captions = list(map(choice,captions_for_batch_images))\n", 405 | " \n", 406 | " #convert to matrix\n", 407 | " batch_captions_ix = as_matrix(batch_captions,max_len=max_caption_len)\n", 408 | " \n", 409 | " return batch_images, batch_captions_ix" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "collapsed": false 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "bx,by = generate_batch(img_codes,captions,3)\n", 421 | "bx[0,:10,0,0],by" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "### Main loop\n", 429 | "* We recommend you to periodically evaluate the network using the next \"apply trained model\" block\n", 430 | " * its safe to interrupt training, run a few examples and start training again" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "batch_size=50 #adjust me\n", 442 | "n_epochs=100 #adjust me\n", 443 | "n_batches_per_epoch = 50 #adjust me\n", 444 | "n_validation_batches = 5 #how many batches are used for validation after each epoch\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": false, 452 | "scrolled": false 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "from tqdm import tqdm\n", 457 | "\n", 458 | "for epoch in range(n_epochs):\n", 459 | " \n", 460 | " train_loss=0\n", 461 | " for _ in 
tqdm(range(n_batches_per_epoch)):\n", 462 | " train_loss += train_step(*generate_batch(img_codes,captions,batch_size))\n", 463 | " train_loss /= n_batches_per_epoch\n", 464 | " \n", 465 | " \n", 466 | " print('Epoch: {}, train loss: {}'.format(epoch, train_loss))\n", 467 | "\n", 468 | "print(\"Finish :)\")" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "source": [ 477 | "### apply trained model" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "batch_size = theano.shared(np.int32(1))\n", 489 | "MAX_LENGTH = 20 #Change at your will" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "collapsed": false 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "#set up recurrent network that generates tokens and feeds them back to itself\n", 501 | "unroll_dict = dict(decoder.transition)\n", 502 | "unroll_dict[decoder.output_tokens] = decoder.prev_word #on next iter, output goes to input\n", 503 | "\n", 504 | "first_output = T.repeat(T.constant(START_ix,dtype='int32'),batch_size)\n", 505 | "init_dict = {\n", 506 | " decoder.output_tokens:InputLayer([None],first_output)\n", 507 | "}\n", 508 | "\n", 509 | "decoder_applier = Recurrence(\n", 510 | " input_nonsequences={decoder.image_features:l_image_features},\n", 511 | " state_variables=unroll_dict,\n", 512 | " state_init = init_dict,\n", 513 | " tracked_outputs=[decoder.output_probs,decoder.output_tokens],\n", 514 | " n_steps = MAX_LENGTH,\n", 515 | ")" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "generated_tokens = get_output(decoder_applier[decoder.output_tokens])\n", 527 | "\n", 528 | "generate = 
theano.function([image_vectors],generated_tokens,allow_input_downcast=True)" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "collapsed": false 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "from pretrained_lenet import image_to_features\n", 540 | "import matplotlib.pyplot as plt\n", 541 | "%matplotlib inline\n", 542 | "\n", 543 | "img = plt.imread(\"./data/Dog-and-Cat.jpg\")\n", 544 | "plt.imshow(img)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": { 551 | "collapsed": false 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "output_ix = generate([image_to_features(img)])[0]\n", 556 | "\n", 557 | "for _ in range(100):\n", 558 | " temperature.set_value(10)\n", 559 | " print to_string(output_ix)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "### Some tricks (for further research)\n", 567 | "\n", 568 | "* Initialize LSTM with some function of image features.\n", 569 | "\n", 570 | "* Try other attention functions\n", 571 | "\n", 572 | "* If you train large network, it is usually a good idea to make a 2-stage prediction\n", 573 | " 1. (large recurrent state) -> (bottleneck e.g. 256)\n", 574 | " 2. 
(bottleneck) -> (vocabulary size)\n", 575 | " * this way you won't need to store/train (large_recurrent_state x vocabulary size) matrix\n", 576 | " \n", 577 | "* Use [hierarchical softmax](https://gist.github.com/justheuristic/581853c6d6b87eae9669297c2fb1052d) or [byte pair encodings](https://github.com/rsennrich/subword-nmt)\n", 578 | "\n", 579 | "\n" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": { 586 | "collapsed": true 587 | }, 588 | "outputs": [], 589 | "source": [] 590 | } 591 | ], 592 | "metadata": { 593 | "kernelspec": { 594 | "display_name": "Python 2", 595 | "language": "python", 596 | "name": "python2" 597 | }, 598 | "language_info": { 599 | "codemirror_mode": { 600 | "name": "ipython", 601 | "version": 2 602 | }, 603 | "file_extension": ".py", 604 | "mimetype": "text/x-python", 605 | "name": "python", 606 | "nbconvert_exporter": "python", 607 | "pygments_lexer": "ipython2", 608 | "version": "2.7.6" 609 | } 610 | }, 611 | "nbformat": 4, 612 | "nbformat_minor": 0 613 | } 614 | -------------------------------------------------------------------------------- /sem3-attention/data/Dog-and-Cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem3-attention/data/Dog-and-Cat.jpg -------------------------------------------------------------------------------- /sem3-attention/pretrained_lenet.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import theano,theano.tensor as T 4 | 5 | from lasagne.layers import * 6 | from lasagne.layers import Conv2DLayer as ConvLayer 7 | from lasagne.layers import MaxPool2DLayer as PoolLayerDNN 8 | from lasagne.layers import MaxPool2DLayer as PoolLayer 9 | from lasagne.layers import LocalResponseNormalization2DLayer as LRNLayer 10 | from lasagne.nonlinearities import softmax, 
linear 11 | 12 | 13 | def build_inception_module(name, input_layer, nfilters): 14 | # nfilters: (pool_proj, 1x1, 3x3_reduce, 3x3, 5x5_reduce, 5x5) 15 | net = {} 16 | net['pool'] = PoolLayerDNN(input_layer, pool_size=3, stride=1, pad=1) 17 | net['pool_proj'] = ConvLayer(net['pool'], nfilters[0], 1) 18 | 19 | net['1x1'] = ConvLayer(input_layer, nfilters[1], 1) 20 | 21 | net['3x3_reduce'] = ConvLayer(input_layer, nfilters[2], 1) 22 | net['3x3'] = ConvLayer(net['3x3_reduce'], nfilters[3], 3, pad=1) 23 | 24 | net['5x5_reduce'] = ConvLayer(input_layer, nfilters[4], 1) 25 | net['5x5'] = ConvLayer(net['5x5_reduce'], nfilters[5], 5, pad=2) 26 | 27 | net['output'] = ConcatLayer([ 28 | net['1x1'], 29 | net['3x3'], 30 | net['5x5'], 31 | net['pool_proj'], 32 | ]) 33 | 34 | return {'{}/{}'.format(name, k): v for k, v in net.items()} 35 | 36 | 37 | def build_model(): 38 | net = {} 39 | net['input'] = InputLayer((None, 3, None, None)) 40 | net['conv1/7x7_s2'] = ConvLayer(net['input'], 64, 7, stride=2, pad=3) 41 | net['pool1/3x3_s2'] = PoolLayer(net['conv1/7x7_s2'], 42 | pool_size=3, 43 | stride=2, 44 | ignore_border=False) 45 | net['pool1/norm1'] = LRNLayer(net['pool1/3x3_s2'], alpha=0.00002, k=1) 46 | net['conv2/3x3_reduce'] = ConvLayer(net['pool1/norm1'], 64, 1) 47 | net['conv2/3x3'] = ConvLayer(net['conv2/3x3_reduce'], 192, 3, pad=1) 48 | net['conv2/norm2'] = LRNLayer(net['conv2/3x3'], alpha=0.00002, k=1) 49 | net['pool2/3x3_s2'] = PoolLayer(net['conv2/norm2'], pool_size=3, stride=2) 50 | 51 | net.update(build_inception_module('inception_3a', 52 | net['pool2/3x3_s2'], 53 | [32, 64, 96, 128, 16, 32])) 54 | net.update(build_inception_module('inception_3b', 55 | net['inception_3a/output'], 56 | [64, 128, 128, 192, 32, 96])) 57 | net['pool3/3x3_s2'] = PoolLayer(net['inception_3b/output'], 58 | pool_size=3, stride=2) 59 | 60 | net.update(build_inception_module('inception_4a', 61 | net['pool3/3x3_s2'], 62 | [64, 192, 96, 208, 16, 48])) 63 | 
net.update(build_inception_module('inception_4b', 64 | net['inception_4a/output'], 65 | [64, 160, 112, 224, 24, 64])) 66 | net.update(build_inception_module('inception_4c', 67 | net['inception_4b/output'], 68 | [64, 128, 128, 256, 24, 64])) 69 | net.update(build_inception_module('inception_4d', 70 | net['inception_4c/output'], 71 | [64, 112, 144, 288, 32, 64])) 72 | net.update(build_inception_module('inception_4e', 73 | net['inception_4d/output'], 74 | [128, 256, 160, 320, 32, 128])) 75 | net['pool4/3x3_s2'] = PoolLayer(net['inception_4e/output'], 76 | pool_size=3, stride=2) 77 | 78 | net.update(build_inception_module('inception_5a', 79 | net['pool4/3x3_s2'], 80 | [128, 256, 160, 320, 32, 128])) 81 | net.update(build_inception_module('inception_5b', 82 | net['inception_5a/output'], 83 | [128, 384, 192, 384, 48, 128])) 84 | 85 | net['pool5/7x7_s1'] = GlobalPoolLayer(net['inception_5b/output']) 86 | net['loss3/classifier'] = DenseLayer(net['pool5/7x7_s1'], 87 | num_units=1000, 88 | nonlinearity=linear) 89 | net['prob'] = NonlinearityLayer(net['loss3/classifier'], 90 | nonlinearity=softmax) 91 | return net 92 | 93 | 94 | import skimage.transform 95 | import numpy as np 96 | MEAN_VALUES = np.array([104, 117, 123]).reshape((3,1,1)) 97 | def preprocess(im): 98 | if len(im.shape) == 2: 99 | im = im[:, :, np.newaxis] 100 | im = np.repeat(im, 3, axis=2) 101 | # Resize so smallest dim = 224, preserving aspect ratio 102 | h, w, _ = im.shape 103 | if h < w: 104 | im = skimage.transform.resize(im, (224, w*224//h), preserve_range=True) 105 | else: 106 | im = skimage.transform.resize(im, (h*224//w, 224), preserve_range=True) 107 | 108 | # Central crop to 224x224 109 | h, w, _ = im.shape 110 | im = im[h//2-112:h//2+112, w//2-112:w//2+112] 111 | 112 | rawim = np.copy(im).astype('uint8') 113 | 114 | # Shuffle axes to c01 115 | im = np.swapaxes(np.swapaxes(im, 1, 2), 0, 1) 116 | 117 | # Convert to BGR 118 | im = im[::-1, :, :] 119 | 120 | im = im - MEAN_VALUES 121 | return 
im[np.newaxis].astype('float32') 122 | 123 | #build and compile model 124 | import pickle 125 | lenet = build_model() 126 | lenet_weights = pickle.load(open('data/blvc_googlenet.pkl'))['param values'] 127 | set_all_param_values(lenet["prob"], lenet_weights) 128 | 129 | cnn_input_var = lenet['input'].input_var 130 | cnn_feature_layer = lenet['inception_5b/output'] 131 | get_cnn_features = theano.function([cnn_input_var], get_output(cnn_feature_layer)) 132 | 133 | pca = pickle.load(open("./data/svd.pcl")) 134 | 135 | 136 | def image_to_features(im): 137 | assert len(im.shape) ==3 and im.shape[2]==3,"You should provide an RGB image of shape [h,w,3]" 138 | im = preprocess(im) 139 | cnn_features = get_cnn_features(im)[0] 140 | H,W = cnn_features.shape[-2:] 141 | cnn_features_flat = cnn_features.transpose([1,2,0]).reshape([-1,1024]) 142 | svd_features_flat = pca.transform(cnn_features_flat) 143 | return svd_features_flat.reshape([H,W,-1]).transpose([2,0,1]) 144 | -------------------------------------------------------------------------------- /sem4-GP/2_BayesOpt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Bayesian optimization\n", 12 | "\n", 13 | "* Mainly used for optimization of \"heavy\" functions (computationally complex, expensive to evaluate)\n", 14 | "* The objective function can be \"black box\"\n", 15 | "* Uses approximation of the objective function\n", 16 | "* Takes into account quality of the approximation" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "slideshow": { 23 | "slide_type": "slide" 24 | } 25 | }, 26 | "source": [ 27 | "#### Optimization procedure:\n", 28 | "1. Build approximation $\\hat{f}(x)$ of function $f(x)$\n", 29 | "2. 
Choose new point as an argmax of the criterion\n", 30 | "$$\n", 31 | "x_{new} = \\arg\\max\\limits_x a(x)\n", 32 | "$$\n", 33 | "3. Evaluate $f(x)$ at new point\n", 34 | "4. Update model $\\hat{f}(x)$ and go to step 2.\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "slideshow": { 41 | "slide_type": "slide" 42 | } 43 | }, 44 | "source": [ 45 | "### Expected Improvement\n", 46 | "\n", 47 | "$$\n", 48 | "EI(x) = \\mathbb{E}_{p(\\hat{f})} \\left [\\max(0, y_{min} - \\hat{f}) \\right ]\n", 49 | "$$\n", 50 | "where $\\hat{y}, \\sigma$ - mean and variance of GP model at point $x$,\n", 51 | "$\\Phi(\\cdot)$ - cdf of standard normal distribution,\n", 52 | "$\\phi(\\cdot)$ - pdf of standard normal distribution.\n", 53 | "\n", 54 | "Usually logarithm of EI is used." 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "slideshow": { 61 | "slide_type": "slide" 62 | } 63 | }, 64 | "source": [ 65 | "" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "slideshow": { 72 | "slide_type": "slide" 73 | } 74 | }, 75 | "source": [ 76 | "### Optimization of criterion\n", 77 | "\n", 78 | "Any optimization algorithm could be used.\n", 79 | "\n", 80 | "In this seminar we will use multi-start with L-BFGS optimization algorithm\n", 81 | "\n", 82 | "Multi-start procedure:\n", 83 | "1. Generate initial set of points $x_1, \\ldots, x_n$. Calculate criterion at each point to obtain $(a(x_1), \\ldots, a(x_n))$.\n", 84 | "2. Choose $k$ points with smallest values of criterion.\n", 85 | "3. Using each point as an initial point run the optimization algorithm (L-BFGS) and obtain $k$ optimization results.\n", 86 | "4. From all optimization results choose the best one." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "slideshow": { 93 | "slide_type": "skip" 94 | } 95 | }, 96 | "source": [ 97 | "### L-BFGS \n", 98 | "\n", 99 | "It's a quasi-Newton method of optimization and it is based on second order Taylor expansion\n", 100 | "$$\n", 101 | "f(x_k + p) \\approx f(x_k) + \\nabla f^T(x_k) p + \\frac12 p^T \\mathbf{H}p\n", 102 | "$$\n", 103 | "$$\n", 104 | "p = -\\mathbf{H}^{-1}\\nabla f^T(x_k) \\approx -\\mathbf{B}_k^{-1} \\nabla f^T(x_k),\n", 105 | "$$\n", 106 | "where $\\mathbf{B}_k$ is an approximation of hessian $\\mathbf{H}$.\n", 107 | "\n", 108 | "Approximation $\\mathbf{B}_k$ is updated at every step by the following rule:\n", 109 | "$$\n", 110 | "\\mathbf{B}_{k + 1} = \\mathbf{B}_k - \\frac{\\mathbf{B}_k s_k s_k^T \\mathbf{B}_k}{s_k^T \\mathbf{B}_k s_k} + \\frac{y_k y_k^T}{y_k^T s_k},\n", 111 | "$$\n", 112 | "where $s_k = x_{k + 1} - x_k$, $y_k = \\nabla f(x_{k + 1}) - \\nabla f(x_k)$." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true, 120 | "slideshow": { 121 | "slide_type": "skip" 122 | } 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "%matplotlib inline\n", 127 | "\n", 128 | "from __future__ import print_function\n", 129 | "\n", 130 | "import numpy as np\n", 131 | "from matplotlib import pyplot\n", 132 | "from mpl_toolkits.mplot3d import Axes3D\n", 133 | "from matplotlib import cm\n", 134 | "from scipy.optimize import minimize\n", 135 | "\n", 136 | "\n", 137 | "import GPy\n", 138 | "\n", 139 | "import bayes_opt\n", 140 | "\n", 141 | "\n", 142 | "def f(x):\n", 143 | " return (6 * x - 2)**2 * np.sin(12 * x - 4) \n", 144 | "\n", 145 | "def get_1d_data():\n", 146 | " np.random.seed(239)\n", 147 | " x_train = np.array([0.0, 0.58, 0.38, 0.95]).reshape(-1, 1)\n", 148 | " y_train = f(x_train)\n", 149 | " return x_train, y_train" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | 
"metadata": { 156 | "collapsed": true, 157 | "slideshow": { 158 | "slide_type": "skip" 159 | } 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "x_train, y_train = get_1d_data()\n", 164 | "kernel = GPy.kern.RBF(1, variance=0.5, lengthscale=0.2)\n", 165 | "model = GPy.models.GPRegression(x_train, y_train, kernel)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": true, 173 | "slideshow": { 174 | "slide_type": "skip" 175 | } 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "x_grid = np.linspace(0, 1, 100).reshape(-1, 1)\n", 180 | "y_grid = f(x_grid)\n", 181 | "prediction, std = model.predict(x_grid)\n", 182 | "prediction = prediction.ravel()\n", 183 | "std = std.ravel()\n", 184 | "pyplot.figure(figsize=(8, 6))\n", 185 | "pyplot.plot(x_train, y_train, 'or', markersize=8, label='Training set')\n", 186 | "pyplot.plot(x_grid, prediction, '-k', linewidth=2, label='Approximation')\n", 187 | "pyplot.fill_between(x_grid.ravel(), prediction - 2 * std, prediction + 2 * std, alpha=0.3)\n", 188 | "pyplot.plot(x_grid, y_grid, '--b', label='True function')\n", 189 | "pyplot.ylim([-15, 20])\n", 190 | "pyplot.legend(loc='best')" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "### Task\n", 198 | "\n", 199 | "Derive expression for EI: express it in terms of $\\Phi(\\cdot)$ and $\\phi(\\cdot)$ - cdf and pdf of $\\mathcal{N}(0, 1)$." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "slideshow": { 206 | "slide_type": "slide" 207 | } 208 | }, 209 | "source": [ 210 | "### Task\n", 211 | "Implement multi-start optimization procedure" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "collapsed": true, 219 | "slideshow": { 220 | "slide_type": "skip" 221 | } 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "def get_new_point(model, lb, ub, data=None, multistart=10, criterion='ei', k=1, random_state=None):\n", 226 | " \"\"\"\n", 227 | " Parameters:\n", 228 | " model - GP model of the objective function\n", 229 | " lb, ub - array-like, lower and upper bounds of x\n", 230 | " data - tuple(x_train, y_train)\n", 231 | " multistart - number of multistart runs\n", 232 | " criterion - aqcuisition function, by default EI\n", 233 | " k - parameter of the LowerConfidenceBound function\n", 234 | " random_state - np.random.RandomState\n", 235 | " Returns\n", 236 | " tuple - argmin of the objective function and min value of the objective \n", 237 | " \"\"\"\n", 238 | " if random_state is None:\n", 239 | " random_state = np.random.RandomState()\n", 240 | "\n", 241 | " lb = np.array(lb).reshape(1, -1)\n", 242 | " ub = np.array(ub).reshape(1, -1)\n", 243 | " \n", 244 | " # 1. 
Generate initial X points
"print('Correct!')" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": true 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "def optimization_step(x_train, y_train, kernel, objective, lb=None, ub=None, criterion='ei', k=1, plot=False):\n", 311 | " model = GPy.models.GPRegression(x_train, y_train, kernel)\n", 312 | " model.optimize_restarts(num_restarts=10, verbose=False)\n", 313 | "\n", 314 | " x_new, criterion_value = get_new_point(model, data=(x_train, y_train), lb=lb, ub=ub, criterion=criterion, k=k)\n", 315 | " if plot:\n", 316 | " bayes_opt.plot1d(x_train, y_train, model, objective, x_new, criterion_value)\n", 317 | " pyplot.show()\n", 318 | "\n", 319 | " x_new = x_new.reshape(1, -1)\n", 320 | " x_train = np.vstack([x_train, x_new])\n", 321 | " y_train = np.vstack([y_train, np.asarray(objective(x_new)).reshape(1, -1)])\n", 322 | " return x_train, y_train, model" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "slideshow": { 329 | "slide_type": "skip" 330 | } 331 | }, 332 | "source": [ 333 | "## 1D example" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true, 341 | "scrolled": false, 342 | "slideshow": { 343 | "slide_type": "skip" 344 | } 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "x_train, y_train = get_1d_data()\n", 349 | "kernel = GPy.kern.RBF(1, variance=0.5, lengthscale=0.2)\n", 350 | "model = GPy.models.GPRegression(x_train, y_train, kernel)\n", 351 | "for i in range(6):\n", 352 | " x_train, y_train, model = bayes_opt.optimization_step(x_train, y_train, kernel, f, lb=[0], ub=[1], criterion='ei', plot=True)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "slideshow": { 359 | "slide_type": "skip" 360 | } 361 | }, 362 | "source": [ 363 | "## 2D demo" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 
null, 369 | "metadata": { 370 | "collapsed": true, 371 | "scrolled": false, 372 | "slideshow": { 373 | "slide_type": "skip" 374 | } 375 | }, 376 | "outputs": [], 377 | "source": [ 378 | "budget = 30\n", 379 | "n_init = 10\n", 380 | "\n", 381 | "kernel = GPy.kern.RBF(2, variance=1, lengthscale=0.5, ARD=False)\n", 382 | "\n", 383 | "save_path = '2d_demo.mp4'\n", 384 | "bayes_opt.demo_2d(n_init, budget, kernel, save_path=save_path)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": true, 392 | "scrolled": false, 393 | "slideshow": { 394 | "slide_type": "skip" 395 | } 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "import io\n", 400 | "import base64\n", 401 | "from IPython.display import HTML\n", 402 | "\n", 403 | "video = io.open(save_path, 'r+b').read()\n", 404 | "encoded = base64.b64encode(video)\n", 405 | "HTML(data=''''''.format(encoded.decode('ascii')))" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": { 413 | "collapsed": true, 414 | "slideshow": { 415 | "slide_type": "slide" 416 | } 417 | }, 418 | "source": [ 419 | "### Hyperparmeters tuning\n", 420 | "\n", 421 | "* Almost all machine learning have hyperparameters\n", 422 | "* Quality of the model depends on the hyperparameters\n", 423 | "* Quality estimation for one set of hyperparameters can take long time\n", 424 | "* => Bayesian optimization can be used for hyperparameters tuning." 
425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": { 430 | "slideshow": { 431 | "slide_type": "slide" 432 | } 433 | }, 434 | "source": [ 435 | "#### Bayesian optimization for hyperparameter tuning\n", 436 | "\n", 437 | "Objective function to optimize\n", 438 | "* Takes hyperparameters as input\n", 439 | "* Builds a model (maybe several times in case of cross-validation)\n", 440 | "* Calculates and returns model quality" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": true, 448 | "slideshow": { 449 | "slide_type": "skip" 450 | } 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "from sklearn.ensemble import RandomForestRegressor\n", 455 | "from sklearn.model_selection import train_test_split, cross_val_score\n", 456 | "from sklearn.preprocessing import StandardScaler\n", 457 | "\n", 458 | "from IPython import display" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": { 464 | "slideshow": { 465 | "slide_type": "skip" 466 | } 467 | }, 468 | "source": [ 469 | "##### House pricing dataset\n", 470 | "\n", 471 | "In this task you need to predict House Sale Price. There are 25 numerical input features like lot area, overall condition rating, house quality, number of kitchens and so on (there were a lot of categorical variables which we removed in this example for simplicity).\n", 472 | "\n", 473 | "We are going to tune XGBoost parameters using Bayesian Optimization to obtain more accurate model." 
474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "collapsed": true, 481 | "slideshow": { 482 | "slide_type": "skip" 483 | } 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "data = np.loadtxt('house_pricing.csv')\n", 488 | "\n", 489 | "X = data[:, :-1]\n", 490 | "y = data[:, -1:]\n", 491 | "\n", 492 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": { 498 | "slideshow": { 499 | "slide_type": "skip" 500 | } 501 | }, 502 | "source": [ 503 | "We implement `model_error_cv()` function that will be our objective function. \n", 504 | "We are going to use RBF kernel in our Bayesian Optimization, the result of optimization will be continuous variables,\n", 505 | "so we need to preprocess parameters - cast integer parameters to int." 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": { 512 | "collapsed": true, 513 | "slideshow": { 514 | "slide_type": "skip" 515 | } 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "def wrap_parameters(parameters, scaler=None):\n", 520 | " if scaler:\n", 521 | " parameters = scaler.transform(parameters)\n", 522 | " return parameters\n", 523 | "\n", 524 | "\n", 525 | "def unwrap_parameters(parameters, scaler=None):\n", 526 | " if scaler:\n", 527 | " parameters = scaler.inverse_transform(parameters)\n", 528 | " p = [int(parameters[0]), parameters[1], int(parameters[2]),\n", 529 | " max(0, min(parameters[3], 1))]\n", 530 | " return p\n", 531 | "\n", 532 | "\n", 533 | "def model_error_cv(parameters, X, y, scaler=None):\n", 534 | " errors = []\n", 535 | " for p in parameters:\n", 536 | " p = unwrap_parameters(p, scaler)\n", 537 | " model = xgboost.XGBRegressor(max_depth=p[0],\n", 538 | " learning_rate=p[1],\n", 539 | " n_estimators=p[2],\n", 540 | " subsample=p[3],\n", 541 | " )\n", 542 | "\n", 543 | " score = 
cross_val_score(model, X, y, cv=3).mean()\n", 544 | " errors.append(score)\n", 545 | " return np.array(errors).reshape(-1, 1)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": { 551 | "slideshow": { 552 | "slide_type": "skip" 553 | } 554 | }, 555 | "source": [ 556 | "We scale the parameters using StandardScaler() from sklearn - it is nice to have all the parameters with unit variance and mean zero\n", 557 | "when using RBF kernel as it is easier to tune lengthscale parameters, because these parameters depend on the range of input variables." 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": { 564 | "collapsed": true, 565 | "slideshow": { 566 | "slide_type": "skip" 567 | } 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "# xgboost params: max_depth, learning_rate, n_estimators, subsample\n", 572 | "lower_bound = np.array([1, 0.001, 100, 0.2])\n", 573 | "upper_bound = np.array([6, 0.1, 1000, 1])\n", 574 | "\n", 575 | "np.random.seed(42)\n", 576 | "n_init_points = 10\n", 577 | "initial_parameters = np.random.rand(n_init_points, len(lower_bound)) * (upper_bound - lower_bound) + lower_bound\n", 578 | "initial_errors = -model_error_cv(initial_parameters, X, y)\n", 579 | "\n", 580 | "scaler = StandardScaler()\n", 581 | "scaler.fit(initial_parameters)\n", 582 | "lower_bound = scaler.transform(lower_bound)\n", 583 | "upper_bound = scaler.transform(upper_bound)\n", 584 | "initial_parameters = wrap_parameters(initial_parameters, scaler)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "slideshow": { 591 | "slide_type": "skip" 592 | } 593 | }, 594 | "source": [ 595 | "It is also nice idea to explicitly constrain lengthscale parameter - it shouldn't be much larger than distance between points in the training set, it shouldn't be much smaller than the distance between points in the training set." 
596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": { 602 | "collapsed": true, 603 | "slideshow": { 604 | "slide_type": "skip" 605 | } 606 | }, 607 | "outputs": [], 608 | "source": [ 609 | "kernel = GPy.kern.RBF(len(lower_bound), lengthscale=(upper_bound - lower_bound).min() / n_init_points, ARD=False)\n", 610 | "gp_model = GPy.models.GPRegression(initial_parameters, initial_errors, kernel=kernel)\n", 611 | "gp_model.rbf.lengthscale.constrain_bounded(0.001, 10)\n", 612 | "gp_model.optimize()\n", 613 | "print(gp_model)\n", 614 | "print(gp_model.rbf.lengthscale)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": null, 620 | "metadata": { 621 | "collapsed": true, 622 | "scrolled": false, 623 | "slideshow": { 624 | "slide_type": "-" 625 | } 626 | }, 627 | "outputs": [], 628 | "source": [ 629 | "budget = 40\n", 630 | "\n", 631 | "hyperparameters = initial_parameters\n", 632 | "errors = initial_errors\n", 633 | "error_history = [-initial_errors[:i].min() for i in range(1, n_init_points + 1)]\n", 634 | "objective = lambda x: -model_error_cv(x, X, y, scaler)\n", 635 | "for i in range(budget):\n", 636 | " hyperparameters, errors, gp_model = bayes_opt.optimization_step(hyperparameters, errors, kernel, objective,\n", 637 | " lb=lower_bound, ub=upper_bound)\n", 638 | " error_history.append(-errors.min())\n", 639 | " # Visualize\n", 640 | " display.clear_output(wait=True)\n", 641 | " pyplot.figure(figsize=(8, 6))\n", 642 | " \n", 643 | " pyplot.xlabel(\"#iteration\")\n", 644 | " pyplot.ylabel(\"R2\")\n", 645 | " pyplot.plot(error_history)\n", 646 | " pyplot.show()\n", 647 | " \n", 648 | " print(\"New parameters: {}, new error:\\t{}\\nbest parameters: {}, best error:\\t{}\".format(\n", 649 | " unwrap_parameters(hyperparameters[-1], scaler), -errors[-1, 0],\n", 650 | " unwrap_parameters(hyperparameters[errors.argmin()], scaler), -errors.min()))\n", 651 | " print(gp_model.rbf.lengthscale)" 652 | ] 653 | } 
654 | ], 655 | "metadata": { 656 | "kernelspec": { 657 | "display_name": "Python 3", 658 | "language": "python", 659 | "name": "python3" 660 | }, 661 | "language_info": { 662 | "codemirror_mode": { 663 | "name": "ipython", 664 | "version": 3 665 | }, 666 | "file_extension": ".py", 667 | "mimetype": "text/x-python", 668 | "name": "python", 669 | "nbconvert_exporter": "python", 670 | "pygments_lexer": "ipython3", 671 | "version": "3.5.2" 672 | } 673 | }, 674 | "nbformat": 4, 675 | "nbformat_minor": 2 676 | } 677 | -------------------------------------------------------------------------------- /sem4-GP/2d_demo.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/2d_demo.mp4 -------------------------------------------------------------------------------- /sem4-GP/3_LargeScaleGP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "### Large-scale GP\n", 12 | "\n", 13 | "Predictive mean and variance of GPR model:\n", 14 | "$$\n", 15 | "m(x_*) = \\mathbf{k}^T \\mathbf{K}_y^{-1} \\mathbf{y},\n", 16 | "$$\n", 17 | "$$\n", 18 | "\\sigma^2(x_*) = k(x_*, x_*) - \\mathbf{k}^T\\mathbf{K}_y^{-1}\\mathbf{k}\n", 19 | "$$\n", 20 | "\n", 21 | "**Issue**: the computational complexity is $\\mathcal{O}(N^3)$, where $N$ is the training size." 
22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": { 27 | "slideshow": { 28 | "slide_type": "slide" 29 | } 30 | }, 31 | "source": [ 32 | "### Nystrom approximation\n", 33 | "\n", 34 | "Idea: introduce inducing points $(X_u, \mathbf{u})$ which are used for low-rank approximation of covariance matrix:\n", 35 | "$$\n", 36 | "\mathbf{K} \approx \mathbf{K}_{NM} \mathbf{K}_{MM}^{-1} \mathbf{K}_{MN}, \quad \mathbf{K}_{NM} = k(X, X_u), \mathbf{K}_{MM} = k(X_u, X_u), \mathbf{K}_{MN} = k(X_u, X)\n", 37 | "$$\n", 38 | "\n", 39 | "Predictive distribution:\n", 40 | "$$\n", 41 | "f_* \; | \; x_*, X, \mathbf{y} \sim \mathcal{N}\left (m(x_*), \; \sigma^2(x_*)\right ),\n", 42 | "$$\n", 43 | "$$\n", 44 | "m(x_*) = \mathbf{k}^T \left (\mathbf{K}_{NM}\mathbf{K}_{MM}^{-1} \mathbf{K}_{MN} + \sigma_n^2 I \right )^{-1} \mathbf{y}\n", 45 | "$$\n", 46 | "$$\n", 47 | "\sigma^2(x_*) = k(x_*, x_*) - \mathbf{k}^T\left (\mathbf{K}_{NM}\mathbf{K}_{MM}^{-1}\mathbf{K}_{MN} + \sigma^2_n I \right)^{-1} \mathbf{k}\n", 48 | "$$" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "slideshow": { 55 | "slide_type": "slide" 56 | } 57 | }, 58 | "source": [ 59 | "Using Woodbury matrix identity we can calculate the inverse more efficiently:\n", 60 | "$$\n", 61 | "\left (\sigma_n^2 I + \mathbf{K}_{NM}\mathbf{K}_{MM}^{-1} \mathbf{K}_{MN} \right)^{-1} = \sigma_n^{-2} \left (\n", 62 | "I - \mathbf{K}_{NM} \left (\sigma_n^2 \mathbf{K}_{MM} + \mathbf{K}_{MN} \mathbf{K}_{NM} \right )^{-1} \mathbf{K}_{MN}\n", 63 | "\right )\n", 64 | "$$\n", 65 | "\n", 66 | "The computational complexity is $\mathcal{O}(NM^2)$." 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true, 74 | "slideshow": { 75 | "slide_type": "skip" 76 | } 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from __future__ import print_function\n", 81 | "\n", 82 | "import pandas as pd\n", 83 | "import numpy as np\n", 84 | "import GPy\n", 85 | "from sklearn.model_selection import train_test_split\n", 86 | "from sklearn.metrics import mean_squared_error, r2_score\n", 87 | "from sklearn.preprocessing import StandardScaler\n", 88 | "from sklearn.model_selection import cross_val_predict\n", 89 | "from sklearn.pipeline import Pipeline\n", 90 | "from sklearn import svm\n", 91 | "\n", 92 | "from matplotlib import pyplot\n", 93 | "\n", 94 | "%matplotlib notebook" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "slideshow": { 101 | "slide_type": "skip" 102 | } 103 | }, 104 | "source": [ 105 | "For convenience we wrap GPy model to have sklearn-like API to use it in `cross_val_predict()` function from sklearn\n", 106 | "\n", 107 | "Note, that in this implementation we generate random inducing inputs and fix them. However, inducing points can be optimized." 
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true, 115 | "slideshow": { 116 | "slide_type": "skip" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "from sklearn.base import BaseEstimator\n", 122 | "\n", 123 | "class SparseGPModel(BaseEstimator):\n", 124 | " def __init__(self, kernel, num_inducing=100):\n", 125 | " self.kernel_ = kernel\n", 126 | " self.num_inducing = num_inducing\n", 127 | " \n", 128 | " def fit(self, X, y):\n", 129 | " idx = np.random.permutation(X.shape[0])\n", 130 | " Z = X[idx[:self.num_inducing]]\n", 131 | " self.model_ = GPy.models.SparseGPRegression(X, y, kernel=self.kernel_, Z=Z)\n", 132 | " self.model_.inducing_inputs.fix()\n", 133 | "\n", 134 | " self.model_.optimize(max_iters=100)\n", 135 | " \n", 136 | " def predict(self, X):\n", 137 | " prediction, _ = self.model_.predict(X)\n", 138 | " return prediction\n", 139 | " \n", 140 | " def score(self, X, y):\n", 141 | " prediction = self.predict(X)\n", 142 | " return r2_score(y, prediction)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": { 148 | "slideshow": { 149 | "slide_type": "skip" 150 | } 151 | }, 152 | "source": [ 153 | "Let's load house pricing data again." 
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true, 161 | "slideshow": { 162 | "slide_type": "skip" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "data = np.loadtxt('house_pricing.csv')\n", 168 | "\n", 169 | "scaler = StandardScaler()\n", 170 | "\n", 171 | "X = scaler.fit_transform(data[:, :-1])\n", 172 | "y = data[:, -1:]\n", 173 | "y_log = np.log(y)\n", 174 | "\n", 175 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true, 183 | "scrolled": false, 184 | "slideshow": { 185 | "slide_type": "skip" 186 | } 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "%%time\n", 191 | "kernel = GPy.kern.RBF(X.shape[1])\n", 192 | "\n", 193 | "model = SparseGPModel(kernel, num_inducing=100)\n", 194 | "prediction = cross_val_predict(model, X, np.log1p(y), cv=3, n_jobs=1)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": true, 202 | "slideshow": { 203 | "slide_type": "skip" 204 | } 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "prediction = np.expm1(prediction)\n", 209 | "R2 = r2_score(y, prediction)\n", 210 | "print(R2)\n", 211 | "\n", 212 | "def scatter_plot(y_test, prediction):\n", 213 | " pyplot.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--k')\n", 214 | " pyplot.scatter(y_test, prediction)\n", 215 | " pyplot.xlabel('Actual value')\n", 216 | " pyplot.ylabel('Predicted value')\n", 217 | " pyplot.show()\n", 218 | " \n", 219 | "scatter_plot(y, prediction)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "slideshow": { 226 | "slide_type": "skip" 227 | } 228 | }, 229 | "source": [ 230 | "### Task\n", 231 | "\n", 232 | "For different number of inducing points (100, 200, 300, 500) build GP 
model and plot figure of how model accuracy and building time changes." 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": true, 240 | "slideshow": { 241 | "slide_type": "skip" 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "######## Your code here ########\n", 247 | "import time\n", 248 | "\n", 249 | "n_inducing = [100, 200, 300, 500]\n", 250 | "errors = []\n", 251 | "times = []\n" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": { 257 | "slideshow": { 258 | "slide_type": "skip" 259 | } 260 | }, 261 | "source": [ 262 | "Plot figures" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": { 269 | "collapsed": true, 270 | "scrolled": false, 271 | "slideshow": { 272 | "slide_type": "skip" 273 | } 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "figure, ax = pyplot.subplots(1, 2, figsize=(6, 3))\n", 278 | "ax[0].plot(n_inducing, errors, '.', label='R2')\n", 279 | "ax[0].plot(n_inducing, errors, '-', label='R2')\n", 280 | "ax[1].plot(n_inducing, times, '.', label='Training time')\n", 281 | "ax[1].plot(n_inducing, times, '-', label='Training time')\n", 282 | "figure.tight_layout()\n", 283 | "pyplot.show()" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": { 289 | "slideshow": { 290 | "slide_type": "slide" 291 | } 292 | }, 293 | "source": [ 294 | "### Random Fourier Features\n", 295 | "\n", 296 | "Idea: shift-invariant kernel can be represented as\n", 297 | "$$\n", 298 | "k(x, y) = k(x - y) = \\int p(w) e^{jw^T(x - y)} dw\n", 299 | "$$\n", 300 | "\n", 301 | "Let's calculate integral approximately by Monte Carlo\n", 302 | "$$\n", 303 | "k(x, y) \\approx \\frac{1}{M} \\sum_{i=1}^M \\phi_i^T(x) \\phi_i(y),\n", 304 | "$$\n", 305 | "where $\\phi_i(x) = \\sqrt{2}\\cos(w^Tx + b)$, $w \\sim p(w), b \\sim Uniform([0, 2\\pi])$.\n", 306 | "\n", 307 | "This means that the covariance matrix is 
approximated by $\\mathbf{K} = \\Phi \\Phi^T$, where $\\Phi = \\|\\boldsymbol{\\phi}(x_i)\\|_{i = 1}^N, \\quad \\boldsymbol{\\phi}(x) = (\\phi_1(x), \\ldots, \\phi_M(x))$" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "slideshow": { 314 | "slide_type": "slide" 315 | } 316 | }, 317 | "source": [ 318 | "So, go back from functional space view to weight-space view:\n", 319 | "$$\n", 320 | "y = \\beta^T\\phi(x) + \\varepsilon, \\quad \\beta \\sim \\mathcal{N}(0, \\; \\Sigma), \\quad \\varepsilon \\sim \\mathcal{N}(0, \\; \\sigma_n^2)\n", 321 | "$$\n", 322 | "The predictive distribution in this case:\n", 323 | "$$\n", 324 | "f_* \\; | \\; x_*, X, \\mathbf{y} = \\mathcal{N}\\left (\\frac{1}{\\sigma_n^2}\\boldsymbol{\\phi}(x_*)^TA^{-1}\\Phi^T \\mathbf{y},\\;\n", 325 | "\\boldsymbol{\\phi}(x_*)^T A^{-1}\\boldsymbol{\\phi}(x_*)\n", 326 | "\\right ), \\quad A = \\sigma_n^{-2}\\Phi^T \\Phi + \\Sigma^{-1}\n", 327 | "$$\n", 328 | "The computational complexity is $\\mathcal{O}(NM^2)$." 
329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "### Task\n", 336 | "\n", 337 | "Implement generation of RFF" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": true, 345 | "slideshow": { 346 | "slide_type": "skip" 347 | } 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "from sklearn.base import BaseEstimator\n", 352 | "from sklearn.exceptions import NotFittedError\n", 353 | "from scipy.stats import cauchy, laplace\n", 354 | "from sklearn.metrics.pairwise import rbf_kernel, laplacian_kernel\n", 355 | "\n", 356 | "\n", 357 | "class RFF(BaseEstimator):\n", 358 | " def __init__(self, gamma=1, n_components=50, kernel=\"rbf\"):\n", 359 | " self.gamma = gamma\n", 360 | " self.kernel = kernel\n", 361 | " # Number of features (Monte Carlo samples)\n", 362 | " self.n_components = n_components\n", 363 | " self.fitted = False\n", 364 | " \n", 365 | " def fit(self, X, y=None):\n", 366 | " \"\"\" Generates MonteCarlo random samples \"\"\"\n", 367 | " d = X.shape[1]\n", 368 | " \n", 369 | " ######## Your coder here ########\n", 370 | " #Generate D iid samples from p(w)\n", 371 | " \n", 372 | " if self.kernel == \"rbf\": # for RBF kernel p(w) ~ exp(-gamma * w^2)\n", 373 | " self.w = \n", 374 | " elif self.kernel == \"laplace\": # for Laplace distribution p(w) ~ Cauchy(gamma)\n", 375 | " self.w = \n", 376 | " \n", 377 | " #Generate D iid samples from Uniform(0,2*pi) \n", 378 | " self.u = \n", 379 | " self.fitted = True\n", 380 | " return self\n", 381 | " \n", 382 | " def transform(self, X):\n", 383 | " \"\"\" Transforms the data X (n_samples, n_features) to the new map space Z(X) (n_samples, n_components)\"\"\"\n", 384 | " if not self.fitted:\n", 385 | " raise NotFittedError(\"RBF_MonteCarlo must be fitted beform computing the feature map Z\")\n", 386 | " \n", 387 | " ######## Your coder here ########\n", 388 | " #Compute feature map Z(x):\n", 389 | " 
Z = \n", 390 | " return Z\n", 391 | " \n", 392 | " def compute_kernel(self, X):\n", 393 | " \"\"\" Computes the approximated kernel matrix K \"\"\"\n", 394 | " if not self.fitted:\n", 395 | " raise NotFittedError(\"RBF_MonteCarlo must be fitted beform computing the kernel matrix\")\n", 396 | " Z = self.transform(X)\n", 397 | " K = Z.dot(Z.T)\n", 398 | " return K" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "Now, generate 100 random points from [0, 1]^d, calculate exact kernel matrix for RBF and Exponential kernels,\n", 406 | "calculate their approximations using RFF and check that they are close." 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "######## Your code here ########\n", 418 | "dim = 4\n", 419 | "rbf = GPy.kern.RBF(dim, lengthscale=1 / np.sqrt(2)).K\n", 420 | "\n", 421 | "exponential = GPy.kern.Exponential(dim).K\n", 422 | "\n", 423 | "np.random.seed(42)\n", 424 | "x = np.random.rand(100, dim)\n", 425 | "\n", 426 | "######## Your code here ########\n", 427 | "# 1. Calculate exact kernel matrix for RBF kernel and Exponential kernels\n", 428 | "# 2. Calculate approximations using RFF\n", 429 | "# 3. Calculate approximation error ||K_exact - K_approx|| / ||K_exact|| and check whether the norm is small\n" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": { 435 | "slideshow": { 436 | "slide_type": "skip" 437 | } 438 | }, 439 | "source": [ 440 | "### Task\n", 441 | "\n", 442 | "For different number of inducing points (100, 200, 300, 500) build GP model and plot figure of how model accuracy and building time changes." 
443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": { 449 | "collapsed": true, 450 | "slideshow": { 451 | "slide_type": "skip" 452 | } 453 | }, 454 | "outputs": [], 455 | "source": [ 456 | "######## Your code here ########\n", 457 | "# Hint: use Pipeline from sklearn\n", 458 | "\n", 459 | "n_inducing = [100, 200, 300, 500]\n", 460 | "errors = []\n", 461 | "times = []\n" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "slideshow": { 468 | "slide_type": "skip" 469 | } 470 | }, 471 | "source": [ 472 | "Plot figures" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": { 479 | "collapsed": true, 480 | "slideshow": { 481 | "slide_type": "skip" 482 | } 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "figure, ax = pyplot.subplots(1, 2, figsize=(6, 3))\n", 487 | "ax[0].plot(n_inducing, errors, '.', label='R2')\n", 488 | "ax[0].plot(n_inducing, errors, '-', label='R2')\n", 489 | "ax[0].legend(loc='best')\n", 490 | "ax[1].plot(n_inducing, times, '.', label='Training time')\n", 491 | "ax[1].plot(n_inducing, times, '-', label='Training time')\n", 492 | "ax[1].legend(loc='best')\n", 493 | "figure.tight_layout()\n", 494 | "\n", 495 | "pyplot.show()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": { 502 | "collapsed": true 503 | }, 504 | "outputs": [], 505 | "source": [] 506 | } 507 | ], 508 | "metadata": { 509 | "kernelspec": { 510 | "display_name": "Python 3", 511 | "language": "python", 512 | "name": "python3" 513 | }, 514 | "language_info": { 515 | "codemirror_mode": { 516 | "name": "ipython", 517 | "version": 3 518 | }, 519 | "file_extension": ".py", 520 | "mimetype": "text/x-python", 521 | "name": "python", 522 | "nbconvert_exporter": "python", 523 | "pygments_lexer": "ipython3", 524 | "version": "3.5.2" 525 | } 526 | }, 527 | "nbformat": 4, 528 | "nbformat_minor": 2 529 | } 530 | 
-------------------------------------------------------------------------------- /sem4-GP/EI_vs_logEI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/EI_vs_logEI.png -------------------------------------------------------------------------------- /sem4-GP/airline.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/airline.mat -------------------------------------------------------------------------------- /sem4-GP/airline_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes2017/eb4b1f0019452a21a2df8238c1891976b5c5f3e3/sem4-GP/airline_result.png -------------------------------------------------------------------------------- /sem4-GP/bayes_opt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from scipy.stats import norm 4 | from scipy.optimize import minimize 5 | 6 | from matplotlib import pyplot 7 | 8 | import GPy 9 | 10 | 11 | def lower_confidence_bound(mean_values, std_values, coefficient=2): 12 | return mean_values.ravel() - coefficient * std_values.ravel() 13 | 14 | 15 | def log_expected_improvement(mean_values, variance_values, opt_value): 16 | estimated_values = mean_values.ravel() 17 | eps = 0.05/len(estimated_values) 18 | 19 | delta = (opt_value - estimated_values - eps).ravel() 20 | 21 | estimated_errors = (variance_values ** 0.5).ravel() 22 | 23 | non_zero_error_inds = np.where(estimated_errors > 1e-6)[0] 24 | Z = np.zeros(len(delta)) 25 | Z[non_zero_error_inds] = delta[non_zero_error_inds]/estimated_errors[non_zero_error_inds] 26 | log_EI = np.log(estimated_errors) + norm.logpdf(Z) + np.log(1 + Z * 
np.exp(norm.logcdf(Z) - norm.logpdf(Z))) 27 | return log_EI 28 | 29 | 30 | def expected_improvement(mean_values, std_values, opt_values): 31 | improvement = (opt_values.ravel()[0] - mean_values).ravel() 32 | std_values = std_values.ravel() 33 | EI = improvement * norm.cdf(improvement / std_values) + std_values * norm.pdf(improvement / std_values) 34 | return EI 35 | 36 | 37 | def get_new_point(model, lb, ub, data=None, multistart=10, criterion='ei', k=1, random_state=None): 38 | if random_state is None: 39 | random_state = np.random.RandomState() 40 | 41 | lb = np.array(lb).reshape(1, -1) 42 | ub = np.array(ub).reshape(1, -1) 43 | x_random = random_state.uniform(size=(multistart, np.array(lb).ravel().shape[0])) 44 | x_random *= ub - lb 45 | x_random += lb 46 | 47 | def objective(x): 48 | if x.ndim == 1: 49 | x = x.reshape(1, -1) 50 | mean_values, variance = model.predict(x) 51 | if criterion == 'ei': 52 | return -log_expected_improvement(mean_values, variance, data[1].min()) 53 | elif criterion == 'lcb': 54 | return lower_confidence_bound(mean_values, std_values, k) 55 | else: 56 | raise NotImplementedError('Criterion is not implemented!') 57 | 58 | criterion_value = objective(x_random) 59 | 60 | best_result = None 61 | best_value = np.inf 62 | for x_init in x_random: 63 | optimization_result = minimize(objective, x_init, method='L-BFGS-B', bounds=np.vstack((lb, ub)).T) 64 | 65 | if optimization_result.fun < best_value: 66 | best_result = optimization_result 67 | best_value = best_result.fun[0] 68 | return best_result.x, best_result.fun 69 | 70 | 71 | def optimization_step(x_train, y_train, kernel, objective, lb=None, ub=None, criterion='ei', k=1, plot=False): 72 | model = GPy.models.GPRegression(x_train, y_train, kernel) 73 | model.optimize_restarts(num_restarts=10, verbose=False) 74 | 75 | x_new, criterion_value = get_new_point(model, data=(x_train, y_train), lb=lb, ub=ub, criterion=criterion, k=k) 76 | if plot: 77 | plot1d(x_train, y_train, model, objective, 
x_new, criterion_value) 78 | pyplot.show() 79 | 80 | x_new = x_new.reshape(1, -1) 81 | x_train = np.vstack([x_train, x_new]) 82 | y_train = np.vstack([y_train, np.asarray(objective(x_new)).reshape(1, -1)]) 83 | return x_train, y_train, model 84 | 85 | 86 | def plot1d(x_train, y_train, model, objective, x_new, criterion_value): 87 | x_grid = np.linspace(0, 1, 100).reshape(-1, 1) 88 | y_grid = objective(x_grid) 89 | 90 | prediction, variance = model.predict(x_grid) 91 | std = np.sqrt(variance) 92 | prediction = prediction.ravel() 93 | std = std.ravel() 94 | 95 | pyplot.figure(figsize=(8, 6)) 96 | pyplot.plot(x_train, y_train, 'or', markersize=8, label='Training set') 97 | pyplot.plot(x_grid, y_grid, '--b', linewidth=2, label='True function') 98 | pyplot.plot(x_grid, prediction, '-k', linewidth=2, label='Approximation') 99 | pyplot.fill_between(x_grid.ravel(), prediction - 2 * std, prediction + 2 * std, alpha=0.3) 100 | pyplot.plot(x_new, objective(x_new), 'og', markersize=10, label='New point') 101 | pyplot.ylim([-15, 20]) 102 | pyplot.legend(loc='best') 103 | 104 | 105 | def plot2d(objective, x_train, y_train, model): 106 | grid_size = 50 107 | x = np.meshgrid(np.linspace(-1, 1, grid_size), np.linspace(-1, 1, grid_size)) 108 | x = np.hstack((x[0].reshape(-1, 1), x[1].reshape(-1, 1))) 109 | y = objective(x) 110 | 111 | prediction, variance = model.predict(x) 112 | std = np.sqrt(variance).ravel() 113 | 114 | x_train = (x_train + 1) * grid_size / 2 115 | log_EI = np.exp(log_expected_improvement(prediction, std, y_train.min())) 116 | 117 | values = [prediction, y, std, log_EI] 118 | names = ['Predicted values', 'Exact values', 'Predicted std', 'log EI'] 119 | 120 | figure, axes = pyplot.subplots(nrows=2, ncols=2, figsize=(6, 6)) 121 | 122 | for i, ax in enumerate(axes.ravel()): 123 | if i < 3: 124 | ax.imshow(values[i].reshape(grid_size, grid_size), vmin=0, vmax=1, alpha=0.8) 125 | else: 126 | ax.imshow(values[i].reshape(grid_size, grid_size), alpha=0.8) 127 | 
ax.scatter(x_train[:-1, 0], x_train[:-1, 1], c='r', s=20) 128 | ax.scatter(x_train[-1, 0], x_train[-1, 1], marker='d', edgecolor='k', c='g', s=180) 129 | ax.set_xlim([-0.5, grid_size + 0.5]) 130 | ax.set_ylim([-0.5, grid_size + 0.5]) 131 | ax.axis('off') 132 | ax.set_title(names[i]) 133 | 134 | figure.tight_layout() 135 | 136 | 137 | def demo_2d(n_init, budget, kernel, save_path='./library/2d_demo.mp4'): 138 | global x_train, y_train, model 139 | 140 | def f2d(x): 141 | t = np.sum((x + 0.6)**2, axis=1) - 0.3 142 | y = np.sin(t)**2 / np.tanh(t**2 + 0.4) 143 | return y.reshape(-1, 1) 144 | 145 | lb = [-1, -1] 146 | ub = [1, 1] 147 | np.random.seed(42) 148 | x_train = np.random.rand(n_init, 2) * 2 - 1 149 | y_train = f2d(x_train) 150 | 151 | model = GPy.models.GPRegression(x_train, y_train, kernel) 152 | model.optimize() 153 | 154 | # Set up formatting for the movie files 155 | import matplotlib.animation as animation 156 | from mpl_toolkits.axes_grid1 import make_axes_locatable 157 | 158 | Writer = animation.writers['ffmpeg_file'] 159 | writer = Writer(fps=1, metadata=dict(artist='Yermek Kapushev')) 160 | 161 | grid_size = 50 162 | x = np.meshgrid(np.linspace(-1, 1, grid_size), np.linspace(-1, 1, grid_size)) 163 | x = np.hstack((x[0].reshape(-1, 1), x[1].reshape(-1, 1))) 164 | y = f2d(x) 165 | 166 | 167 | def get_model_values(model, x, x_train): 168 | prediction, variance = model.predict(x) 169 | std = np.sqrt(variance).ravel() 170 | 171 | log_EI = np.exp(log_expected_improvement(prediction, std, y_train.min())) 172 | 173 | values = [prediction, y, log_EI] 174 | return values 175 | 176 | 177 | values = get_model_values(model, x, x_train) 178 | history = [y_train.min()] 179 | names = ['Predicted values', 'Exact values', 'log EI'] 180 | 181 | # Set up initial canvas 182 | figure, axes = pyplot.subplots(nrows=2, ncols=2, figsize=(6, 6)) 183 | heatmaps = [] 184 | scatters = [] 185 | new_point_scatters = [] 186 | for i, ax in enumerate(axes.ravel()[:-1]): 187 | 
heatmaps.append(ax.matshow(values[i].reshape(grid_size, grid_size), alpha=0.8)) 188 | x_scatter = (x_train + 1) * grid_size / 2 189 | scatters.append(ax.scatter(x_scatter[:-1, 0], x_scatter[:-1, 1], c='r', s=20)) 190 | new_point_scatters.append(ax.scatter(x_scatter[-1, 0], x_scatter[-1, 1], marker='d', edgecolor='k', 191 | c='g', s=180)) 192 | 193 | divider = make_axes_locatable(ax) 194 | cax = divider.append_axes("right", size="5%", pad=0.05) 195 | figure.colorbar(heatmaps[-1], cax=cax) 196 | ax.set_xlim([-0.5, grid_size + 0.5]) 197 | ax.set_ylim([-0.5, grid_size + 0.5]) 198 | ax.axis('off') 199 | ax.set_title(names[i]) 200 | 201 | convergence_plot = axes.ravel()[-1].plot([y_train.shape[0]], [y_train.min()], '-') 202 | axes.ravel()[-1].set_xlabel('iteration') 203 | axes.ravel()[-1].set_ylabel(r'$y_{min}$') 204 | axes.ravel()[-1].set_xlim([n_init - 1, n_init + budget]) 205 | axes.ravel()[-1].set_ylim([0, 0.0073]) 206 | figure.tight_layout() 207 | 208 | 209 | # Define function that updates figure 210 | def update_fig(iteration): 211 | global x_train, y_train, model 212 | # global y_train 213 | # global model 214 | 215 | if iteration == 0: 216 | return heatmaps + scatters + new_point_scatters + convergence_plot 217 | 218 | model = GPy.models.GPRegression(x_train, y_train, model.kern) 219 | model.optimize() 220 | 221 | x_new, criterion = get_new_point(model, lb, ub, data=(x_train, y_train), multistart=10, random_state=None) 222 | x_new = x_new.reshape(1, -1) 223 | x_train = np.vstack([x_train, x_new]) 224 | y_train = np.vstack([y_train, f2d(x_new)]) 225 | history.append(y_train.min()) 226 | 227 | values = get_model_values(model, x, x_train) 228 | 229 | for i, val in enumerate(values): 230 | heatmaps[i].set_array(val.reshape(grid_size, -1)) 231 | x_scatter = (x_train + 1) * grid_size / 2 232 | scatters[i].set_offsets(x_scatter[:-1]) 233 | new_point_scatters[i].set_offsets(x_scatter[-1:]) 234 | 235 | # adjust colorbar for std and log EI plot 236 | vmin = val.min() 237 | 
vmax = val.max() 238 | heatmaps[i].set_clim(vmin, vmax) 239 | 240 | convergence_plot[0].set_data(range(n_init, y_train.shape[0] + 1), history) 241 | 242 | return heatmaps + scatters + new_point_scatters + convergence_plot 243 | 244 | 245 | 246 | anim = animation.FuncAnimation(figure, update_fig, 247 | blit=False, 248 | repeat=False, 249 | frames=budget) 250 | anim.save(save_path, writer=writer) 251 | --------------------------------------------------------------------------------