├── README.md ├── homework.pdf ├── lectures ├── day1 │ ├── 0. Dmitry Vetrov - Opening remarks.pdf │ ├── 1. Dmitry Vetrov - Bayesian framework.pdf │ ├── 2. Dmitry Vetrov - Variational inference.pdf │ ├── 3. Dmitry Vetrov - Latent variable models.pdf │ └── readme.md ├── day2 │ ├── 1. Dmitry Vetrov - Stochastic variational inference.pdf │ ├── 2. Artem Sobolev - Discrete Variable Models.pdf │ └── readme.md ├── day3 │ ├── 1. Egor Zakharov - GANs .pdf │ ├── 2. Arsenii Ashukha - Normalizing flows.pdf │ └── readme.md ├── day4 │ ├── 1. Evgeny Burnaev - Gaussian processes.pdf │ ├── 2. Maurizio Filippone - Deep Gaussian Processes.pdf │ ├── 3. Sergey Bartunov - AdaGram.pdf │ └── readme.md ├── day5 │ ├── 1. Dmitry Kropotov - MCMC.pdf │ ├── 2. Kirill Neklyudov - Langevin dynamics │ │ ├── 87_strong_approx_JOTA.pdf │ │ ├── CHS1987.pdf │ │ ├── Gardiner-C-Handbook-of-Stochastic-Methods-2nd-Edition.djvu │ │ ├── Oksendal_B_Stochastic_differential.pdf │ │ ├── gelfand1991.pdf │ │ ├── gelfand1993.pdf │ │ ├── gidas1985.pdf │ │ └── readme.md │ ├── 3. Francisco Ruiz - VI with implicit and semi-implicit models.pdf │ ├── Sponsor talk. Ksenia Shinkarenko.pdf │ └── readme.md └── day6 │ ├── 1. Molchanov - Bayesian Neural Networks.pdf │ ├── 2. Andrey Malinin - Uncertainty estimation in supervised learning.pdf │ ├── 3. Dmitry Molchanov - Loss Surfaces.pdf │ └── readme.md ├── preliminary_materials.pdf └── seminars ├── day1 ├── 1.Bayesian reasoning - slides.pdf ├── 1.Bayesian reasoning-problem set.pdf ├── 2.Approximate inference - problem set.pdf ├── 2.Approximate inference - slides.pdf └── readme.md ├── day2 ├── 1. VAE + DRAW solution.ipynb ├── 1. VAE + DRAW.ipynb ├── 2. Gumbel for SS-VAE solution.ipynb ├── 2. 
Gumbel for SS-VAE.ipynb └── readme.md ├── day3 ├── gan │ ├── GAN_sem.ipynb │ ├── readme.md │ └── solutions_GAN_sem.ipynb └── nf │ ├── nf-assignment.ipynb │ ├── nf-solution.ipynb │ └── readme.md ├── day4 ├── AdaGram │ ├── AdaGram seminar.ipynb │ └── readme.md └── gp │ ├── BayesOpt │ ├── bayesopt_practice.ipynb │ ├── bayesopt_solution.ipynb │ ├── test_data.csv │ ├── training_data.csv │ └── utils.py │ ├── GP │ ├── EI_vs_logEI.png │ ├── airline.npz │ ├── airline_result.png │ ├── gp.png │ ├── gp_practice.ipynb │ ├── gp_solution.ipynb │ └── utils.py │ └── readme.md ├── day5 ├── Markov Chain Monte-Carlo Solution.ipynb ├── Markov Chain Monte-Carlo.ipynb └── readme.md └── day6 ├── SparseVD-assignment-colab.ipynb ├── SparseVD-assignment.ipynb ├── SparseVD-solution.ipynb ├── local_logger.py └── readme.md /README.md: -------------------------------------------------------------------------------- 1 | # Materials of the [Summer school on Deep learning and Bayesian methods 2019](http://deepbayes.ru/) 2 | -------------------------------------------------------------------------------- /homework.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/homework.pdf -------------------------------------------------------------------------------- /lectures/day1/0. Dmitry Vetrov - Opening remarks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day1/0. Dmitry Vetrov - Opening remarks.pdf -------------------------------------------------------------------------------- /lectures/day1/1. 
Dmitry Vetrov - Bayesian framework.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day1/1. Dmitry Vetrov - Bayesian framework.pdf -------------------------------------------------------------------------------- /lectures/day1/2. Dmitry Vetrov - Variational inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day1/2. Dmitry Vetrov - Variational inference.pdf -------------------------------------------------------------------------------- /lectures/day1/3. Dmitry Vetrov - Latent variable models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day1/3. Dmitry Vetrov - Latent variable models.pdf -------------------------------------------------------------------------------- /lectures/day1/readme.md: -------------------------------------------------------------------------------- 1 | # Day 1 lectures 2 | -------------------------------------------------------------------------------- /lectures/day2/1. Dmitry Vetrov - Stochastic variational inference.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day2/1. Dmitry Vetrov - Stochastic variational inference.pdf -------------------------------------------------------------------------------- /lectures/day2/2. 
Artem Sobolev - Discrete Variable Models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day2/2. Artem Sobolev - Discrete Variable Models.pdf -------------------------------------------------------------------------------- /lectures/day2/readme.md: -------------------------------------------------------------------------------- 1 | # Day 2 lectures 2 | 3 | Link to invited lecture: [3. Novi Quadrianto - Fair machine learning](https://predictive-analytics-lab.github.io/presentations/moscow2019.html#/) 4 | -------------------------------------------------------------------------------- /lectures/day3/1. Egor Zakharov - GANs .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day3/1. Egor Zakharov - GANs .pdf -------------------------------------------------------------------------------- /lectures/day3/2. Arsenii Ashukha - Normalizing flows.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day3/2. Arsenii Ashukha - Normalizing flows.pdf -------------------------------------------------------------------------------- /lectures/day3/readme.md: -------------------------------------------------------------------------------- 1 | # Day 3 lectures 2 | -------------------------------------------------------------------------------- /lectures/day4/1. Evgeny Burnaev - Gaussian processes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day4/1. 
Evgeny Burnaev - Gaussian processes.pdf -------------------------------------------------------------------------------- /lectures/day4/2. Maurizio Filippone - Deep Gaussian Processes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day4/2. Maurizio Filippone - Deep Gaussian Processes.pdf -------------------------------------------------------------------------------- /lectures/day4/3. Sergey Bartunov - AdaGram.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day4/3. Sergey Bartunov - AdaGram.pdf -------------------------------------------------------------------------------- /lectures/day4/readme.md: -------------------------------------------------------------------------------- 1 | ### Day 4 lectures 2 | -------------------------------------------------------------------------------- /lectures/day5/1. Dmitry Kropotov - MCMC.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/1. Dmitry Kropotov - MCMC.pdf -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/87_strong_approx_JOTA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. Kirill Neklyudov - Langevin dynamics/87_strong_approx_JOTA.pdf -------------------------------------------------------------------------------- /lectures/day5/2. 
Kirill Neklyudov - Langevin dynamics/CHS1987.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. Kirill Neklyudov - Langevin dynamics/CHS1987.pdf -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/Gardiner-C-Handbook-of-Stochastic-Methods-2nd-Edition.djvu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. Kirill Neklyudov - Langevin dynamics/Gardiner-C-Handbook-of-Stochastic-Methods-2nd-Edition.djvu -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/Oksendal_B_Stochastic_differential.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. Kirill Neklyudov - Langevin dynamics/Oksendal_B_Stochastic_differential.pdf -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/gelfand1991.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. Kirill Neklyudov - Langevin dynamics/gelfand1991.pdf -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/gelfand1993.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. 
Kirill Neklyudov - Langevin dynamics/gelfand1993.pdf -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/gidas1985.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/2. Kirill Neklyudov - Langevin dynamics/gidas1985.pdf -------------------------------------------------------------------------------- /lectures/day5/2. Kirill Neklyudov - Langevin dynamics/readme.md: -------------------------------------------------------------------------------- 1 | # Materials for 2nd lecture 2 | 3 | [__LINK TO SLIDES__](https://docs.google.com/presentation/d/1_yekoTv_CHRgz6vsT57RMDESHjlnbGQvq8tYCxKLyW0/edit?usp=sharing) 4 | -------------------------------------------------------------------------------- /lectures/day5/3. Francisco Ruiz - VI with implicit and semi-implicit models.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/3. Francisco Ruiz - VI with implicit and semi-implicit models.pdf -------------------------------------------------------------------------------- /lectures/day5/Sponsor talk. Ksenia Shinkarenko.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day5/Sponsor talk. 
Ksenia Shinkarenko.pdf -------------------------------------------------------------------------------- /lectures/day5/readme.md: -------------------------------------------------------------------------------- 1 | # Day 5 slides 2 | 3 | * [Link to 2nd lecture](https://docs.google.com/presentation/d/1_yekoTv_CHRgz6vsT57RMDESHjlnbGQvq8tYCxKLyW0/edit?usp=sharing) 4 | * [Link to videos for sponsor talk](https://drive.google.com/drive/folders/1v7smJoEgVohwuPSGE0hSGlF-pkUHD-lY) 5 | -------------------------------------------------------------------------------- /lectures/day6/1. Molchanov - Bayesian Neural Networks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day6/1. Molchanov - Bayesian Neural Networks.pdf -------------------------------------------------------------------------------- /lectures/day6/2. Andrey Malinin - Uncertainty estimation in supervised learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day6/2. Andrey Malinin - Uncertainty estimation in supervised learning.pdf -------------------------------------------------------------------------------- /lectures/day6/3. Dmitry Molchanov - Loss Surfaces.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/lectures/day6/3. 
Dmitry Molchanov - Loss Surfaces.pdf -------------------------------------------------------------------------------- /lectures/day6/readme.md: -------------------------------------------------------------------------------- 1 | # Lectures for 6th day 2 | -------------------------------------------------------------------------------- /preliminary_materials.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/preliminary_materials.pdf -------------------------------------------------------------------------------- /seminars/day1/1.Bayesian reasoning - slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day1/1.Bayesian reasoning - slides.pdf -------------------------------------------------------------------------------- /seminars/day1/1.Bayesian reasoning-problem set.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day1/1.Bayesian reasoning-problem set.pdf -------------------------------------------------------------------------------- /seminars/day1/2.Approximate inference - problem set.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day1/2.Approximate inference - problem set.pdf -------------------------------------------------------------------------------- /seminars/day1/2.Approximate inference - slides.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day1/2.Approximate inference - slides.pdf -------------------------------------------------------------------------------- /seminars/day1/readme.md: -------------------------------------------------------------------------------- 1 | # Day 1, theoretical seminars 2 | -------------------------------------------------------------------------------- /seminars/day2/1. VAE + DRAW.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep_bayes_VAE_blank.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "w8F28mYbZJUo", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "*(to use GPU in colab go to Runtime -> Change Runtime Type and change the hardware accelerator)*" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "x_Dd0j4pcw9-", 32 | "colab_type": "code", 33 | "colab": {} 34 | }, 35 | "source": [ 36 | "# some prelimenaries\n", 37 | "from torchvision.datasets import MNIST\n", 38 | "from torchvision import transforms\n", 39 | "import torch\n", 40 | "from torch import nn\n", 41 | "import numpy as np\n", 42 | "import matplotlib.pylab as plt\n", 43 | "\n", 44 | "torch.manual_seed(0)\n", 45 | "\n", 46 | "if torch.cuda.is_available():\n", 47 | " device = torch.device('cuda:0')\n", 48 | "else:\n", 49 | " device = torch.device('cpu')\n", 50 | "\n", 51 | "print('Using torch version {}'.format(torch.__version__))\n", 52 | "print('Using {} device'.format(device))\n", 53 | " \n", 54 | "# Training dataset\n", 55 | "train_loader = torch.utils.data.DataLoader(\n", 56 | " 
MNIST(root='.', train=True, download=True,\n", 57 | " transform=transforms.ToTensor()),\n", 58 | " batch_size=100, shuffle=True, pin_memory=True)\n", 59 | "# Test dataset\n", 60 | "test_loader = torch.utils.data.DataLoader(\n", 61 | " MNIST(root='.', train=False, transform=transforms.ToTensor()),\n", 62 | " batch_size=100, shuffle=True, pin_memory=True)" 63 | ], 64 | "execution_count": 0, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "_c1mRNbLct6G", 71 | "colab_type": "text" 72 | }, 73 | "source": [ 74 | "# Practical Session. Variational Autoencoders\n", 75 | "\n", 76 | "During this practical session, you will implement a vanilla VAE on MNIST and then a VAE extension with multiple latent variables. Both implementations will be based on classes for parametric probabilistic distributions from the torch [*torch.distributions*](https://pytorch.org/docs/stable/distributions.html) module to emphasize the probabilistic nature of the models.\n", 77 | "\n", 78 | "To complete the task, you will have to read the notebook and construct two loss functions using the classes and then train the models.\n", 79 | "\n", 80 | "# AEs vs. VAEs\n", 81 | "\n", 82 | "As illustrated below, autoencoders can provide good reconstruction quality. \n", 83 | "\n", 84 | "![Autoencoder reconstructions](https://github.com/bayesgroup/deepbayes-2018/blob/master/day2_vae/ae_reconstructions.png?raw=true)\n", 85 | "\n", 86 | "Still, the model has no control over the learned latent representations. For example, an interpolation of latent representations of two digits is typically not a latent representation for a digit:\n", 87 | "\n", 88 | "![Autoencoder interpolations](https://github.com/bayesgroup/deepbayes-2018/blob/master/day2_vae/ae_interpolations.png?raw=true)\n", 89 | "\n", 90 | "On the other hand, a standard VAE model forces latent representation to fit a multivariate Gaussian distribution. 
As a result, an interpolation of two latent representations is likely to be a latent representation of a digit." 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "YCGwVlZWdUA-", 97 | "colab_type": "text" 98 | }, 99 | "source": [ 100 | "# Distributions for VAE\n", 101 | "\n", 102 | "For the assignment, we will need two types of distributions to define the probabilistic model. For the representation $z$ we need a multivariate [normal distribution](https://pytorch.org/docs/stable/distributions.html#normal) with diagonal covariance matrix (to put it another way, a vector of independent normal random variables). For observations $x$, we will need a vector of independent [Bernoulli](https://pytorch.org/docs/stable/distributions.html#bernoulli) random variables. By default, both classes model a tensor of independent random **variables**. To represent a matrix of independent random variables as a batch of random **vectors** you may also use the [Independent](https://pytorch.org/docs/stable/distributions.html#independent) class.\n", 103 | "\n", 104 | "### Bernoulli random vector\n", 105 | "\n", 106 | "While the class can be initialized both with probabilities and logits, the best practice is to initialize the class with logits. Otherwise, computing logarithm of probability can be highly unstable. \n", 107 | "\n", 108 | "In the tasks, you will use this class to model $p(x | z)$ parametrized by the output of the decoder. To define the loss function you will need to compute $\\log p(x | z)$ for input images using *log_prob()* method.\n", 109 | "\n", 110 | "### Normal Distribution\n", 111 | "\n", 112 | "In this task, you will use the class to define the approximate posterior distribution $q(z | x)$ and the latent variable distribution $p(z)$.\n", 113 | "\n", 114 | "Again, you will use *log_prob()* method to compute the loss function. Besides that, you will need to generate a sample from $q(z | x)$ to pass it to the decoder. 
To implement the reparametrization trick the class defines a specific method *rsample()*, that computes $z = \\mu(x) + \\varepsilon \\odot \\sigma(x)$ for standard Gaussian noise $\\varepsilon$. Notice that the implementation of *rsample()* method differs from the implementation of *sample()* method." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "9r_hPaHx0Jz1", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "from torch.distributions import Normal, Bernoulli, Independent" 126 | ], 127 | "execution_count": 0, 128 | "outputs": [] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "id": "5wtFSf25dXjx", 134 | "colab_type": "text" 135 | }, 136 | "source": [ 137 | "# Vanilla VAE\n", 138 | "\n", 139 | "A variational autoencoder consists of two components. The first component is a probabilistic model for observations: \n", 140 | "\\begin{align}\n", 141 | "& p(x, z \\mid \\theta) = p(z) p(x \\mid z, \\theta) \\\\\n", 142 | "& p(z) = \\mathcal N(z \\mid 0, I) \\\\\n", 143 | "& p(x \\mid z, \\theta) = \\prod_{i = 1}^D p_i(z, \\theta)^{x_i} (1 - p_i(z, \\theta))^{1 - x_i}.\n", 144 | "\\end{align}\n", 145 | "The second component is a variational approximation, used to compute the lower bound on marginal likelihood (VAE uses the negative lower bound as a loss function)\n", 146 | "\\begin{equation}\n", 147 | "q(z \\mid x, \\phi) = \\mathcal N(z \\mid \\mu(x, \\phi), \\operatorname{diag}(\\sigma^2(x, \\phi))).\n", 148 | "\\end{equation}\n", 149 | "The lower bound on the log-probability of observing $x$ from a minibatch is\n", 150 | "$$ \\mathcal L(x, \\theta, \\phi) = \\mathbb E_{q(z \\mid x, \\phi)} \\left[ \\log p(x \\mid z, \\theta) + \\log p(z) - \\log q(z \\mid x, \\phi) \\right] $$\n", 151 | "However, it is impossible to compute the expectation. 
The standard practice is to approximate it with the following one-sample Monte-Carlo estimate:\n", 152 | "\\begin{align*}\n", 153 | "\\log p(x \\mid z_0, \\theta) + \\log p(z_0) - \\log q(z_0 \\mid x, \\phi) \\\\\n", 154 | "z_0 = \\mu(x, \\phi) + \\sigma(x, \\phi) \\odot \\varepsilon_0 \\\\\n", 155 | "\\varepsilon_0 \\sim \\mathcal N(0, I)\n", 156 | "\\end{align*}\n", 157 | "*Note that this choice of the Monte-Carlo estimate for expectation is crucial and is typically referred to as the* **reparametrization trick.** For more details see [Auto-encoding Variational Bayes](https://arxiv.org/abs/1312.6114) paper.\n", 158 | "\n", 159 | "Finally, to train the model we average the lower bound values over the minibatch and then maximize the average with gradient ascent:\n", 160 | "$$ \\frac{1}{N} \\sum_{n=1}^N \\left[ \\log p(x_n \\mid z_n, \\theta) + \\log p(z_n) - \\log q(z_n \\mid x_n, \\phi) \\right] \\rightarrow \\max_{\\theta, \\phi} $$\n", 161 | "## Encoder and decoder\n", 162 | "\n", 163 | "$q(z\\mid x, \\phi)$ is usually called encoder and $p(x \\mid z, \\theta)$ is usually called decoder. To parametrize these distributions we introduce two neural networks:\n", 164 | "\n", 165 | "- *enc* takes $x$ as input and returns a $2 \\times d$-dimensional vector to parametrize mean and standard deviation of $q(z \\mid x, \\phi)$\n", 166 | "- *dec* takes a latent representation $z$ and returns the logits of distribution $p(x \\mid z, \\theta)$.\n", 167 | "\n", 168 | "The computational graph has a simple structure of autoencoder. The only difference is that now it uses a stochastic variable $\\varepsilon$:\n", 169 | "\n", 170 | "![vae](https://github.com/bayesgroup/deepbayes-2018/blob/master/day2_vae/vae.png?raw=true)\n", 171 | "\n", 172 | "Below we initialize a couple of simple fully-connected networks to model the two distributions. 
" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "metadata": { 178 | "id": "pakTj8-gc6SZ", 179 | "colab_type": "code", 180 | "colab": {} 181 | }, 182 | "source": [ 183 | "d, nh, D = 32, 100, 28 * 28\n", 184 | "\n", 185 | "enc = nn.Sequential(\n", 186 | " nn.Linear(D, nh),\n", 187 | " nn.ReLU(),\n", 188 | " nn.Linear(nh, nh),\n", 189 | " nn.ReLU(),\n", 190 | " nn.Linear(nh, 2 * d)) # note that the final layer outputs real values\n", 191 | "\n", 192 | "dec = nn.Sequential(\n", 193 | " nn.Linear(d, nh),\n", 194 | " nn.ReLU(),\n", 195 | " nn.Linear(nh, nh),\n", 196 | " nn.ReLU(),\n", 197 | " nn.Linear(nh, D)).to(device) # <-----------------------------------------------\n", 198 | "\n", 199 | "enc = enc.to(device)\n", 200 | "dec = dec.to(device)" 201 | ], 202 | "execution_count": 0, 203 | "outputs": [] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "id": "-xqmyAtbfmhG", 209 | "colab_type": "text" 210 | }, 211 | "source": [ 212 | "## Task 1: VAE Loss function\n", 213 | "\n", 214 | "Implement the loss function for the variational autoencoder" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "metadata": { 220 | "id": "ymwPo9E3erVB", 221 | "colab_type": "code", 222 | "colab": {} 223 | }, 224 | "source": [ 225 | "def loss_vae(x, encoder, decoder):\n", 226 | " \"\"\"\n", 227 | " TODO\n", 228 | " returns\n", 229 | " 1. the avergave value of negative ELBO across the minibatch x\n", 230 | " 2. and the output of the decoder\n", 231 | " \"\"\"\n", 232 | " pass\n", 233 | " #return loss, decoder_output" 234 | ], 235 | "execution_count": 0, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": { 241 | "id": "dIMpMloYfyJT", 242 | "colab_type": "text" 243 | }, 244 | "source": [ 245 | "## Training\n", 246 | "The cell below implements a simple training function that can be used for both models." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "metadata": { 252 | "id": "qLI_soZRfzBM", 253 | "colab_type": "code", 254 | "colab": {} 255 | }, 256 | "source": [ 257 | "from itertools import chain\n", 258 | "\n", 259 | "def train_model(loss, model, batch_size=100, num_epochs=3, learning_rate=1e-3):\n", 260 | " gd = torch.optim.Adam(\n", 261 | " chain(*[x.parameters() for x in model\n", 262 | " if (isinstance(x, nn.Module) or isinstance(x, nn.Parameter))]),\n", 263 | " lr=learning_rate)\n", 264 | " train_losses = []\n", 265 | " test_results = []\n", 266 | " for _ in range(num_epochs):\n", 267 | " for i, (batch, _) in enumerate(train_loader):\n", 268 | " total = len(train_loader)\n", 269 | " gd.zero_grad()\n", 270 | " batch = batch.view(-1, D).to(device)\n", 271 | " loss_value, _ = loss(batch, *model)\n", 272 | " loss_value.backward()\n", 273 | " train_losses.append(loss_value.item())\n", 274 | " if (i + 1) % 10 == 0:\n", 275 | " print('\\rTrain loss:', train_losses[-1],\n", 276 | " 'Batch', i + 1, 'of', total, ' ' * 10, end='', flush=True)\n", 277 | " gd.step()\n", 278 | " test_loss = 0.\n", 279 | " for i, (batch, _) in enumerate(test_loader):\n", 280 | " batch = batch.view(-1, D).to(device)\n", 281 | " batch_loss, _ = loss(batch, *model)\n", 282 | " test_loss += (batch_loss - test_loss) / (i + 1)\n", 283 | " print('\\nTest loss after an epoch: {}'.format(test_loss))" 284 | ], 285 | "execution_count": 0, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "lPjL_TOpf17s", 292 | "colab_type": "code", 293 | "colab": {} 294 | }, 295 | "source": [ 296 | "# my implementation has test loss = -110.59\n", 297 | "train_model(loss_vae, model=[enc, dec], num_epochs=16)" 298 | ], 299 | "execution_count": 0, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": { 305 | "id": "McWlphgdf5ip", 306 | "colab_type": "text" 307 | }, 308 | "source": [ 309 | "## Visualisations\n", 310 | "\n", 311 | "- 
How do reconstruction compare to reconstructions of autoencoder?\n", 312 | "- Interpolations?\n", 313 | "- Is the latent space regularly covered? \n", 314 | "- Is there any dependence between T-SNE encoding and the digit label?" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "id": "pgFFrXPxkNAh", 321 | "colab_type": "code", 322 | "colab": {} 323 | }, 324 | "source": [ 325 | "def sample_vae(dec, n_samples=50):\n", 326 | " with torch.no_grad():\n", 327 | " samples = torch.sigmoid(dec(torch.randn(n_samples, d).to(device)))\n", 328 | " samples = samples.view(n_samples, 28, 28).cpu().numpy()\n", 329 | " return samples\n", 330 | " \n", 331 | "def plot_samples(samples, h=5, w=10):\n", 332 | " fig, axes = plt.subplots(nrows=h,\n", 333 | " ncols=w,\n", 334 | " figsize=(int(1.4 * w), int(1.4 * h)),\n", 335 | " subplot_kw={'xticks': [], 'yticks': []})\n", 336 | " for i, ax in enumerate(axes.flatten()):\n", 337 | " ax.imshow(samples[i], cmap='gray')" 338 | ], 339 | "execution_count": 0, 340 | "outputs": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "metadata": { 345 | "id": "jX7z79vpAUp1", 346 | "colab_type": "code", 347 | "colab": {} 348 | }, 349 | "source": [ 350 | "plot_samples(sample_vae(dec=dec))" 351 | ], 352 | "execution_count": 0, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "TVfp4hfbf66d", 359 | "colab_type": "code", 360 | "colab": {} 361 | }, 362 | "source": [ 363 | "def plot_reconstructions(loss, model):\n", 364 | " with torch.no_grad():\n", 365 | " batch = (test_loader.dataset.data[:25].float() / 255.)\n", 366 | " batch = batch.view(-1, D).to(device)\n", 367 | " _, rec = loss(batch, *model)\n", 368 | " rec = torch.sigmoid(rec)\n", 369 | " rec = rec.view(-1, 28, 28).cpu().numpy()\n", 370 | " batch = batch.view(-1, 28, 28).cpu().numpy()\n", 371 | " \n", 372 | " fig, axes = plt.subplots(nrows=5, ncols=10, figsize=(14, 7),\n", 373 | " subplot_kw={'xticks': [], 'yticks': []})\n", 
374 | " for i in range(25):\n", 375 | " axes[i % 5, 2 * (i // 5)].imshow(batch[i], cmap='gray')\n", 376 | " axes[i % 5, 2 * (i // 5) + 1].imshow(rec[i], cmap='gray')" 377 | ], 378 | "execution_count": 0, 379 | "outputs": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "metadata": { 384 | "id": "Fn1cLF_BgAN2", 385 | "colab_type": "code", 386 | "colab": {} 387 | }, 388 | "source": [ 389 | "plot_reconstructions(loss_vae, [enc, dec])" 390 | ], 391 | "execution_count": 0, 392 | "outputs": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "metadata": { 397 | "id": "-Ye1dch0gCmp", 398 | "colab_type": "code", 399 | "colab": {} 400 | }, 401 | "source": [ 402 | "def plot_interpolations(encoder, decoder):\n", 403 | " with torch.no_grad():\n", 404 | " batch = (test_loader.dataset.data[:10].float() / 255.)\n", 405 | " batch = batch.view(-1, D).to(device)\n", 406 | " batch = encoder(batch)\n", 407 | " z_0 = batch[:5, :d].view(5, 1, d)\n", 408 | " z_1 = batch[5:, :d].view(5, 1, d)\n", 409 | " \n", 410 | " alpha = torch.linspace(0., 1., 10).to(device)\n", 411 | " alpha = alpha.view(1, 10, 1)\n", 412 | " \n", 413 | " interpolations_z = (z_0 * alpha + z_1 * (1 - alpha))\n", 414 | " interpolations_z = interpolations_z.view(50, d)\n", 415 | " interpolations_x = torch.sigmoid(decoder(interpolations_z))\n", 416 | " interpolations_x = interpolations_x.view(5, 10, 28, 28).cpu().numpy()\n", 417 | " \n", 418 | " fig, axes = plt.subplots(nrows=5, ncols=10, figsize=(14, 7),\n", 419 | " subplot_kw={'xticks': [], 'yticks': []})\n", 420 | " for i in range(50):\n", 421 | " axes[i // 10, i % 10].imshow(interpolations_x[i // 10, i % 10], cmap='gray')" 422 | ], 423 | "execution_count": 0, 424 | "outputs": [] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "metadata": { 429 | "id": "vi5Kw-KOgFky", 430 | "colab_type": "code", 431 | "colab": {} 432 | }, 433 | "source": [ 434 | "plot_interpolations(enc, dec)" 435 | ], 436 | "execution_count": 0, 437 | "outputs": [] 438 | }, 439 | { 440 | 
"cell_type": "code", 441 | "metadata": { 442 | "id": "tR-VF5QdgHSP", 443 | "colab_type": "code", 444 | "colab": {} 445 | }, 446 | "source": [ 447 | "def plot_tsne(objects, labels):\n", 448 | " from sklearn.manifold import TSNE\n", 449 | " embeddings = TSNE(n_components=2).fit_transform(objects)\n", 450 | " plt.figure(figsize=(8, 8))\n", 451 | " for k in range(10):\n", 452 | " embeddings_for_k = embeddings[labels == k]\n", 453 | " plt.scatter(embeddings_for_k[:, 0], embeddings_for_k[:, 1],\n", 454 | " label='{}'.format(k))\n", 455 | " plt.legend()" 456 | ], 457 | "execution_count": 0, 458 | "outputs": [] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "metadata": { 463 | "id": "6vFlGlKogJ4i", 464 | "colab_type": "code", 465 | "colab": {} 466 | }, 467 | "source": [ 468 | "with torch.no_grad():\n", 469 | " batch = (test_loader.dataset.data[:1000].float() / 255.)\n", 470 | " batch = batch.view(-1, D).to(device)\n", 471 | " \n", 472 | " latent_variables = enc(batch)[:, :d]\n", 473 | " latent_variables = latent_variables.cpu().numpy()\n", 474 | " labels = test_loader.dataset.targets[:1000].numpy()\n", 475 | " \n", 476 | "plot_tsne(latent_variables, labels)" 477 | ], 478 | "execution_count": 0, 479 | "outputs": [] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": { 484 | "id": "wI6yAPTrgMVw", 485 | "colab_type": "text" 486 | }, 487 | "source": [ 488 | "# [DRAW](http://proceedings.mlr.press/v37/gregor15.pdf)\n", 489 | "\n", 490 | "To illustrate the flexibility of VAE framework, this section considers Deep Recurrent Attentive Writer Model. The goal of the model was to improve image generation by allowing the model to generate images step-by-step. 
Instead of encoding an image into a fixed-size latent representation $z$, the model uses a recurrent neural network to generate a sequence of representations $z_1, \\dots, z_T$ that capture the generation steps.\n", 491 | "\n", 492 | "From the probabilistic viewpoint, the model puts a standard Gaussian prior $p(z_t) = \\mathcal N(0, I)$ on each $z_t$ and uses a Bernoulli distribution to model $p(x | z, \\theta)$ just as in the standard VAE. But now the encoder uses a flexible autoregressive model\n", 493 | "\\begin{align}\n", 494 | "q(z_1, \\dots, z_T \\mid x, \\phi) = \\prod_{t=1}^T q(z_t \\mid z_1, \\dots, z_{t-1}, x, \\phi).\n", 495 | "\\end{align}\n", 496 | "Equations (3)-(8) from the [paper](http://proceedings.mlr.press/v37/gregor15.pdf) describe the details of the architecture. At each timestep $t$ the model stores \"canvas\" $c_t$, a D-dimensional vector that sequentially approximates the input sample $x$. \n", 497 | "\n", 498 | "**First**, the recurrent *encoder* computes the approximation error $$\\hat{x}_t = x - \\sigma(c_{t -1})$$ and computes the next hidden state $h^{enc}_t$ based on the approximation error $\\hat{x}_t$, the input sample $x$ and the hidden state of the decoder network:\n", 499 | "\\begin{align}\n", 500 | "r_t &= \\textit{read}(x, \\hat{x}_t, h_{t - 1}^{dec}) \\\\\n", 501 | "h_t^{enc} & = RNN^{enc} (h^{enc}_{t - 1}, [r_t, h_{t - 1}^{dec}]).\n", 502 | "\\end{align}\n", 503 | "**Second**, similarly to VAE, the hidden state $h_t^{enc}$ defines mean and variance of a fully-factorised Gaussian distribution\n", 504 | "$$z_t \\sim q(z_t \\mid h_t^{enc}).$$\n", 505 | "**Third**, the *decoder* RNN updates the canvas:\n", 506 | "\\begin{align}\n", 507 | "h^{dec}_t &= RNN^{dec}(h_{t - 1}^{dec}, z_t) \\\\\n", 508 | "c_t &= c_{t - 1} + \\textit{write}(h_t^{dec}).\n", 509 | "\\end{align}\n", 510 | "\n", 511 | "After making $T$ timesteps the model computes $p(x | z)$ (i.e.
reconstruction error) using the canvas $c_T$ as the logits of a Bernoulli distribution.\n", 512 | "\n", 513 | "The $\\textit{read}$ and $\\textit{write}$ modules in the simplest instantiation are a concatenation of two inputs and a linear layer\n", 514 | "\\begin{align}\n", 515 | "\\textit{read}(x, \\hat{x}_t, h^{dec}_{t-1}) = [x, \\hat{x}_t] \\\\\n", 516 | "\\textit{write}(h_t^{dec}) = W(h_t^{dec}),\n", 517 | "\\end{align}\n", 518 | "\n", 519 | "although they can be replaced with an attentive neural network to improve the model performance.\n", 520 | "\n", 521 | "Below we use the simplest instantiation with GRU cells for encoder and decoder." 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "metadata": { 527 | "id": "4shOGt6yg3Tg", 528 | "colab_type": "code", 529 | "colab": {} 530 | }, 531 | "source": [ 532 | "T = 16\n", 533 | "d, nh, D = 32, 100, 28 * 28\n", 534 | "\n", 535 | "read = lambda x, y, z: torch.cat([x, y], dim=1)\n", 536 | "write = nn.Linear(nh, D)\n", 537 | "\n", 538 | "enc_rnn = nn.GRUCell(2 * D + nh, 2 * d)\n", 539 | "dec_rnn = nn.GRUCell(d, nh)\n", 540 | "\n", 541 | "# initial hidden states and the initial approximation to a digit\n", 542 | "h_enc_init = nn.Parameter(torch.zeros(2 * d))\n", 543 | "h_dec_init = nn.Parameter(torch.zeros(nh))\n", 544 | "canvas_init = nn.Parameter(torch.zeros(D))\n", 545 | "\n", 546 | "write = write.to(device)\n", 547 | "enc_rnn = enc_rnn.to(device)\n", 548 | "dec_rnn = dec_rnn.to(device)\n", 549 | "h_enc_init = h_enc_init.to(device)\n", 550 | "h_dec_init = h_dec_init.to(device)\n", 551 | "canvas_init = canvas_init.to(device)" 552 | ], 553 | "execution_count": 0, 554 | "outputs": [] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": { 559 | "id": "IZQ-tBF8pb1C", 560 | "colab_type": "text" 561 | }, 562 | "source": [ 563 | "## Task 2: DRAW Loss function\n", 564 | "\n", 565 | "Implement the loss function for DRAW" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "metadata": { 571 |
"id": "mVagi0SVjRPb", 572 | "colab_type": "code", 573 | "colab": {} 574 | }, 575 | "source": [ 576 | "def loss_draw(x, read, enc_rnn, dec_rnn, write, T, h_enc_init, h_dec_init,\n", 577 | " canvas_init):\n", 578 | " \"\"\"\n", 579 | " TODO\n", 580 | " returns\n", 581 | " 1. the avergave value of negative ELBO across the minibatch x\n", 582 | " 2. and the canvases for each step of computations\n", 583 | " \"\"\"\n", 584 | " # batch_size = x.size(0)\n", 585 | " # canvases = [0] * (T + 1)\n", 586 | " # canvases[0] = canvas_init.view(1, -1).repeat(batch_size, 1)\n", 587 | " # h_enc = h_enc_init.view(1, -1).repeat(batch_size, 1)\n", 588 | " # h_dec = h_dec_init.view(1, -1).repeat(batch_size, 1)\n", 589 | " pass\n", 590 | " # return loss_value, canvases" 591 | ], 592 | "execution_count": 0, 593 | "outputs": [] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": { 598 | "id": "fCrTOR6GsDci", 599 | "colab_type": "text" 600 | }, 601 | "source": [ 602 | "## Training" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "metadata": { 608 | "id": "yk7zy3nuLjI9", 609 | "colab_type": "code", 610 | "colab": {} 611 | }, 612 | "source": [ 613 | "train_model(loss_draw, model=[read, enc_rnn, dec_rnn, write, T, h_enc_init,\n", 614 | " h_dec_init, canvas_init], num_epochs=16)" 615 | ], 616 | "execution_count": 0, 617 | "outputs": [] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": { 622 | "id": "pKfW76oYqtjm", 623 | "colab_type": "text" 624 | }, 625 | "source": [ 626 | "## Visualisation\n", 627 | "The following two snippets visualize model samples and the generation procedure. \n", 628 | "- Did DRAW outperform VAE in terms of loss? \n", 629 | "- Is there any noticeable difference in the quality of samples?" 
630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "metadata": { 635 | "id": "5rJmafaUv0dK", 636 | "colab_type": "code", 637 | "colab": {} 638 | }, 639 | "source": [ 640 | "def sample_draw(dec_rnn, write, T, n_samples):\n", 641 | " with torch.no_grad():\n", 642 | " batch_size = n_samples\n", 643 | " canvases = [0] * (T + 1)\n", 644 | " canvases[0] = torch.zeros(batch_size, D)\n", 645 | " h_dec = torch.zeros(batch_size, nh).to(device) # the initial state\n", 646 | " \n", 647 | " for t in range(T):\n", 648 | " z_t = torch.randn(n_samples, d).to(device)\n", 649 | " h_dec = dec_rnn(z_t, h_dec)\n", 650 | " canvases[t + 1] = canvases[t] + write(h_dec).cpu()\n", 651 | " \n", 652 | " canvases = torch.stack(canvases, 0)[1:]\n", 653 | " canvases = torch.sigmoid(canvases)\n", 654 | " canvases = canvases.view(T, n_samples, 28, 28)\n", 655 | " \n", 656 | "\n", 657 | " return canvases" 658 | ], 659 | "execution_count": 0, 660 | "outputs": [] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "metadata": { 665 | "id": "BPRXHL6oQF6u", 666 | "colab_type": "code", 667 | "colab": {} 668 | }, 669 | "source": [ 670 | "# plots random samples\n", 671 | "samples = sample_draw(dec_rnn, write, T, n_samples=50)[-1]\n", 672 | "plot_samples(samples)" 673 | ], 674 | "execution_count": 0, 675 | "outputs": [] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "metadata": { 680 | "id": "lXcVN_fgL8mI", 681 | "colab_type": "code", 682 | "colab": {} 683 | }, 684 | "source": [ 685 | "# plots the steps of image generation\n", 686 | "samples = sample_draw(dec_rnn, write, T, n_samples=5)\n", 687 | "samples = samples.permute(1, 0, 2, 3).contiguous().view(-1, 28, 28)\n", 688 | "plot_samples(samples, h=5, w=16)" 689 | ], 690 | "execution_count": 0, 691 | "outputs": [] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": { 696 | "id": "iljOmVUrUNvj", 697 | "colab_type": "text" 698 | }, 699 | "source": [ 700 | "# Optional Task\n", 701 | "If you have already completed the above tasks, try 
proposing a network modification to improve its performance on the test set." 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "metadata": { 707 | "id": "xCaW7ajlmyFR", 708 | "colab_type": "code", 709 | "colab": {} 710 | }, 711 | "source": [ 712 | "" 713 | ], 714 | "execution_count": 0, 715 | "outputs": [] 716 | } 717 | ] 718 | } -------------------------------------------------------------------------------- /seminars/day2/2. Gumbel for SS-VAE.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "deep_bayes_SS_VAE_blank.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "laSwDfOZx3RT", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "*(to use GPU in colab go to Runtime -> Change Runtime Type and change the hardware accelerator)*" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "HVGY58iGt0_f", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "# VAE with Discrete Variables For Semi-Supervised Learning\n", 36 | "\n", 37 | "This practical session is inspired by [\"Semi-supervised Learning with\n", 38 | "Deep Generative Models\"](https://arxiv.org/pdf/1406.5298.pdf). We will also use this model to illustrate the Gumbel-Softmax trick." 
39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "metadata": { 44 | "id": "sVmzR0wRtbD1", 45 | "colab_type": "code", 46 | "outputId": "1953e8ff-7c44-4165-b9c8-09075fa1dc79", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/", 49 | "height": 51 50 | } 51 | }, 52 | "source": [ 53 | "from torchvision.datasets import MNIST\n", 54 | "from torch.utils.data import TensorDataset, DataLoader\n", 55 | "import torch\n", 56 | "from torch import nn\n", 57 | "from torch import optim\n", 58 | "import numpy as np\n", 59 | "import matplotlib.pylab as plt\n", 60 | "\n", 61 | "from torch.distributions import Normal, Bernoulli, Independent\n", 62 | "\n", 63 | "torch.manual_seed(0)\n", 64 | "np.random.seed(0)\n", 65 | "\n", 66 | "if torch.cuda.is_available():\n", 67 | " device = torch.device('cuda:0')\n", 68 | "else:\n", 69 | " device = torch.device('cpu')\n", 70 | "\n", 71 | "print('Using torch version {}'.format(torch.__version__))\n", 72 | "print('Using {} device'.format(device))" 73 | ], 74 | "execution_count": 0, 75 | "outputs": [ 76 | { 77 | "output_type": "stream", 78 | "text": [ 79 | "Using torch version 1.1.0\n", 80 | "Using cuda:0 device\n" 81 | ], 82 | "name": "stdout" 83 | } 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "id": "m67VBfpqthA2", 90 | "colab_type": "text" 91 | }, 92 | "source": [ 93 | "For the semi-supervised learning task we remove 95% of labels from the training set. In the modified training set the observed labels have a standard one-hot encoding and the unobserved labels are represented by all-zero ten dimensional vectors." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "xrKF-qMPthXx", 100 | "colab_type": "code", 101 | "colab": {} 102 | }, 103 | "source": [ 104 | "data = MNIST(root='.', download=True, train=True)\n", 105 | "new_train_labels = torch.zeros(60000, 10)\n", 106 | "observed = np.random.choice(60000, 3000)\n", 107 | "new_train_labels[observed] = torch.eye(10)[data.targets][observed]\n", 108 | "train_data = TensorDataset(data.data.view(-1, 28 * 28).float() / 255,\n", 109 | " new_train_labels)\n", 110 | "\n", 111 | "data = MNIST(root='.', download=True, train=False)\n", 112 | "test_data = TensorDataset(data.data.view(-1, 28 * 28).float() / 255,\n", 113 | " data.targets)" 114 | ], 115 | "execution_count": 0, 116 | "outputs": [] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "ej9Z1YEZt-Rw", 122 | "colab_type": "text" 123 | }, 124 | "source": [ 125 | "## The probabilistic model\n", 126 | "\n", 127 | "In the semi-supervised setting, the generative model is a little more complicated. In particular, it incorporates a new variable $y$ that represents the digits class.\n", 128 | "\n", 129 | "\\begin{align*}\n", 130 | "& p(x, y, z) = p(x \\mid y, z) p(z) p(y) \\\\\n", 131 | "& p(y) = Cat(y \\mid \\pi_0), \\pi_0 = (1/10, \\dots, 1/10) \\\\\n", 132 | "& p(z) = \\mathcal N(z \\mid 0, I) \\\\\n", 133 | "& p(x \\mid y, z) = \\prod_{i=1}^D p_i(y, z)^{x_i} (1 - p_i(y, z))^{1 - x_i}\n", 134 | "\\end{align*}\n", 135 | "\n", 136 | "Typically, whenever we train a model with partial observations, we interpret unobserved variables as latent variables and marginalize over them. 
In this case, the loss function splits into two terms: one for observed variables (we denote the set of indices of observed labels $P$), another for unobserved.\n", 137 | "\n", 138 | "\\begin{equation}\n", 139 | "L(X, y) = \\sum_{i \\notin P} \\log p(x_i) + \\sum_{i \\in P} \\log p(x_i, y_i)\n", 140 | "\\end{equation}\n", 141 | "\n", 142 | "Again, we can't compute the exact values of marginal likelihoods and resort to a variational lower bound on the likelihood. To compute lower bounds we define the following variational approximation:\n", 143 | "\n", 144 | "\\begin{align*}\n", 145 | "& q(y, z \\mid x) = q(y \\mid x) q(z \\mid y, x)\\\\\n", 146 | "& \\\\\n", 147 | "& q(y \\mid x) = Cat(y \\mid \\pi(x))\\\\\n", 148 | "& q(z \\mid y, x) = \\mathcal N(z \\mid \\mu_\\phi(x, y), \\operatorname{diag}\\sigma^2_\\phi(y, x))\n", 149 | "\\end{align*}\n", 150 | "\n", 151 | "### ELBO for observed variables\n", 152 | "\n", 153 | "Similar to VAE:\n", 154 | "\n", 155 | "\\begin{equation}\n", 156 | "\\log p(x, y) = \\log \\mathbb E_{p(z)} p(x, y \\mid z) \\geq \\mathbb E_{q(z \\mid y, x)} \\log \\frac{p(x, y \\mid z) p(z)}{q(z \\mid y, x)}\n", 157 | "\\end{equation}\n", 158 | "\n", 159 | "### ELBO for unobserved variables\n", 160 | "\n", 161 | "\\begin{equation}\n", 162 | "\\log p(x) = \\log \\mathbb E_{p(y)} \\mathbb E_{p(z \\mid y)} p(x\\mid z, y)\\geq \\mathbb E_{q(y \\mid x)} \\mathbb E_{q(z \\mid y, x)} \\log \\frac{p(x, y \\mid z) p(z)}{q(z \\mid y, x) q(y \\mid x)}\n", 163 | "\\end{equation}\n", 164 | "\n", 165 | "### The final objective\n", 166 | "\n", 167 | "\\begin{equation}\n", 168 | "\\mathcal L(X, y) = \\sum_{i \\in P} \\mathbb E_{q(z_i \\mid y_i, x_i)} \\log \\frac{p(x_i, y_i \\mid z_i) p(z_i)}{q(z_i \\mid y_i, x_i)} + \\sum_{i \\notin P} \\mathbb E_{q(y_i \\mid x_i)} \\mathbb E_{q(z_i \\mid y_i, x_i)} \\log \\frac{p(x_i, y_i \\mid z_i) p(z_i)}{q(z_i \\mid y_i, x_i) q(y_i \\mid x_i)}\n", 169 | "\\end{equation}\n", 170 | "\n", 171 | "Again, we will use reparametrized
Monte-Carlo estimates to approximate the expectation over $z$. To approximate the expectation over discrete variables $y$ we will use the Gumbel-Softmax trick.\n", 172 | "\n", 173 | "## Important practical aspect\n", 174 | "\n", 175 | "ELBO maximization does not lead to any semantics in latent variables $y$. \n", 176 | "\n", 177 | "We are going to restrict variational approximations $q(y \\mid x)$ to the ones that correctly classify observation $x$ on fully-observed variables $(x_i, y_i)$. As in the original paper, we will add a cross-entropy regularizer to the objective with weight $\\alpha$:\n", 178 | "\n", 179 | "\\begin{equation}\n", 180 | "\\frac{1}{|P|}\\sum_{i \\in P} y_i^T \\log q(y \\mid x_i).\n", 181 | "\\end{equation}" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "id": "YvS1V7eRuVFs", 188 | "colab_type": "text" 189 | }, 190 | "source": [ 191 | "## RelaxedOneHotCategorical\n", 192 | "\n", 193 | "In the probabilistic model defined above we are going to replace categorical prior $p(y)$ and categorical variational approximation $q(y | x)$ with Gumbel-Softmax distribution. The distribution class is implemented in **torch.distributions.relaxed_categorical.RelaxedOneHotCategorical**.\n", 194 | "\n", 195 | "For more details see [Categorical Reparameterization with Gumbel-Softmax](https://arxiv.org/abs/1611.01144).\n", 196 | "\n", 197 | "### An illustration for Gumbel-Softmax\n", 198 | "\n", 199 | "- Temperature allows for smooth interpolation between one-hot categorical distribution with low temperature and a $(1/K, \\dots, 1/K)$ vector with high temperatures\n", 200 | "- The exact computation of $\\mathbb E_{q(y|x)} f(y)$ requires computation of $f(y)$ for ten possible labels $y=0, \\dots, 9$. On the other hand, with Gumbel-Softmax relaxation only one sample $y \\sim q(y | x)$ is enough. Therefore, Gumbel-Softmax gives almost a ten-fold training speed increase."
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "zreLi7xnuVby", 207 | "colab_type": "code", 208 | "colab": {} 209 | }, 210 | "source": [ 211 | "import matplotlib.cm as cm\n", 212 | "from torch.distributions.relaxed_categorical import RelaxedOneHotCategorical\n", 213 | "\n", 214 | "n_classes = 4\n", 215 | "logits = torch.randn(1, n_classes)\n", 216 | "print('Probs: ', torch.nn.functional.softmax(logits, 1).squeeze().numpy())\n", 217 | "temperatures = [0.1, 0.5, 1., 5., 10.]\n", 218 | "M = 128 # number of samples used to approximate distribution mean\n", 219 | "\n", 220 | "fig, axes = plt.subplots(nrows=2, ncols=len(temperatures), figsize=(14, 6),\n", 221 | " subplot_kw={'xticks': range(n_classes),\n", 222 | " 'yticks': [0., 0.5, 1.]})\n", 223 | "axes[0, 0].set_ylabel('Expectation')\n", 224 | "axes[1, 0].set_ylabel('Gumbel Softmax Sample')\n", 225 | "\n", 226 | "for n, t in enumerate(temperatures):\n", 227 | " dist = RelaxedOneHotCategorical(t, logits=logits)\n", 228 | " mean = torch.zeros_like(logits)\n", 229 | " for _ in range(M):\n", 230 | " mean += dist.sample() / M\n", 231 | " sample = dist.sample()\n", 232 | " \n", 233 | " axes[0, n].set_title('T = {}'.format(t))\n", 234 | " axes[0, n].set_ylim((0, 1.1))\n", 235 | " axes[1, n].set_ylim((0, 1.1))\n", 236 | " axes[0, n].bar(np.arange(n_classes), mean.numpy().reshape(n_classes),\n", 237 | " color=cm.plasma(0.75 * t / max(temperatures)))\n", 238 | " axes[1, n].bar(np.arange(n_classes), sample.numpy().reshape(n_classes),\n", 239 | " color=cm.plasma(0.75 * t / max(temperatures)))" 240 | ], 241 | "execution_count": 0, 242 | "outputs": [] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": { 247 | "id": "QdgeSVo0uh8M", 248 | "colab_type": "text" 249 | }, 250 | "source": [ 251 | "# SS-VAE implementation\n", 252 | "\n", 253 | "The computational graph for observed labels has the following structure:\n", 254 | "\n", 255 | "![computational graph ss vae 
xy](https://github.com/bayesgroup/deepbayes-2018/blob/master/day2_vae/ss_vae_xy.png?raw=true)\n", 256 | "\n", 257 | "The computational graph for unobserved lables has the following structure:\n", 258 | "\n", 259 | "![computational graph ss vae xy](https://github.com/bayesgroup/deepbayes-2018/blob/master/day2_vae/ss_vae_x.png?raw=true)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "metadata": { 265 | "id": "qnXphotoubKc", 266 | "colab_type": "code", 267 | "colab": {} 268 | }, 269 | "source": [ 270 | "n_classes, d, nh, D = 10, 32, 500, 28 * 28\n", 271 | "default_T = torch.tensor(0.6, device=device)\n", 272 | "\n", 273 | "yz_dec = nn.Sequential(\n", 274 | " nn.Linear(n_classes + d, nh),\n", 275 | " nn.ReLU(),\n", 276 | " nn.Linear(nh, D))\n", 277 | "\n", 278 | "y_enc = nn.Sequential(\n", 279 | " nn.Linear(D, nh),\n", 280 | " nn.ReLU(),\n", 281 | " nn.Linear(nh, n_classes))\n", 282 | "\n", 283 | "z_enc = nn.Sequential(\n", 284 | " nn.Linear(n_classes + D, nh),\n", 285 | " nn.ReLU(),\n", 286 | " nn.Linear(nh, 2 * d)\n", 287 | " )\n", 288 | "\n", 289 | "yz_dec = yz_dec.to(device)\n", 290 | "y_enc = y_enc.to(device)\n", 291 | "z_enc = z_enc.to(device)" 292 | ], 293 | "execution_count": 0, 294 | "outputs": [] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "id": "wxATI2R-unLl", 300 | "colab_type": "text" 301 | }, 302 | "source": [ 303 | "# The task\n", 304 | "\n", 305 | "Implement the loss function for the semi-supervised variational autoencoder" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "metadata": { 311 | "id": "kDY8iEDoumTz", 312 | "colab_type": "code", 313 | "colab": {} 314 | }, 315 | "source": [ 316 | "def loss(x, y, y_encoder, z_encoder, decoder, T=default_T, alpha=32.):#, verbose=False):\n", 317 | " #TODO\n", 318 | " \"\"\"\n", 319 | " NOTE: \n", 320 | " hyperparameter alpha was tuned for the implementation that computed\n", 321 | " the mean of elbo terms and sum of cross-entropy terms over the observed\n", 322 
| " datapoints in the batch \n", 323 | " \n", 324 | " In the modified training set the observed labels have a standard one-hot \n", 325 | " encoding and the unobserved labels are represented by all-zero ten \n", 326 | " dimensional vectors. \n", 327 | " To compute the mask for observed labels you can compute \n", 328 | " y_is_observed = y.sum(1, keepdim=True) \n", 329 | " \n", 330 | " The function has to\n", 331 | " 1. sample y from q(y | x)\n", 332 | " 2. sample z from q(z | x, y)\n", 333 | " 3. compute the evidence lower bound for obervsed and unobserved variables\n", 334 | " 4. compute the cross_entropy regularizer with weight alpha for object with\n", 335 | " observed labels\n", 336 | " 5. return the sum of two losses\n", 337 | " \"\"\"\n", 338 | " pass\n", 339 | " #return loss + alpha * loss_supervised" 340 | ], 341 | "execution_count": 0, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "h3U3fEiAuqQM", 348 | "colab_type": "code", 349 | "colab": {} 350 | }, 351 | "source": [ 352 | "from itertools import chain\n", 353 | "\n", 354 | "def train_model(y_encoder, z_encoder, decoder, batch_size=100, num_epochs=3, learning_rate=1e-3):\n", 355 | " gd = optim.Adam(chain(y_encoder.parameters(),\n", 356 | " z_encoder.parameters(),\n", 357 | " decoder.parameters()), lr=learning_rate)\n", 358 | " dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)\n", 359 | " test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)\n", 360 | " train_losses = []\n", 361 | " for _ in range(num_epochs):\n", 362 | " for i, (x, y) in enumerate(dataloader):\n", 363 | " total = len(dataloader)\n", 364 | " x = x.to(device)\n", 365 | " y = y.to(device)\n", 366 | " loss_value = loss(x, y, y_encoder, z_encoder, decoder)\n", 367 | " (-loss_value).backward()\n", 368 | " train_losses.append(loss_value.cpu().item())\n", 369 | " if (i + 1) % 10 == 0:\n", 370 | " print('\\rTrain loss:', train_losses[-1],\n", 371 | " 
'Batch', i + 1, 'of', total, ' ' * 10, end='', flush=True)\n", 372 | " gd.step()\n", 373 | " gd.zero_grad()\n", 374 | " loss_value = 0.\n", 375 | " accuracy = 0.\n", 376 | " for i, (x, y) in enumerate(test_dataloader):\n", 377 | " total = len(test_dataloader)\n", 378 | " x = x.to(device)\n", 379 | " y = y.to(device)\n", 380 | " unobserved_y = torch.zeros((y.shape[0], 10)).to(device)\n", 381 | " loss_value += loss(x, unobserved_y, y_encoder, z_encoder, decoder).item()\n", 382 | " accuracy += (torch.argmax(y_encoder(x), 1) == y).double().mean().item()\n", 383 | " print('Test loss: {}\\t Test accuracy: {}'.format(loss_value / total, accuracy / total))" 384 | ], 385 | "execution_count": 0, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "EwAL5ZtXutfp", 392 | "colab_type": "code", 393 | "colab": {} 394 | }, 395 | "source": [ 396 | "# my implementation omitted log p(y) for observed variables. it has\n", 397 | "# test loss -106.79\n", 398 | "# test accuracy 0.95\n", 399 | "train_model(y_enc, z_enc, yz_dec, num_epochs=16)" 400 | ], 401 | "execution_count": 0, 402 | "outputs": [] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "id": "4ifxO95Puxt_", 408 | "colab_type": "text" 409 | }, 410 | "source": [ 411 | "## Visualizations\n", 412 | "\n", 413 | "Generate 10 images for each label" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "metadata": { 419 | "id": "MEA9Z6x-u067", 420 | "colab_type": "code", 421 | "colab": {} 422 | }, 423 | "source": [ 424 | "def plot_samples_with_fixed_classes(dec):\n", 425 | " decoder_input = torch.cat((torch.eye(10).repeat(10, 1), torch.randn(100, d)), 1)\n", 426 | " decoder_input = decoder_input.to(device)\n", 427 | " images = torch.sigmoid(dec(decoder_input)).view(100, 28, 28).detach().cpu().numpy()\n", 428 | " \n", 429 | " fig, axes = plt.subplots(nrows=10, ncols=10, figsize=(14, 14),\n", 430 | " subplot_kw={'xticks': [], 'yticks': []})\n", 431 | " for i in 
range(10):\n", 432 | " axes[0, i].set_title('{}'.format(i))\n", 433 | " \n", 434 | " for i in range(100):\n", 435 | " axes[int(i / 10), i % 10].imshow(images[i], cmap='gray')\n", 436 | " \n", 437 | "plot_samples_with_fixed_classes(yz_dec)" 438 | ], 439 | "execution_count": 0, 440 | "outputs": [] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "id": "B_NdhbjEu3lL", 446 | "colab_type": "text" 447 | }, 448 | "source": [ 449 | "### \"Style-transfer\"\n", 450 | "\n", 451 | "Here we infer latent representation $z$ of a given digit $x$ and then generate from $p(x | z, y)$ for different $y$." 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "metadata": { 457 | "id": "5B8XBBp9u4Yn", 458 | "colab_type": "code", 459 | "colab": {} 460 | }, 461 | "source": [ 462 | "def plot_all_digits_with_fixed_style(z_enc, y_enc, dec):\n", 463 | " indices = np.random.choice(10000, 10)\n", 464 | " x, y = test_data[indices][0], torch.eye(10)[test_data[indices][1]]\n", 465 | " x = x.to(device)\n", 466 | " y = y.to(device)\n", 467 | " z = z_enc(torch.cat((x, y), 1))[:, :d]\n", 468 | "\n", 469 | " # generate digits\n", 470 | " images = []\n", 471 | " for i in range(10):\n", 472 | " digit_encodings = torch.eye(10)[i, :].expand(10, 10).to(device)\n", 473 | " images.append(torch.sigmoid(dec(torch.cat((digit_encodings, z), 1)).view(10, 28, 28)).detach().cpu().numpy())\n", 474 | " \n", 475 | " x = x.view(10, 28, 28).detach().cpu().numpy()\n", 476 | "\n", 477 | " # plot\n", 478 | " fig, axes = plt.subplots(nrows=10, ncols=11, figsize=(14, 14),\n", 479 | " subplot_kw={'xticks': [], 'yticks': []})\n", 480 | " \n", 481 | " axes[0, 0].set_title('example')\n", 482 | " for i in range(10):\n", 483 | " axes[0, i + 1].set_title('{}'.format(i))\n", 484 | " axes[i, 0].imshow(x[i], cmap='gray')\n", 485 | " for j in range(10):\n", 486 | " axes[i, j + 1].imshow(images[j][i], cmap='gray')\n", 487 | " \n", 488 | "plot_all_digits_with_fixed_style(z_enc, y_enc, yz_dec)" 489 | ], 490 | 
"execution_count": 0, 491 | "outputs": [] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": { 496 | "id": "fFQXhqT3u9Kf", 497 | "colab_type": "text" 498 | }, 499 | "source": [ 500 | "### T-SNE for SS-VAE\n", 501 | "\n", 502 | "Do you notice any difference from T-SNE for vanilla VAE? How can you interpret the results?" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "metadata": { 508 | "id": "tJaqQ9KyvAVo", 509 | "colab_type": "code", 510 | "colab": {} 511 | }, 512 | "source": [ 513 | "def plot_tsne(objects, labels):\n", 514 | " from sklearn.manifold import TSNE\n", 515 | " embeddings = TSNE(n_components=2).fit_transform(objects)\n", 516 | " plt.figure(figsize=(8, 8))\n", 517 | " for k in range(10):\n", 518 | " embeddings_for_k = embeddings[labels == k]\n", 519 | " plt.scatter(embeddings_for_k[:, 0], embeddings_for_k[:, 1],\n", 520 | " label='{}'.format(k))\n", 521 | " plt.legend()" 522 | ], 523 | "execution_count": 0, 524 | "outputs": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "metadata": { 529 | "id": "UxmfeI34vCKk", 530 | "colab_type": "code", 531 | "colab": {} 532 | }, 533 | "source": [ 534 | "# T-SNE for q(z | x, y) mean\n", 535 | "labels = test_data[:1000][1].numpy()\n", 536 | "encoder_input = torch.cat((test_data[:1000][0],\n", 537 | " torch.eye(10)[labels]), 1).to(device)\n", 538 | "latent_variables = z_enc(encoder_input)[:, :d]\n", 539 | "latent_variables = latent_variables.detach().cpu().numpy()\n", 540 | "\n", 541 | "plot_tsne(latent_variables, labels)" 542 | ], 543 | "execution_count": 0, 544 | "outputs": [] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "metadata": { 549 | "id": "eVpp-hhwvHvb", 550 | "colab_type": "code", 551 | "colab": {} 552 | }, 553 | "source": [ 554 | "# T-SNE for q(y | x) logits\n", 555 | "labels = test_data[:1000][1].numpy()\n", 556 | "latent_variables = y_enc(test_data[:1000][0].to(device))\n", 557 | "latent_variables = latent_variables.detach().cpu().numpy()\n", 558 | "\n", 559 | 
"plot_tsne(latent_variables, labels)" 560 | ], 561 | "execution_count": 0, 562 | "outputs": [] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "metadata": { 567 | "id": "Jm_7HjqM3VXT", 568 | "colab_type": "code", 569 | "colab": {} 570 | }, 571 | "source": [ 572 | "" 573 | ], 574 | "execution_count": 0, 575 | "outputs": [] 576 | } 577 | ] 578 | } -------------------------------------------------------------------------------- /seminars/day2/readme.md: -------------------------------------------------------------------------------- 1 | # Seminars day 2 2 | 3 | Colab versions (run code on GPU in browser): 4 | * [1. VAE + DRAW](https://colab.research.google.com/drive/1fH_m9sBZK99S4gBbDs_S6cKIUrPDFh5T) 5 | * [1. VAE + DRAW SOLUTION](https://colab.research.google.com/drive/1xQvUA_ha_hezMPB3HFFE6R32qZFIAHic) 6 | * [2. Gumbel for SS-VAE](https://colab.research.google.com/drive/1h4x4h4bgM1QMocLWlHTRn9pgOrJmlnEC) 7 | * [2. Gumbel for SS-VAE SOLUTION](https://colab.research.google.com/drive/16SjN8FwplB1L_cr4jC0HbgvunvyMZKXc) 8 | 9 | -------------------------------------------------------------------------------- /seminars/day3/gan/GAN_sem.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "GAN_sem.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "cYVeeM-5rFUm", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "The visualization used for this seminar is based on Alexandr Verinov's code. 
" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "2rMQe7eYrFUo", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "# Generative models" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "id": "MS8Gn4MTrFUs", 42 | "colab_type": "text" 43 | }, 44 | "source": [ 45 | "In this seminar we will try several criterions for learning an implicit model. For the first part almost everything is written for you, and you only need to implement the objective for the game and play around with the model. \n", 46 | "\n", 47 | "**0)** Read the code\n", 48 | "\n", 49 | "**1)** Implement objective for a vanilla [Generative Adversarial Networks](https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf) (GAN). The hyperparameters are already set in the code. The model will converge if you implement the objective (1) right. \n", 50 | "\n", 51 | "**2)** Note the discussion in the paper, that the objective for $G$ can be of two kinds: $\\min_G \\log(1 - D)$ and $\\min_G - \\log(D)$. Now implement the second objective and ensure model converges. Most likely, in this example you will not notice the difference, but people usually use the second objective, it really matters in more complicated scenarios. **NOTE:** the objective for D stays the same.\n", 52 | "\n", 53 | "**3 & 4)** Implement [Wasserstein GAN](https://arxiv.org/pdf/1701.07875.pdf) and WGAN-GP. To make the discriminator has Lipschitz property you need to clip discriminator's weights to $[-0.01, 0.01]$ range (WGAN) or use gradient penalty (WGAN-GP). You will need to make few modifications to the code: \n", 54 | "\n", 55 | " - Remove sigmoids from discriminator;\n", 56 | " - Change objective (see eq. 
3 and algorithm 1 in [the paper](https://arxiv.org/pdf/1701.07875.pdf)): \n", 57 | " - Add weight clipping for D [see here](https://github.com/martinarjovsky/WassersteinGAN/blob/master/main.py#L172) / gradient penalty (WGAN-GP) [code](https://gist.github.com/DmitryUlyanov/19ce84045135e3f81a477629e685aec8); \n", 58 | "\n", 59 | " \n", 60 | "In general see [implementation 1](https://github.com/martinarjovsky/WassersteinGAN/blob/master/main.py#L172) / [implementation 2](https://github.com/caogang/wgan-gp). They also use a different optimizer. \n", 61 | "\n", 62 | "The default hyperparameters may not work well; spend some time tuning them -- play with the learning rate, the number of D updates per G update, and the architecture (what about weight initialization?). \n", 63 | "\n", 64 | "**5) Bonus: Wasserstein Introspective Neural Networks**. This is basically WGAN-GP without a generator. Read and implement the [WINN paper](https://arxiv.org/pdf/1711.08875.pdf) for our toy task. The classification step is almost identical to the discriminative step for WGAN-GP. However, on the synthesis step we will not use a generator network; instead we optimize the same loss as the generator loss in WGAN-GP with respect to the *generated objects* (aka \"pseudo-negative samples\"). Then, we accumulate the generated \"pseudo-negative\" samples and use mini-batches from them as the \"fake data\" for the next classification step.\n", 65 | "\n", 66 | "Here are some tips for you:\n", 67 | "- Initialize your \"fake dataset\" with random noise.\n", 68 | "- During the classification stage, sample fake data from the fake dataset.\n", 69 | "- For the synthesis step, use the fake samples from the previous step as the initial value.\n", 70 | "- You can use an ordinary Adam optimizer to update the samples, but you need to inject small noise on each step (last equation on page 4). 
Do not forget to early stop after the threshold (page 5, first paragraph) is reached.\n", 71 | "- Add the new generated points to the \"fake dataset\".\n", 72 | "\n", 73 | "To make the visualization work without a generator, you have to supply your generated samples to `vis_points` function." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "metadata": { 79 | "id": "ubl78LmBrFUx", 80 | "colab_type": "code", 81 | "colab": {} 82 | }, 83 | "source": [ 84 | "\"\"\" \n", 85 | " Please, implement everything in one notebook, using if statements to switch between the tasks\n", 86 | "\"\"\"\n", 87 | "# TASK in [1, 2, 3, 4, 5]\n", 88 | "TASK = 1" 89 | ], 90 | "execution_count": 0, 91 | "outputs": [] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "id": "nJPseo0grFU2", 97 | "colab_type": "text" 98 | }, 99 | "source": [ 100 | "# Imports" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "metadata": { 106 | "id": "k17KjAivrP9F", 107 | "colab_type": "code", 108 | "colab": {} 109 | }, 110 | "source": [ 111 | "!pip3 install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl\n", 112 | "!pip3 install torchvision" 113 | ], 114 | "execution_count": 0, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "dE21X55trFU3", 121 | "colab_type": "code", 122 | "colab": {} 123 | }, 124 | "source": [ 125 | "import numpy as np\n", 126 | "import time\n", 127 | "import torch.nn as nn\n", 128 | "import torch.optim as optim\n", 129 | "import torch\n", 130 | "import matplotlib.pyplot as plt\n", 131 | "%matplotlib inline\n", 132 | "\n", 133 | "torch.set_num_threads(4)\n", 134 | "np.random.seed(12345)\n", 135 | "lims=(-5, 5)" 136 | ], 137 | "execution_count": 0, 138 | "outputs": [] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "id": "u1paAVQvrFU6", 144 | "colab_type": "text" 145 | }, 146 | "source": [ 147 | "# Define sampler from real data and Z " 148 | ] 149 | }, 150 
| { 151 | "cell_type": "markdown", 152 | "metadata": { 153 | "id": "E2MQTdQ4rFU-", 154 | "colab_type": "text" 155 | }, 156 | "source": [ 157 | "Some utility functions." 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "metadata": { 163 | "id": "VanMNBMmrFVB", 164 | "colab_type": "code", 165 | "colab": {} 166 | }, 167 | "source": [ 168 | "from scipy.stats import rv_discrete\n", 169 | "\n", 170 | "MEANS = np.array(\n", 171 | " [[-1,-3],\n", 172 | " [1,3],\n", 173 | " [-2,0],\n", 174 | " ])\n", 175 | "COVS = np.array(\n", 176 | " [[[1,0.8],[0.8,1]],\n", 177 | " [[1,-0.5],[-0.5,1]],\n", 178 | " [[1,0],[0,1]],\n", 179 | " ])\n", 180 | "PROBS = np.array([\n", 181 | " 0.2,\n", 182 | " 0.5,\n", 183 | " 0.3\n", 184 | " ])\n", 185 | "assert len(MEANS) == len(COVS) == len(PROBS), \"number of components mismatch\"\n", 186 | "COMPONENTS = len(MEANS)\n", 187 | "\n", 188 | "comps_dist = rv_discrete(values=(range(COMPONENTS), PROBS))\n", 189 | "\n", 190 | "def sample_true(N):\n", 191 | " comps = comps_dist.rvs(size=N)\n", 192 | " conds = np.arange(COMPONENTS)[:,None] == comps[None,:]\n", 193 | " arr = np.array([np.random.multivariate_normal(MEANS[c], COVS[c], size=N)\n", 194 | " for c in range(COMPONENTS)])\n", 195 | " return np.select(conds[:,:,None], arr).astype(np.float32)\n", 196 | "\n", 197 | "NOISE_DIM = 20\n", 198 | "def sample_noise(N):\n", 199 | " return np.random.normal(size=(N,NOISE_DIM)).astype(np.float32)" 200 | ], 201 | "execution_count": 0, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "id": "dngENj5orFVF", 208 | "colab_type": "text" 209 | }, 210 | "source": [ 211 | "# Visualization functions" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": { 217 | "id": "qgvLtQqZrFVG", 218 | "colab_type": "text" 219 | }, 220 | "source": [ 221 | "And more utility functions." 
222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "metadata": { 227 | "id": "-lx2DhoarFVH", 228 | "colab_type": "code", 229 | "colab": {} 230 | }, 231 | "source": [ 232 | "def vis_data(data):\n", 233 | " \"\"\"\n", 234 | " Visualizes data as histogram\n", 235 | " \"\"\"\n", 236 | " hist = np.histogram2d(data[:, 1], data[:, 0], bins=100, range=[lims, lims])\n", 237 | " plt.pcolormesh(hist[1], hist[2], hist[0], alpha=0.5)\n", 238 | "\n", 239 | "fixed_noise = torch.Tensor(sample_noise(1000))\n", 240 | "def vis_g():\n", 241 | " \"\"\"\n", 242 | " Visualizes generator's samples as circles\n", 243 | " \"\"\"\n", 244 | " data = generator(fixed_noise).data.numpy()\n", 245 | " if np.isnan(data).any():\n", 246 | " return\n", 247 | " \n", 248 | " plt.scatter(data[:,0], data[:,1], alpha=0.2, c='b')\n", 249 | " plt.xlim(lims)\n", 250 | " plt.ylim(lims)\n", 251 | " \n", 252 | " \n", 253 | "def vis_points(data):\n", 254 | " \"\"\"\n", 255 | " Visualizes the supplied samples as circles\n", 256 | " \"\"\"\n", 257 | " if np.isnan(data).any():\n", 258 | " return\n", 259 | " \n", 260 | " plt.scatter(data[:,0], data[:,1], alpha=0.2, c='b')\n", 261 | " plt.xlim(lims)\n", 262 | " plt.ylim(lims)\n", 263 | " \n", 264 | "\n", 265 | "def get_grid():\n", 266 | " X, Y = np.meshgrid(np.linspace(lims[0], lims[1], 30), np.linspace(lims[0], lims[1], 30))\n", 267 | " X = X.flatten()\n", 268 | " Y = Y.flatten()\n", 269 | " \n", 270 | " grid = torch.from_numpy(np.vstack([X, Y]).astype(np.float32).T)\n", 271 | " grid.requires_grad = True\n", 272 | " \n", 273 | " return X, Y, grid\n", 274 | " \n", 275 | "X_grid, Y_grid, grid = get_grid()\n", 276 | "def vis_d():\n", 277 | " \"\"\"\n", 278 | " Visualizes discriminator's gradient on grid\n", 279 | " \"\"\"\n", 280 | " \n", 281 | " data_gen = generator(fixed_noise)\n", 282 | "# loss = d_loss(discriminator(data_gen), discriminator(grid))\n", 283 | " loss = g_loss(discriminator(grid))\n", 284 | " loss.backward()\n", 285 | " \n", 286 | " grads = - 
grid.grad.data.numpy()\n", 287 | " grid.grad.data *= 0 \n", 288 | " plt.quiver(X_grid, Y_grid, grads[:, 0], grads[:, 1], color='black',alpha=0.9)" 289 | ], 290 | "execution_count": 0, 291 | "outputs": [] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": { 296 | "id": "wUhLRejHrFVK", 297 | "colab_type": "text" 298 | }, 299 | "source": [ 300 | "# Define architectures" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "id": "5ZJWX7SOrFVP", 307 | "colab_type": "text" 308 | }, 309 | "source": [ 310 | "After you've done with task 1 you can play with architectures." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "metadata": { 316 | "id": "SwrFxWQ5rFVS", 317 | "colab_type": "code", 318 | "colab": {} 319 | }, 320 | "source": [ 321 | "def get_generator(noise_dim, out_dim, hidden_dim=100):\n", 322 | " layers = [\n", 323 | " nn.Linear(noise_dim, hidden_dim),\n", 324 | " nn.LeakyReLU(),\n", 325 | " nn.Linear(hidden_dim, hidden_dim),\n", 326 | " nn.LeakyReLU(),\n", 327 | " nn.Linear(hidden_dim, out_dim)\n", 328 | " ]\n", 329 | " return nn.Sequential(*layers)\n", 330 | "\n", 331 | "def get_discriminator(in_dim, hidden_dim=100):\n", 332 | " layers = [\n", 333 | " nn.Linear(in_dim, hidden_dim),\n", 334 | " nn.LeakyReLU(),\n", 335 | " nn.Linear(hidden_dim, hidden_dim),\n", 336 | " nn.LeakyReLU(),\n", 337 | " nn.Linear(hidden_dim, hidden_dim),\n", 338 | " nn.LeakyReLU(),\n", 339 | " nn.Linear(hidden_dim, 1),\n", 340 | " nn.Sigmoid()\n", 341 | " ]\n", 342 | " \n", 343 | " return nn.Sequential(*layers)" 344 | ], 345 | "execution_count": 0, 346 | "outputs": [] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": { 351 | "id": "H9bLX5xLrFVX", 352 | "colab_type": "text" 353 | }, 354 | "source": [ 355 | "# Define updates and losses" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "metadata": { 361 | "id": "nOZ6HW7SrFVr", 362 | "colab_type": "code", 363 | "colab": {} 364 | }, 365 | "source": [ 366 | "generator 
= get_generator(NOISE_DIM, out_dim = 2)\n", 367 | "discriminator = get_discriminator(in_dim = 2)\n", 368 | "\n", 369 | "lr = 0.001\n", 370 | "g_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))\n", 371 | "d_optimizer = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))" 372 | ], 373 | "execution_count": 0, 374 | "outputs": [] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": { 379 | "id": "XrAsfK5NrFV1", 380 | "colab_type": "text" 381 | }, 382 | "source": [ 383 | "Notice we are using the Adam optimizer with `beta1=0.5` for both the generator and the discriminator. This is a common practice and works well. Motivation: the models should be flexible and adapt rapidly to the distributions. \n", 384 | "\n", 385 | "You can try different optimizers and parameters." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "E34DENEyrFV3", 392 | "colab_type": "code", 393 | "colab": {} 394 | }, 395 | "source": [ 396 | "################################\n", 397 | "# IMPLEMENT HERE\n", 398 | "################################\n", 399 | "# Define the g_loss and d_loss here\n", 400 | "# these are the only lines of code you need to change to implement Tasks 1 and 2 \n", 401 | "\n", 402 | "def g_loss(d_scores_fake):\n", 403 | " \"\"\"\n", 404 | " `d_scores_fake` is the output of the discriminator model applied to a batch of fake data\n", 405 | " \n", 406 | " NOTE: we always define objectives as if we were minimizing them (remember that maximize = negate and minimize)\n", 407 | " \"\"\"\n", 408 | " # if TASK == 1: \n", 409 | " # return something\n", 410 | " # elif TASK == 2:\n", 411 | " # return something else\n", 412 | " \n", 413 | " return # TODO\n", 414 | " \n", 415 | "def d_loss(d_scores_fake, d_scores_real):\n", 416 | " \"\"\"\n", 417 | " `d_scores_fake` is the output of the discriminator model applied to a batch of fake data\n", 418 | " `d_scores_real` is the output of the discriminator model applied to 
a batch of real data\n", 419 | " \n", 420 | " NOTE: we always define objectives as if we were minimizing them (remember that maximize = negate and minimize)\n", 421 | " \"\"\"\n", 422 | " # if TASK == 1: \n", 423 | " # return something\n", 424 | " # elif TASK == 2:\n", 425 | " # return something else\n", 426 | " \n", 427 | " return # TODO" 428 | ], 429 | "execution_count": 0, 430 | "outputs": [] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": { 435 | "id": "K7DHnmwkrFV7", 436 | "colab_type": "text" 437 | }, 438 | "source": [ 439 | "# Get real data" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "metadata": { 445 | "id": "oncmevrsrFV8", 446 | "colab_type": "code", 447 | "colab": {} 448 | }, 449 | "source": [ 450 | "data = sample_true(100000)\n", 451 | "def iterate_minibatches(X, batchsize, y=None):\n", 452 | " perm = np.random.permutation(X.shape[0])\n", 453 | " \n", 454 | " for start in range(0, X.shape[0], batchsize):\n", 455 | " end = min(start + batchsize, X.shape[0])\n", 456 | " if y is None:\n", 457 | " yield X[perm[start:end]]\n", 458 | " else:\n", 459 | " yield X[perm[start:end]], y[perm[start:end]]" 460 | ], 461 | "execution_count": 0, 462 | "outputs": [] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "id": "3KTbE54KrFWI", 468 | "colab_type": "text" 469 | }, 470 | "source": [ 471 | "**Legend**:\n", 472 | "- Blue dots are generated samples. \n", 473 | "- Colored histogram at the back shows density of real data. \n", 474 | "- And with arrows we show gradients of the discriminator -- they are the directions in which the discriminator pushes the generator's samples. 
" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": { 480 | "id": "ARtDIoGhrFWJ", 481 | "colab_type": "text" 482 | }, 483 | "source": [ 484 | "# Train the model" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "metadata": { 490 | "scrolled": false, 491 | "id": "EeWUNHAUrFWK", 492 | "colab_type": "code", 493 | "colab": {} 494 | }, 495 | "source": [ 496 | "from IPython import display\n", 497 | "\n", 498 | "plt.xlim(lims)\n", 499 | "plt.ylim(lims)\n", 500 | "\n", 501 | "batch_size = 64\n", 502 | "\n", 503 | "# ===========================\n", 504 | "# IMPORTANT PARAMETER:\n", 505 | "# Number of D updates per G update\n", 506 | "# ===========================\n", 507 | "k_d, k_g = 1, 1\n", 508 | "\n", 509 | "try:\n", 510 | " for it, real_data in enumerate(iterate_minibatches(data, batch_size)):\n", 511 | "\n", 512 | " # Optimize D\n", 513 | " for _ in range(k_d):\n", 514 | " d_optimizer.zero_grad()\n", 515 | " \n", 516 | " # Sample noise\n", 517 | " noise = torch.Tensor(sample_noise(real_data.shape[0]))\n", 518 | "\n", 519 | " # Compute gradient\n", 520 | " real_data = torch.Tensor(real_data)\n", 521 | " fake_data = generator(noise)\n", 522 | " loss = d_loss(discriminator(fake_data), discriminator(real_data)) \n", 523 | " loss.backward()\n", 524 | " \n", 525 | " # IMPLEMENT HERE GP FOR TASK 4\n", 526 | " \n", 527 | " # Update\n", 528 | " d_optimizer.step()\n", 529 | "\n", 530 | " # Optimize G\n", 531 | " for _ in range(k_g):\n", 532 | " g_optimizer.zero_grad()\n", 533 | " \n", 534 | " # Sample noise\n", 535 | " noise = torch.Tensor(sample_noise(real_data.shape[0]))\n", 536 | "\n", 537 | " # Compute gradient\n", 538 | " fake_data = generator(noise)\n", 539 | " loss = g_loss(discriminator(fake_data))\n", 540 | " loss.backward()\n", 541 | " \n", 542 | " # Update\n", 543 | " g_optimizer.step()\n", 544 | "\n", 545 | " # Visualize\n", 546 | " if it % 2 == 0:\n", 547 | " plt.clf()\n", 548 | " vis_data(data)\n", 549 | " \n", 550 | " if TASK < 
5:\n", 551 | " vis_g()\n", 552 | " else:\n", 553 | " # UNCOMMENT AND SUPPLY YOUR SAMPLES FOR BONUS TASK 5\n", 554 | " # vis_points(generated_samples[-1000:])\n", 555 | " pass\n", 556 | " \n", 557 | " vis_d()\n", 558 | " display.clear_output(wait=True)\n", 559 | " display.display(plt.gcf())\n", 560 | " print(f\"Task {TASK}; Iteration {it}\")\n", 561 | " \n", 562 | "except KeyboardInterrupt:\n", 563 | " pass" 564 | ], 565 | "execution_count": 0, 566 | "outputs": [] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": { 571 | "id": "BE27RENgrFWM", 572 | "colab_type": "text" 573 | }, 574 | "source": [ 575 | "# Describe your findings here" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "metadata": { 581 | "id": "6qLED3AsrFWO", 582 | "colab_type": "text" 583 | }, 584 | "source": [ 585 | "London is the capital of Great Britain." 586 | ] 587 | } 588 | ] 589 | } -------------------------------------------------------------------------------- /seminars/day3/gan/readme.md: -------------------------------------------------------------------------------- 1 | # Generative adversarial networks 2 | 3 | * [Colab: seminar](https://colab.research.google.com/drive/1lQr79EAA-1jdwyqVswPzGHfhL5Yx7WCq) 4 | -------------------------------------------------------------------------------- /seminars/day3/gan/solutions_GAN_sem.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Copy of GAN_sem.ipynb","version":"0.3.2","provenance":[{"file_id":"1lQr79EAA-1jdwyqVswPzGHfhL5Yx7WCq","timestamp":1566420056458},{"file_id":"https://github.com/bayesgroup/deepbayes-2018/blob/master/day4_gans/GAN_deep_bayes_updated.ipynb","timestamp":1554219042423}],"collapsed_sections":[]},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"cYVeeM-5rFUm","colab_type":"text"},"source":["The visualization used for 
this seminar is based on Alexandr Verinov's code. "]},{"cell_type":"markdown","metadata":{"id":"2rMQe7eYrFUo","colab_type":"text"},"source":["# Generative models"]},{"cell_type":"markdown","metadata":{"id":"MS8Gn4MTrFUs","colab_type":"text"},"source":["In this seminar we will try several criteria for learning an implicit model. For the first part almost everything is written for you, and you only need to implement the objective for the game and play around with the model. \n","\n","**0)** Read the code\n","\n","**1)** Implement the objective for a vanilla [Generative Adversarial Network](https://papers.nips.cc/paper/5423-generative-adversarial-nets.pdf) (GAN). The hyperparameters are already set in the code. The model will converge if you implement the objective (1) right. \n","\n","**2)** Note the discussion in the paper that the objective for $G$ can be of two kinds: $\\min_G \\log(1 - D)$ and $\\min_G - \\log(D)$. Now implement the second objective and ensure the model converges. Most likely, in this example you will not notice the difference, but people usually use the second objective; it really matters in more complicated scenarios. **NOTE:** the objective for D stays the same.\n","\n","**3 & 4)** Implement [Wasserstein GAN](https://arxiv.org/pdf/1701.07875.pdf) and WGAN-GP. To give the discriminator the Lipschitz property you need to clip the discriminator's weights to the $[-0.01, 0.01]$ range (WGAN) or use a gradient penalty (WGAN-GP). You will need to make a few modifications to the code: \n","\n"," - Remove sigmoids from discriminator;\n"," - Change objective (see eq. 
3 and algorithm 1 in [the paper](https://arxiv.org/pdf/1701.07875.pdf)): \n"," - Add weight clipping for D [see here](https://github.com/martinarjovsky/WassersteinGAN/blob/master/main.py#L172) / gradient penalty (WGAN-GP) [code](https://gist.github.com/DmitryUlyanov/19ce84045135e3f81a477629e685aec8); \n","\n"," \n","In general see [implementation 1](https://github.com/martinarjovsky/WassersteinGAN/blob/master/main.py#L172) / [implementation 2](https://github.com/caogang/wgan-gp). They also use a different optimizer. \n","\n","The default hyperparameters may not work well; spend some time tuning them -- play with the learning rate, the number of D updates per G update, and the architecture (what about weight initialization?). \n","\n","**5) Bonus: Wasserstein Introspective Neural Networks**. This is basically WGAN-GP without a generator. Read and implement the [WINN paper](https://arxiv.org/pdf/1711.08875.pdf) for our toy task. The classification step is almost identical to the discriminative step for WGAN-GP. However, on the synthesis step we will not use a generator network; instead we optimize the same loss as the generator loss in WGAN-GP with respect to the *generated objects* (aka \"pseudo-negative samples\"). Then, we accumulate the generated \"pseudo-negative\" samples and use mini-batches from them as the \"fake data\" for the next classification step.\n","\n","Here are some tips for you:\n","- Initialize your \"fake dataset\" with random noise.\n","- During the classification stage, sample fake data from the fake dataset.\n","- For the synthesis step, use the fake samples from the previous step as the initial value.\n","- You can use an ordinary Adam optimizer to update the samples, but you need to inject small noise on each step (last equation on page 4). 
Do not forget to early stop after the threshold (page 5, first paragraph) is reached.\n","- Add the new generated points to the \"fake dataset\".\n","\n","To make the visualization work without a generator, you have to supply your generated samples to `vis_points` function."]},{"cell_type":"code","metadata":{"id":"ubl78LmBrFUx","colab_type":"code","colab":{}},"source":["\"\"\" \n"," Please, implement everything in one notebook, using if statements to switch between the tasks\n","\"\"\"\n","# TASK in [1, 2, 3, 4, 5]\n","TASK = 1"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nJPseo0grFU2","colab_type":"text"},"source":["# Imports"]},{"cell_type":"code","metadata":{"id":"k17KjAivrP9F","colab_type":"code","colab":{}},"source":["!pip3 install https://download.pytorch.org/whl/cpu/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl\n","!pip3 install torchvision"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"dE21X55trFU3","colab_type":"code","colab":{}},"source":["import numpy as np\n","import time\n","import torch.nn as nn\n","import torch.optim as optim\n","import torch\n","import matplotlib.pyplot as plt\n","%matplotlib inline\n","\n","torch.set_num_threads(4)\n","np.random.seed(12345)\n","lims=(-5, 5)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"u1paAVQvrFU6","colab_type":"text"},"source":["# Define sampler from real data and Z "]},{"cell_type":"markdown","metadata":{"id":"E2MQTdQ4rFU-","colab_type":"text"},"source":["Some utility functions."]},{"cell_type":"code","metadata":{"id":"VanMNBMmrFVB","colab_type":"code","colab":{}},"source":["from scipy.stats import rv_discrete\n","\n","MEANS = np.array(\n"," [[-1,-3],\n"," [1,3],\n"," [-2,0],\n"," ])\n","COVS = np.array(\n"," [[[1,0.8],[0.8,1]],\n"," [[1,-0.5],[-0.5,1]],\n"," [[1,0],[0,1]],\n"," ])\n","PROBS = np.array([\n"," 0.2,\n"," 0.5,\n"," 0.3\n"," ])\n","assert len(MEANS) == len(COVS) == len(PROBS), \"number of components 
mismatch\"\n","COMPONENTS = len(MEANS)\n","\n","comps_dist = rv_discrete(values=(range(COMPONENTS), PROBS))\n","\n","def sample_true(N):\n"," comps = comps_dist.rvs(size=N)\n"," conds = np.arange(COMPONENTS)[:,None] == comps[None,:]\n"," arr = np.array([np.random.multivariate_normal(MEANS[c], COVS[c], size=N)\n"," for c in range(COMPONENTS)])\n"," return np.select(conds[:,:,None], arr).astype(np.float32)\n","\n","NOISE_DIM = 20\n","def sample_noise(N):\n"," return np.random.normal(size=(N,NOISE_DIM)).astype(np.float32)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dngENj5orFVF","colab_type":"text"},"source":["# Visualization functions"]},{"cell_type":"markdown","metadata":{"id":"qgvLtQqZrFVG","colab_type":"text"},"source":["And more utility functions."]},{"cell_type":"code","metadata":{"id":"-lx2DhoarFVH","colab_type":"code","colab":{}},"source":["def vis_data(data):\n"," \"\"\"\n"," Visualizes data as histogram\n"," \"\"\"\n"," hist = np.histogram2d(data[:, 1], data[:, 0], bins=100, range=[lims, lims])\n"," plt.pcolormesh(hist[1], hist[2], hist[0], alpha=0.5)\n","\n","fixed_noise = torch.Tensor(sample_noise(1000))\n","def vis_g():\n"," \"\"\"\n"," Visualizes generator's samples as circles\n"," \"\"\"\n"," data = generator(fixed_noise).data.numpy()\n"," if np.isnan(data).any():\n"," return\n"," \n"," plt.scatter(data[:,0], data[:,1], alpha=0.2, c='b')\n"," plt.xlim(lims)\n"," plt.ylim(lims)\n"," \n"," \n","def vis_points(data):\n"," \"\"\"\n"," Visualizes the supplied samples as circles\n"," \"\"\"\n"," if np.isnan(data).any():\n"," return\n"," \n"," plt.scatter(data[:,0], data[:,1], alpha=0.2, c='b')\n"," plt.xlim(lims)\n"," plt.ylim(lims)\n"," \n","\n","def get_grid():\n"," X, Y = np.meshgrid(np.linspace(lims[0], lims[1], 30), np.linspace(lims[0], lims[1], 30))\n"," X = X.flatten()\n"," Y = Y.flatten()\n"," \n"," grid = torch.from_numpy(np.vstack([X, Y]).astype(np.float32).T)\n"," grid.requires_grad = True\n"," \n"," return X, Y, 
grid\n"," \n","X_grid, Y_grid, grid = get_grid()\n","def vis_d():\n"," \"\"\"\n"," Visualizes discriminator's gradient on grid\n"," \"\"\"\n"," \n"," data_gen = generator(fixed_noise)\n","# loss = d_loss(discriminator(data_gen), discriminator(grid))\n"," loss = g_loss(discriminator(grid))\n"," loss.backward()\n"," \n"," grads = - grid.grad.data.numpy()\n"," grid.grad.data *= 0 \n"," plt.quiver(X_grid, Y_grid, grads[:, 0], grads[:, 1], color='black',alpha=0.9)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"wUhLRejHrFVK","colab_type":"text"},"source":["# Define architectures"]},{"cell_type":"markdown","metadata":{"id":"5ZJWX7SOrFVP","colab_type":"text"},"source":["After you've done with task 1 you can play with architectures."]},{"cell_type":"code","metadata":{"id":"SwrFxWQ5rFVS","colab_type":"code","colab":{}},"source":["def get_generator(noise_dim, out_dim, hidden_dim=100):\n"," layers = [\n"," nn.Linear(noise_dim, hidden_dim),\n"," nn.LeakyReLU(),\n"," nn.Linear(hidden_dim, hidden_dim),\n"," nn.LeakyReLU(),\n"," nn.Linear(hidden_dim, out_dim)\n"," ]\n"," return nn.Sequential(*layers)\n","\n","def get_discriminator(in_dim, hidden_dim=100):\n"," layers = [\n"," nn.Linear(in_dim, hidden_dim),\n"," nn.LeakyReLU(),\n"," nn.Linear(hidden_dim, hidden_dim),\n"," nn.LeakyReLU(),\n"," nn.Linear(hidden_dim, hidden_dim),\n"," nn.LeakyReLU(),\n"," nn.Linear(hidden_dim, 1)]\n"," if TASK == 1 or TASK == 2:\n"," layers += [nn.Sigmoid()]\n"," \n"," return nn.Sequential(*layers)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"H9bLX5xLrFVX","colab_type":"text"},"source":["# Define updates and losses"]},{"cell_type":"code","metadata":{"id":"nOZ6HW7SrFVr","colab_type":"code","colab":{}},"source":["generator = get_generator(NOISE_DIM, out_dim = 2)\n","discriminator = get_discriminator(in_dim = 2)\n","\n","lr = 0.001\n","g_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))\n","d_optimizer = 
optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XrAsfK5NrFV1","colab_type":"text"},"source":["Notice we are using the Adam optimizer with `beta1=0.5` for both the generator and the discriminator. This is a common practice and works well. Motivation: the models should be flexible and adapt rapidly to the distributions. \n","\n","You can try different optimizers and parameters."]},{"cell_type":"code","metadata":{"id":"E34DENEyrFV3","colab_type":"code","colab":{}},"source":["################################\n","# IMPLEMENT HERE\n","################################\n","# Define the g_loss and d_loss here\n","# these are the only lines of code you need to change to implement Tasks 1 and 2 \n","\n","def g_loss(d_scores_fake):\n"," \"\"\"\n"," `d_scores_fake` is the output of the discriminator model applied to a batch of fake data\n"," \n"," NOTE: we always define objectives as if we were minimizing them (remember that maximize = negate and minimize)\n"," \"\"\"\n"," if TASK == 1: \n"," return torch.log(1 - d_scores_fake).mean()\n"," elif TASK == 2:\n"," return -torch.log(d_scores_fake).mean()\n"," elif TASK == 3 or TASK == 4 or TASK == 5:\n"," return -d_scores_fake.mean()\n"," \n","def d_loss(d_scores_fake, d_scores_real):\n"," \"\"\"\n"," `d_scores_fake` is the output of the discriminator model applied to a batch of fake data\n"," `d_scores_real` is the output of the discriminator model applied to a batch of real data\n"," \n"," NOTE: we always define objectives as if we were minimizing them (remember that maximize = negate and minimize)\n"," \"\"\"\n"," if TASK == 1: \n"," return -torch.log(d_scores_real).mean() - torch.log(1 - d_scores_fake).mean()\n"," elif TASK == 2:\n"," return -torch.log(d_scores_real).mean() - torch.log(1 - d_scores_fake).mean()\n"," elif TASK == 3 or TASK == 4 or TASK == 5:\n"," return d_scores_fake.mean() - 
d_scores_real.mean()"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"K7DHnmwkrFV7","colab_type":"text"},"source":["# Get real data"]},{"cell_type":"code","metadata":{"id":"oncmevrsrFV8","colab_type":"code","colab":{}},"source":["data = sample_true(100000)\n","def iterate_minibatches(X, batchsize, y=None):\n"," perm = np.random.permutation(X.shape[0])\n"," \n"," for start in range(0, X.shape[0], batchsize):\n"," end = min(start + batchsize, X.shape[0])\n"," if y is None:\n"," yield X[perm[start:end]]\n"," else:\n"," yield X[perm[start:end]], y[perm[start:end]]"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"3KTbE54KrFWI","colab_type":"text"},"source":["**Legend**:\n","- Blue dots are generated samples. \n","- Colored histogram at the back shows density of real data. \n","- And with arrows we show gradients of the discriminator -- they are the directions in which the discriminator pushes the generator's samples. "]},{"cell_type":"markdown","metadata":{"id":"ARtDIoGhrFWJ","colab_type":"text"},"source":["# Train the model"]},{"cell_type":"code","metadata":{"id":"laHiSHy-usNO","colab_type":"code","colab":{}},"source":["from torch import autograd\n","\n","\n","def calc_gradient_penalty(discriminator, real_data, fake_data):\n"," alpha = torch.rand(batch_size, 1)\n"," alpha = alpha.expand(real_data.size())\n","\n"," interpolates = alpha * real_data + ((1 - alpha) * fake_data)\n","\n"," interpolates = autograd.Variable(interpolates, requires_grad=True)\n","\n"," disc_interpolates = discriminator(interpolates)\n","\n"," gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,\n"," grad_outputs=torch.ones(disc_interpolates.size()),\n"," create_graph=True, retain_graph=True, only_inputs=True)[0]\n","\n"," gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * 10\n"," \n"," return 
gradient_penalty"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"scrolled":false,"id":"EeWUNHAUrFWK","colab_type":"code","colab":{}},"source":["from IPython import display\n","\n","plt.xlim(lims)\n","plt.ylim(lims)\n","\n","batch_size = 64\n","\n","# ===========================\n","# IMPORTANT PARAMETER:\n","# Number of D updates per G update\n","# ===========================\n","k_d, k_g = 1, 1\n","\n","if TASK == 5:\n"," generated_samples = torch.randn(batch_size, data.shape[1])\n","\n","try:\n"," for it, real_data in enumerate(iterate_minibatches(data, batch_size)):\n","\n"," # Optimize D\n"," for _ in range(k_d):\n"," \n"," if TASK == 3:\n"," for p in discriminator.parameters():\n"," p.data.clamp_(-0.01, 0.01)\n","\n"," d_optimizer.zero_grad()\n"," \n"," # Sample noise\n"," noise = torch.Tensor(sample_noise(real_data.shape[0]))\n","\n"," # Compute gradient\n"," real_data = torch.Tensor(real_data)\n"," if TASK != 5:\n"," fake_data = generator(noise)\n"," else:\n"," perm = np.random.permutation(generated_samples.shape[0])[:batch_size]\n"," fake_data = generated_samples[perm]\n"," scores_real = discriminator(fake_data)\n"," loss = d_loss(scores_real, discriminator(real_data)) \n"," loss.backward()\n"," \n"," if TASK == 4:\n"," loss_gp = calc_gradient_penalty(discriminator, real_data, fake_data)\n"," loss_gp.backward()\n"," \n"," # Update\n"," d_optimizer.step()\n","\n"," # Optimize G\n"," for _ in range(k_g):\n"," if TASK == 5:\n"," new_generated_samples = nn.Parameter(torch.randn(batch_size, data.shape[1]))\n"," g_optimizer = optim.Adam([new_generated_samples], lr=lr, betas=(0.5, 0.999))\n"," \n"," \n"," if TASK != 5:\n"," g_optimizer.zero_grad()\n"," # Sample noise\n"," noise = torch.Tensor(sample_noise(real_data.shape[0]))\n","\n"," # Compute gradient\n"," fake_data = generator(noise)\n"," loss = g_loss(discriminator(fake_data))\n"," loss.backward()\n","\n"," # Update\n"," g_optimizer.step()\n"," else:\n"," stop_at = torch.rand(1) * 
(scores_real.max() - scores_real.min()) + scores_real.min()\n"," while True:\n"," g_optimizer.zero_grad()\n"," scores = discriminator(new_generated_samples)\n"," if scores.mean() > stop_at:\n"," break\n"," loss = g_loss(scores) + torch.randn(1) * lr/2\n"," loss.backward()\n"," g_optimizer.step()\n"," generated_samples = torch.cat([generated_samples, new_generated_samples.detach()])\n","\n"," # Visualize\n"," if it % 2 == 0:\n"," plt.clf()\n"," vis_data(data)\n"," \n"," if TASK < 5:\n"," vis_g()\n"," else:\n"," # UNCOMMENT AND SUPPLY YOUR SAMPLES FOR BONUS TASK 5\n"," vis_points(generated_samples[-1000:])\n"," pass\n"," \n"," vis_d()\n"," display.clear_output(wait=True)\n"," display.display(plt.gcf())\n"," print(f\"Task {TASK}; Iteration {it}\")\n"," \n","except KeyboardInterrupt:\n"," pass"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"BE27RENgrFWM","colab_type":"text"},"source":["# Describe your findings here"]},{"cell_type":"markdown","metadata":{"id":"6qLED3AsrFWO","colab_type":"text"},"source":["London is the capital of Great Britain."]}]} -------------------------------------------------------------------------------- /seminars/day3/nf/nf-assignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "NA1jh8NDbTSz" 8 | }, 9 | "source": [ 10 | "# Assignment 1" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "onzQnZoFbTS0" 18 | }, 19 | "source": [ 20 | "# Density estimation using Real NVP\n", 21 | "\n", 22 | "Normalizing flows is the class of probabilistic models that provides flexible parametrical probabilistic models, where the probability density function can be computed exactly. 
In the assignment, we will consider a real-valued non-volume preserving normalizing flow (Real NVP) -- a special case of normalizing flow.\n", 23 | "\n", 24 | "#### Problem setting\n", 25 | "\n", 26 | "Our goal is to train a generative network $g_\\theta: Z \\rightarrow X, g = f^{-1}$ that maps a latent variable $z \\sim p(z)$ to a sample $x \\sim p(x)$, where $p(z)$ is a prior distribution and $p(x)$ is the data distribution. An illustrative example is provided below.\n", 27 | "\n", 28 | "![alt text](https://raw.githubusercontent.com/senya-ashukha/senya-ashukha.github.io/f4ed2d6ac83954b1358168dc98be3f17b6f2abbb/assignments/normalizing-flows/2d-example.png)\n", 29 | "\n", 30 | "#### Change of variable formula\n", 31 | "\n", 32 | "Given an observed data variable $x \\in X$,\n", 33 | "a simple prior probability distribution $p_{Z}$ on a latent variable $z \\in Z$,\n", 34 | "and a bijection $f: X \\rightarrow Z$ (with $g = f^{-1}$),\n", 35 | "the change of variable formula defines a model distribution of $X$ by\n", 36 | "\n", 42 | "\n", 43 | "![alt text](https://raw.githubusercontent.com/senya-ashukha/senya-ashukha.github.io/f4ed2d6ac83954b1358168dc98be3f17b6f2abbb/assignments/normalizing-flows/f.png)\n", 44 | "where $\\frac{\\partial f(x)}{\\partial x^T}$ is the Jacobian of $f$ at $x$.\n", 45 | "\n", 46 | "Exact samples from the resulting distribution can be generated by using the inverse transform sampling rule. A sample $z \\sim p_{Z}$ is drawn in the latent space, and its inverse image $x = f^{-1}(z) = g(z)$ generates a sample in the original space. 
Computing the density at a point $x$ is accomplished by computing the density of its image $f(x)$ and multiplying by the associated Jacobian determinant $\\det\\left(\\frac{\\partial f(x)}{\\partial x^T}\\right)$.\n", 47 | "\n", 48 | "#### Real NVP\n", 49 | "\n", 50 | "Real NVP presents a class of functions where $\\log\\left(\\left|\\det\\left(\\frac{\\partial f(x)}{\\partial x^T}\\right)\\right|\\right)$ can be computed efficiently (see, 3.3 Properties, https://arxiv.org/abs/1605.08803). Every layer of Real NVP is a coupling layer followed by permutation layer. Combination of coupling and permutation layers can be implemented as a masked version of the coupling layer:\n", 51 | "## $$y = b \\odot x + (1 - b) \\odot \\Big(x \\odot \\exp\\big(s(b \\odot x)\\big) + t(b \\odot x)\\Big)$$\n", 52 | "\n", 53 | "where $s$ and $t$ stand for scale and translation, and are functions from $R^{D} \\mapsto R^{D}$, and $\\odot$ is the Hadamard product or element-wise product, $b$ is a binary mask. For more details on the model see the paper Density estimation using Real NVP https://arxiv.org/abs/1605.08803.\n", 54 | "\n", 55 | "# In this assignment:\n", 56 | "1. Implementation of Real NVP\n", 57 | "2. Training Real NVP on 2d circles or moons dataset\n", 58 | "3. Visualization of the generative model\n", 59 | "4. Optional Research Assignment\n", 60 | "\n", 61 | "Additional information:\n", 62 | "- You will need the following python packages: PyTorch, Numpy, sklearn, matplotlib.\n", 63 | "- If you have an urgent question or find a typo or a mistake, send it to ars.ashuha@gmail.com. The title should include \"BDL Assignment 2 2018\".\n", 64 | "- A submission policy will be released later." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "colab_type": "text", 71 | "id": "NgMBEYdwbTS2" 72 | }, 73 | "source": [ 74 | "# Implementation of Real NVP" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 0, 80 | "metadata": { 81 | "colab": {}, 82 | "colab_type": "code", 83 | "id": "loxXeNy9bTS3" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import torch\n", 88 | "from torch import nn\n", 89 | "from torch.nn.parameter import Parameter" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 0, 95 | "metadata": { 96 | "colab": {}, 97 | "colab_type": "code", 98 | "id": "GpyZ5AqObTS7" 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "class RealNVP(nn.Module):\n", 103 | " def __init__(self, nets, nett, mask, prior):\n", 104 | " super(RealNVP, self).__init__()\n", 105 | " \n", 106 | " # Create a flow\n", 107 | " # nets: a function that returns a PyTorch neural network, e.g., nn.Sequential, s = nets(), s: dim(X) -> dim(X)\n", 108 | " # nett: a function that returns a PyTorch neural network, e.g., nn.Sequential, t = nett(), t: dim(X) -> dim(X)\n", 109 | " # mask: a torch.Tensor of size #number_of_coupling_layers x #dim(X)\n", 110 | " # prior: an object from torch.distributions e.g., torch.distributions.MultivariateNormal\n", 111 | " \n", 112 | " self.prior = prior\n", 113 | " self.mask = nn.Parameter(mask, requires_grad=False)\n", 114 | " self.t = torch.nn.ModuleList([nett() for _ in range(len(masks))])\n", 115 | " self.s = torch.nn.ModuleList([nets() for _ in range(len(masks))])\n", 116 | " \n", 117 | " def g(self, z):\n", 118 | " # Compute and return g(z) = x, \n", 119 | " # where self.mask[i], self.t[i], self.s[i] define a i-th masked coupling layer \n", 120 | " # z: a torch.Tensor of shape batchSize x 1 x dim(X)\n", 121 | " # return x: a torch.Tensor of shape batchSize x 1 x dim(X)\n", 122 | " return x\n", 123 | "\n", 124 | " def f(self, x):\n", 125 | " # Compute f(x) = z and log_det_Jakobian 
of f, \n", 126 | " # where self.mask[i], self.t[i], self.s[i] define a i-th masked coupling layer \n", 127 | " # x: a torch.Tensor, of shape batchSize x dim(X), is a datapoint\n", 128 | " # return z: a torch.Tensor of shape batchSize x dim(X), a hidden representations\n", 129 | " # return log_det_J: a torch.Tensor of len batchSize\n", 130 | " \n", 131 | " return z, log_det_J\n", 132 | " \n", 133 | " def log_prob(self, x):\n", 134 | " # Compute and return log p(x)\n", 135 | " # using the change of variable formula and log_det_J computed by f\n", 136 | " # return logp: torch.Tensor of len batchSize\n", 137 | " return logp\n", 138 | " \n", 139 | " def sample(self, batchSize): \n", 140 | " # Draw and return batchSize samples from flow using implementation of g\n", 141 | " # return x: torch.Tensor of shape batchSize x 1 x dim(X)\n", 142 | " return x" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 0, 148 | "metadata": { 149 | "colab": {}, 150 | "colab_type": "code", 151 | "id": "Qy_HOwQzbTS9" 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "nets = # a function that take no arguments and return a pytorch model, dim(X) -> dim(X)\n", 156 | "nett = # a function that take no arguments and return a pytorch model, dim(X) -> dim(X)\n", 157 | "\n", 158 | "# Check nets and nett are working i.e., computing without errors\n", 159 | "# Check that resulting dimensions s and t are the same and equal dim(X)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 0, 165 | "metadata": { 166 | "colab": {}, 167 | "colab_type": "code", 168 | "id": "ckUHr3ZTbTTA" 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "import numpy as np\n", 173 | "masks = # torch.Tensor of size #number_of_coupling_layers x #dim(X)\n", 174 | "# Check that when dim(X) == 2, the mask for every layer has just one 1.0 and one 0.0 elements." 
175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 0, 180 | "metadata": { 181 | "colab": {}, 182 | "colab_type": "code", 183 | "id": "Ov8AduZnbTTC" 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from torch import distributions\n", 188 | "prior = distributions.MultivariateNormal(torch.zeros(2), torch.eye(2))\n", 189 | "# Check that prior has log_prob and sample methods" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 0, 195 | "metadata": { 196 | "colab": {}, 197 | "colab_type": "code", 198 | "id": "J6YPmM_3bTTE" 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "flow = RealNVP(nets, nett, masks, prior)\n", 203 | "# Check that a flow is reversible g(f(x)) = x\n", 204 | "# With a big chance you have some errors in RealNVP.log_prob, think hard on it." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 0, 210 | "metadata": { 211 | "colab": {}, 212 | "colab_type": "code", 213 | "id": "vb80JOSSbTTG" 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "from sklearn import datasets\n", 218 | "\n", 219 | "trainable_parametrs = # list of all trainable parameters in a flow\n", 220 | "optimizer = # choose an optimizer, use module torch.optim\n", 221 | "\n", 222 | "for t in range(5001): \n", 223 | " noisy_circles = datasets.make_circles(n_samples=100, factor=.5, noise=.05)\n", 224 | " loss = # compute the maximum-likelihood loss\n", 225 | " \n", 226 | " optimizer.zero_grad()\n", 227 | " loss.backward()\n", 228 | " optimizer.step()\n", 229 | " \n", 230 | " if t % 500 == 0:\n", 231 | " print('iter %s:' % t, 'loss = %.3f' % loss)\n", 232 | " \n", 233 | "# Check that the loss decreases\n", 234 | "# Is the visualization below good?" 
235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "colab_type": "text", 241 | "id": "6hOxCpXYbTTJ" 242 | }, 243 | "source": [ 244 | "# Visualization" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 0, 250 | "metadata": { 251 | "colab": {}, 252 | "colab_type": "code", 253 | "id": "RmK9g7CIbTTK" 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "from sklearn import datasets\n", 258 | "import matplotlib.pyplot as plt\n", 259 | "%matplotlib inline\n", 260 | "from pylab import rcParams\n", 261 | "rcParams['figure.figsize'] = 10, 8\n", 262 | "rcParams['figure.dpi'] = 300\n", 263 | "\n", 264 | "noisy_circles = datasets.make_circles(n_samples=100, factor=.5, noise=.05)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "colab_type": "text", 271 | "id": "Gz5YG8FNbTTN" 272 | }, 273 | "source": [ 274 | "Draw several plots: \n", 275 | "- samples from flow\n", 276 | "- samples from prior\n", 277 | "- data samples\n", 278 | "- mapping from data to prior\n", 279 | "\n", 280 | "The goal is to obtain a figure similar to the one in https://arxiv.org/abs/1605.08803" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 0, 286 | "metadata": { 287 | "colab": {}, 288 | "colab_type": "code", 289 | "id": "qbhwTtHjpHC9" 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "noisy_moons = datasets.make_moons(n_samples=1000, noise=.05)[0].astype(np.float32)\n", 294 | "z = flow.f(torch.from_numpy(noisy_moons))[0].detach().numpy()\n", 295 | "plt.subplot(221)\n", 296 | "plt.scatter(z[:, 0], z[:, 1])\n", 297 | "plt.title(r'$z = f(X)$')\n", 298 | "\n", 299 | "z = np.random.multivariate_normal(np.zeros(2), np.eye(2), 1000)\n", 300 | "plt.subplot(222)\n", 301 | "plt.scatter(z[:, 0], z[:, 1])\n", 302 | "plt.title(r'$z \\sim p(z)$')\n", 303 | "\n", 304 | "plt.subplot(223)\n", 305 | "x = datasets.make_moons(n_samples=1000, noise=.05)[0].astype(np.float32)\n", 306 | "plt.scatter(x[:, 0], 
x[:, 1], 
c='r')\n", 307 | "plt.title(r'$X \\sim p(X)$')\n", 308 | "\n", 309 | "plt.subplot(224)\n", 310 | "x = flow.sample(1000).detach().numpy()\n", 311 | "plt.scatter(x[:, 0, 0], x[:, 0, 1], c='r')\n", 312 | "plt.title(r'$X = g(z)$')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "colab_type": "text", 319 | "id": "atnq5yzPbTTN" 320 | }, 321 | "source": [ 322 | "## Provide answers for the following questions:" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "colab_type": "text", 329 | "id": "HcEgstlZbTTP" 330 | }, 331 | "source": [ 332 | "1. What architecture worked better in your experiments?\n", 333 | "\n", 334 | "\n", 335 | "**Your answer with justification**\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "colab_type": "text", 342 | "id": "E8Jg4GGgbTTQ" 343 | }, 344 | "source": [ 345 | "2. Did you find what is important for stable training (initializations, nonlinearities, ...)?\n", 346 | "\n", 347 | "\n", 348 | "**Your answer with justification**\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "colab_type": "text", 355 | "id": "LttbxRD1bTTQ" 356 | }, 357 | "source": [ 358 | "3. How does convergence speed (in iterations) depend on the complexity of the architecture?\n", 359 | "\n", 360 | "\n", 361 | "**Your answer with justification**\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": { 367 | "colab_type": "text", 368 | "id": "nJZuZwn5bTTR" 369 | }, 370 | "source": [ 371 | "# Optional Research Assignments:" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": { 377 | "colab_type": "text", 378 | "id": "fiQecpiKbTTS" 379 | }, 380 | "source": [ 381 | "This assignment is optional. It will give you up to 2 additional points for one of the assignments. \n", 382 | "#### 1. VAE: \n", 383 | "Use a normalizing flow to get a more expressive $q(z\\,|\\,x)$, https://arxiv.org/abs/1505.05770. 
TL;DR: use the encoder to predict $z_0 \\sim q_0(z_0|x)$, then use NF to obtain $z_k = f_k \\circ \\dots \\circ f_1(z_0)$, a sample from a more flexible distribution. Compare it with a conventional normal distribution: which works better? Compare it with a conventional VAE that uses the same number of parameters as the VAE with NF.\n", 384 | "\n", 385 | "#### 2. Expressiveness: \n", 386 | "- Train the flow on another 2d-dataset, e.g., a mixture of 6 Gaussians (see figure 10, https://openreview.net/pdf?id=Hkg313AcFX). Is it possible to fit it with a normalizing flow? What is more beneficial: increasing the size of s and t or increasing the depth? Provide a justification for the answer, e.g. plots.\n", 387 | "- Use the flow to sample images from a more complex dataset, e.g. downsampled MNIST (8x8 should be fine). Do your findings remain the same? Also, provide plots.\n" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 0, 393 | "metadata": { 394 | "colab": {}, 395 | "colab_type": "code", 396 | "id": "S2y9MeyJbTTV" 397 | }, 398 | "outputs": [], 399 | "source": [] 400 | } 401 | ], 402 | "metadata": { 403 | "colab": { 404 | "collapsed_sections": [], 405 | "name": "nf-assignment.ipynb", 406 | "provenance": [], 407 | "version": "0.3.2" 408 | }, 409 | "kernelspec": { 410 | "display_name": "Python 3", 411 | "language": "python", 412 | "name": "python3" 413 | }, 414 | "language_info": { 415 | "codemirror_mode": { 416 | "name": "ipython", 417 | "version": 3 418 | }, 419 | "file_extension": ".py", 420 | "mimetype": "text/x-python", 421 | "name": "python", 422 | "nbconvert_exporter": "python", 423 | "pygments_lexer": "ipython3", 424 | "version": "3.7.3" 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 1 429 | } 430 | -------------------------------------------------------------------------------- /seminars/day3/nf/readme.md: -------------------------------------------------------------------------------- 1 | # Normalizing flow seminars 2 | 3 | 
[Colab: 
seminar](https://colab.research.google.com/drive/1jRcSdInpHHOO8_jicw8DqZjpLO2bQZMQ) 4 | -------------------------------------------------------------------------------- /seminars/day4/AdaGram/AdaGram seminar.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction\n", 8 | "\n", 9 | "Welcome to the seminar! We hope you will find it useful and fun.\n", 10 | "Before we start, let's make sure you have everything ready.\n", 11 | "\n", 12 | "## Installing python\n", 13 | "\n", 14 | "If you are reading this inside a jupyter notebook, likely it's already installed.\n", 15 | "Besides python, you will need `jupyter`, `numpy` and `julia` libraries, you can install them using `pip install`.\n", 16 | "We have tested this notebook under python3, but python2 should work too.\n", 17 | "\n", 18 | "## Installing julia\n", 19 | "\n", 20 | "You will also need to install the julia interpreter. Go to https://julialang.org/downloads/ and get yourself a v1.1+ that suits your operating system.\n", 21 | "If you download julia as a binary, unpack it somewhere and add this to your `.bashrc`:\n", 22 | "```\n", 23 | "export PATH=\"/path/to/julia/bin:$PATH\"\n", 24 | "```\n", 25 | "You might need to reload the kernel after you update `PATH`.\n", 26 | "\n", 27 | "Type `julia` in the terminal; if you see a nice ascii art with a julia logo and the interpreter prompt, it worked well. Now you need to install the AdaGram package (lives here: https://bitbucket.org/sbos/adagram_deepbayes2019). 
In the julia interpreter type in the following command:\n", 28 | "```\n", 29 | "using Pkg\n", 30 | "Pkg.add(PackageSpec(url=\"https://bitbucket.org/sbos/adagram_deepbayes2019.git\"))\n", 31 | "```\n", 32 | "\n", 33 | "## Download an AdaGram model\n", 34 | "\n", 35 | "If you have wget installed, you can simply run the cell below." 
36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "/bin/sh: wget: command not found\r\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "!wget https://w2v.s3.amazonaws.com/huang_super_200D_0.1_min20_hs_t1e-17.model" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Otherwise, download this file manually to the same dir where this notebook is located (or remember the path and modify it below accordingly)\n", 60 | "\n", 61 | "Also, `git clone https://bitbucket.org/sbos/adagram_deepbayes2019.git` somewhere on your laptop. Further we will refer to this directory as `./adagram_deepbayes2019`.\n", 62 | "\n", 63 | "## Check that everything works" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from julia.api import Julia\n", 73 | "# if this cell fails, uncomment the line below\n", 74 | "# jl = Julia(compiled_modules=False)\n", 75 | "import julia\n", 76 | "julia.install()\n", 77 | "from julia import AdaGram" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "vm, vocab = AdaGram.load_model(\"huang_super_200D_0.1_min20_hs_t1e-17.model\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### Prior probabilities of senses of the word \"apple\"" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "array([5.07910968e-01, 2.38101072e-01, 2.53984509e-01, 3.13751697e-06,\n", 105 | " 2.85204898e-07, 2.59277179e-08, 2.35706526e-09, 2.14278660e-10,\n", 106 | " 1.94798782e-11, 1.77089802e-12, 1.60990729e-13, 1.46355208e-14,\n", 107 | " 1.33050189e-15, 1.20954717e-16, 
1.09958834e-17, 9.99625763e-19,\n", 108 | " 9.08750694e-20, 8.26136994e-21, 7.51033631e-22, 6.82757847e-23,\n", 109 | " 6.20688951e-24, 5.64262683e-25, 5.12966076e-26, 4.66332796e-27,\n", 110 | " 4.23938905e-28, 3.85399005e-29, 3.50362732e-30, 3.18511574e-31,\n", 111 | " 2.89555977e-32, 2.89555977e-33])" 112 | ] 113 | }, 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "AdaGram.expected_pi(vm, vocab.word2id[\"apple\"])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Nearest neighbours of the first sense" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "[('macintosh', 1, 0.823969841003418),\n", 139 | " ('computers', 1, 0.7634434103965759),\n", 140 | " ('ibm', 1, 0.7630185484886169),\n", 141 | " ('intel-based', 1, 0.7424857020378113),\n", 142 | " ('iigs', 1, 0.7415772676467896),\n", 143 | " ('pc', 1, 0.7379290461540222),\n", 144 | " ('ms-dos', 1, 0.7352752685546875),\n", 145 | " ('kaypro', 1, 0.7326763272285461),\n", 146 | " ('powerpc-based', 1, 0.7302876114845276),\n", 147 | " ('dos', 1, 0.7282707095146179)]" 148 | ] 149 | }, 150 | "execution_count": 9, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "AdaGram.nearest_neighbors(vm, vocab, \"apple\", 1, min_count=2.)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "### Nearest neighbours of the second sense" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 11, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "[('pomegranate', 1, 0.8346080183982849),\n", 175 | " ('almond', 1, 0.8157211542129517),\n", 176 | " ('apricot', 1, 0.8051078915596008),\n", 177 | " ('plum', 1, 
0.7945712804794312),\n", 178 | " ('peach', 1, 0.7862921357154846),\n", 179 | " ('cherry', 1, 0.7756718993186951),\n", 180 | " ('tamarind', 1, 0.7648524641990662),\n", 181 | " ('pear', 1, 0.7564710378646851),\n", 182 | " ('lemon', 1, 0.7523407936096191),\n", 183 | " ('blueberry', 1, 0.7521582245826721)]" 184 | ] 185 | }, 186 | "execution_count": 11, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "AdaGram.nearest_neighbors(vm, vocab, \"apple\", 2, min_count=2.)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "### Disambiguation" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 12, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "array([4.56448855e-05, 9.81135272e-01, 1.88190832e-02, 0.00000000e+00,\n", 211 | " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n", 212 | " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n", 213 | " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n", 214 | " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n", 215 | " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n", 216 | " 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,\n", 217 | " 0.00000000e+00, 0.00000000e+00])" 218 | ] 219 | }, 220 | "execution_count": 12, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "AdaGram.disambiguate(vm, vocab, \"apple\", \"fresh tasty breakfast\".split(' '))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "If that worked well, we are ready to start!" 
234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "# Theory\n", 241 | "\n", 242 | "If you need hints, you can find them here https://docs.google.com/presentation/d/1NpTfMQ3UklgStRvhv8l7i-Pp84UpP_8l1kH1p4JOt1U/edit?usp=sharing\n", 243 | "\n", 244 | "## How many clusters should one expect in a Dirichlet process mixture model?\n", 245 | "\n", 246 | "Remember from the lecture, that a measure $G \\sim DP(\\alpha, H)$ has a countable number of distinct values of $\\phi \\sim G$. This allows us to use Dirichlet processes as priors over assignments, e.g. cluster assignments in a mixture model.\n", 247 | "\n", 248 | "Formally, if $\\phi_1, \\ldots, \\phi_n \\sim G$ one can write the following predictive distribution over $\\phi_{n+1}$:\n", 249 | "\n", 250 | "$$\n", 251 | " \\phi_{n+1} | \\phi_{1}, \\ldots, \\phi_{n} \\sim \\frac{1}{\\alpha + n} ( \\alpha H + \\sum_{i=1}^n \\delta(\\phi - \\phi_i)).\n", 252 | "$$\n", 253 | "\n", 254 | "From this formula, it is clear that some $\\phi$s will be equal. Defining $K$ as the number of distinct $\\phi$s and $n_k$ as the number of $\\phi$s equal to the $k$-th value $\\phi_k$, we can rewrite this equation:\n", 255 | "\n", 256 | "$$\n", 257 | " \\phi_{n+1} | \\phi_{1}, \\ldots, \\phi_{n} \\sim \\frac{1}{\\alpha + n} ( \\alpha H + \\sum_{k=1}^K n_k \\delta(\\phi - \\phi_k)).\n", 258 | "$$\n", 259 | "\n", 260 | "This gives rise to the famous Chinese Restaurant Process, where each $\\phi_k$ is a parameter associated with the $k$-th table. The $n+1$-th customer is choosing between the existing $K$ tables, each of which can be chosen with probability $\\propto n_k$ and creating a new one with probability $\\propto \\alpha$.\n", 261 | "If the $n+1$-th customer chooses $\\phi_k$, then $n_k$ increases by 1, otherwise $\\phi_{K+1}$ gets sampled from the base measure $H$ and we set $n_{K+1} = 1$. 
This procedure then repeats.\n", 262 | "\n", 263 | "Now, when we know this, let's try to think how many clusters one should get asymptotically as $n \\rightarrow \\infty$ in expectation? " 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "**Solution**: \n", 271 | "\n", 272 | "We can express the number of tables $K$ as the number of times a new table has been created:\n", 273 | "$$\n", 274 | " K = \\sum_{i=1}^n \\mathbb{1}[ \\text{table created at } i].\n", 275 | "$$\n", 276 | "Then recall that expectation of an indicator is the probability of it to be true, hence\n", 277 | "$$\n", 278 | " \\mathbb{E} K = \\sum_{i=1}^n \\frac{\\alpha}{i - 1 + \\alpha}.\n", 279 | "$$\n", 280 | "This formula can be recognized as harmonic series, see https://en.wikipedia.org/wiki/Harmonic_number. \n", 281 | "We can use this fact and after some calculations we get\n", 282 | "$\\mathbb{E} K = \\alpha \\log(n) + \\text{const}$ when $n \\rightarrow \\infty$" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "## Variational approximation for the truncated stick-breaking process\n", 290 | "\n", 291 | "Consider the standard stick-breaking construction of the Dirichlet process.\n", 292 | "1. Stick-breaking proportions are sampled. $\\beta_k \\sim \\text{Beta}(1, \\alpha), \\quad k=1,\\ldots,\\infty$.\n", 293 | "2. Parameters are sampled. $\\phi_k \\sim H, \\quad k=1,\\ldots,\\infty$. \n", 294 | "3. Objects are assigned to clusters according to the stick lengths $\\pi_k$. $p(z_i = k) = \\pi_k, \\quad \\pi_k = \\beta_k \\prod_{t=1}^{k-1} (1 - \\beta_t)$\n", 295 | "4. 
$x_i | z_i \\sim p(\\cdot | \\phi_{z_i})$\n", 296 | "\n", 297 | "After we observe data $\\mathbf{x} = \\{ x_1, x_2, \\ldots, x_n \\}$ we want to infer the posterior over DP parameters, in this case, $\\mathbf{\\beta}$ and $\\boldsymbol{\\phi}$ as well as the cluster assignments $\\mathbf{z}$.\n", 298 | "As is often the case at this summer school, we wish to do that using variational inference. \n", 299 | "\n", 300 | "We choose a **finite**, fully-factorized family of variational approximations:\n", 301 | "$$\n", 302 | " q(\\mathbf{\\beta}, \\mathbf{z}, \\boldsymbol{\\phi}) = \\prod_{k=1}^\\infty \\left[ q(\\beta_k) q(\\phi_k) \\right] \\prod_{i=1}^n q(z_i) \\approx p(\\mathbf{\\beta}, \\mathbf{z}, \\boldsymbol{\\phi} | \\mathbf{x}).\n", 303 | "$$\n", 304 | "In this approximation, only $K$ clusters are modelled. We choose $q(\\beta_K = 1) = 1$ and $q(\\beta_k) = p(\\beta_k), q(\\phi_k) = p_H(\\phi_k)$ for $k > K$ which automatically assigns zero probability mass to all succeeding clusters.\n", 305 | "\n", 306 | "Assume you are given $q(z_i)$ for each object $x_i$. Derive a variational update for $q^*(\\boldsymbol{\\beta}) = \\arg\\min_{q(\\boldsymbol{\\beta})} \\text{KL}( q(\\mathbf{\\beta}, \\mathbf{z}, \\boldsymbol{\\phi}) || p(\\mathbf{\\beta}, \\mathbf{z}, \\boldsymbol{\\phi} | \\mathbf{x}))$." 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "**Solution**: The full solution can be found in the paper http://cs.columbia.edu/~blei/papers/BleiJordan2004.pdf, equation 18. In the paper's notation $\\beta_k$ is $V_k$." 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "**Bonus question**: look at the resulting parametrization. Does it look anyhow redundant? Could you use fewer parameters to fully describe $q^*(\\boldsymbol{\\beta})$?" 
321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "# Practice\n", 328 | "\n", 329 | "Remember you have just computed the expected number of clusters in a DP?\n", 330 | "Let's see if the AdaGram model follows this analysis.\n", 331 | "\n", 332 | "Plot the number of senses found for each word (the function `AdaGram.expected_pi` used a few cells above is helpful). Choose a reasonable threshold probability to prune out unused samples. " 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 24, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "Total words: 448927\n", 345 | "28895\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "# The set of all words known to the model\n", 351 | "print(\"Total words: \", len(vocab.word2id.keys()))\n", 352 | "\n", 353 | "# Word frequencies\n", 354 | "print(vm.frequencies[vocab.word2id[\"apple\"]])" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "Now let's train some models! Unfortunately, many interesting properties of AdaGram can only be assessed when training on a relatively large corpus. This may be difficult during a time- and compute-limited practical session, so instead we will train models on synthetic data.\n", 362 | "\n", 363 | "In our example, the word **a** can be encountered in two different contexts, one described by words **b** and **c** and the other one by **f** and **g**. So a good model should be able to discover these two \"senses\" of **a**." 
364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 30, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "# generating train data\n", 373 | "for _ in range(100):\n", 374 | " !echo \"b c b c b c a b c b c b b g f g a g g f g a a g f g\" >> synth_train.txt\n", 375 | "\n", 376 | "# generating test data\n", 377 | "for _ in range(100):\n", 378 | " !echo \"a c b b b a b c c f g g a g f g g a b c a b b c a c b f g f g a g f a g\" >> synth_test.txt \n", 379 | " \n", 380 | "# you are more than welcome to generate something more interesting" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 31, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | "Cloning into 'adagram_deepbayes2019'...\n", 393 | "remote: Counting objects: 509, done.\u001b[K\n", 394 | "remote: Compressing objects: 100% (167/167), done.\u001b[K\n", 395 | "remote: Total 509 (delta 333), reused 509 (delta 333)\u001b[KB/s \n", 396 | "Receiving objects: 100% (509/509), 10.13 MiB | 148.00 KiB/s, done.\n", 397 | "Resolving deltas: 100% (333/333), done.\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "!git clone https://bitbucket.org/sbos/adagram_deepbayes2019.git" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "We will now prepare a dictionary file, this is needed only once" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 33, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "!./adagram_deepbayes2019/utils/dictionary.sh ./synth_train.txt ./synth.vocab" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "To train a model run this command:" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 34, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 
436 | "text": [ 437 | "Building dictionary... Done!\n", 438 | " From worker 2:\t64000 words read, 3412/5200\n", 439 | " From worker 2:\t3.85% -1.3335 0.0240 0.0240 2.98/3.00 24.45 kwords/sec\n", 440 | " From worker 2:\t7.69% -1.3210 0.0231 0.0231 2.99/3.00 40.15 kwords/sec\n", 441 | " From worker 2:\t11.54% -1.3065 0.0221 0.0221 2.99/3.00 38.05 kwords/sec\n", 442 | " From worker 2:\t15.38% -1.2951 0.0212 0.0212 3.00/3.00 39.41 kwords/sec\n", 443 | " From worker 2:\t19.23% -1.2881 0.0202 0.0202 3.00/3.00 39.07 kwords/sec\n", 444 | " From worker 2:\t23.08% -1.2833 0.0192 0.0192 3.00/3.00 38.95 kwords/sec\n", 445 | " From worker 2:\t64000 words read, 1624/5200\n", 446 | " From worker 2:\t28.46% -1.2785 0.0179 0.0179 3.00/3.00 39.66 kwords/sec\n", 447 | " From worker 2:\t32.31% -1.2758 0.0169 0.0169 3.00/3.00 37.32 kwords/sec\n", 448 | " From worker 2:\t36.15% -1.2737 0.0160 0.0160 3.00/3.00 36.16 kwords/sec\n", 449 | " From worker 2:\t40.00% -1.2718 0.0150 0.0150 3.00/3.00 37.71 kwords/sec\n", 450 | " From worker 2:\t43.85% -1.2702 0.0140 0.0140 3.00/3.00 37.34 kwords/sec\n", 451 | " From worker 2:\t47.69% -1.2687 0.0131 0.0131 3.00/3.00 38.01 kwords/sec\n", 452 | " From worker 2:\t64000 words read, 5034/5200\n", 453 | " From worker 2:\t53.08% -1.2669 0.0117 0.0117 3.00/3.00 40.44 kwords/sec\n", 454 | " From worker 2:\t56.92% -1.2657 0.0108 0.0108 3.00/3.00 39.97 kwords/sec\n", 455 | " From worker 2:\t60.77% -1.2645 0.0098 0.0098 3.00/3.00 41.05 kwords/sec\n", 456 | " From worker 2:\t64.62% -1.2634 0.0088 0.0088 3.00/3.00 40.50 kwords/sec\n", 457 | " From worker 2:\t68.46% -1.2624 0.0079 0.0079 3.00/3.00 37.86 kwords/sec\n", 458 | " From worker 2:\t72.31% -1.2614 0.0069 0.0069 3.00/3.00 38.10 kwords/sec\n", 459 | " From worker 2:\t64000 words read, 3246/5200\n", 460 | " From worker 2:\t77.69% -1.2601 0.0056 0.0056 3.00/3.00 37.10 kwords/sec\n", 461 | " From worker 2:\t81.54% -1.2592 0.0046 0.0046 3.00/3.00 38.69 kwords/sec\n", 462 | " From worker 2:\t85.38% -1.2583 
0.0037 0.0037 3.00/3.00 38.74 kwords/sec\n", 463 | " From worker 2:\t89.23% -1.2575 0.0027 0.0027 3.00/3.00 39.29 kwords/sec\n", 464 | " From worker 2:\t93.08% -1.2566 0.0017 0.0017 3.00/3.00 38.24 kwords/sec\n", 465 | " From worker 2:\t96.92% -1.2558 0.0008 0.0008 3.00/3.00 38.65 kwords/sec\n", 466 | " From worker 2:\t64000 words read, 1454/5200\n", 467 | "Learning complete 260001 / 260000.0\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "!julia ./adagram_deepbayes2019/train.jl --window 3 --min-freq 1 --prototypes 3 \\\n", 473 | " --alpha 1. --epochs 100 synth_train.txt synth.vocab synth_prot3_alpha1.model " 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "Pay attention to the parameters `--prototypes` and `--alpha`. Here we allowed the DP to have up to 3 mixture components for each word and set $\\alpha = 1$.\n", 481 | "\n", 482 | "Now we can assess the test likelihood of our model." 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 36, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | " From worker 2:\t3400 words read, 7200/7200\n", 495 | " From worker 2:\t0 words read, 7200/7200\n", 496 | "-1.6320696059620439\n", 497 | "-1.6320696059620439\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "!julia ./adagram_deepbayes2019/likelihood.jl --window 3 synth_prot3_alpha1.model synth_test.txt" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "The last number printed is the test log-likelihood we are looking for. \n", 510 | "\n", 511 | "Now play with the parameters and see if you can extract two different senses of the word **a**. \n", 512 | "Is this model better than the standard skip-gram (`--prototypes 1`) in terms of the test likelihood?\n", 513 | "If you forgot how to use AdaGram, see examples in the introduction." 
514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [] 522 | } 523 | ], 524 | "metadata": { 525 | "kernelspec": { 526 | "display_name": "Python 3", 527 | "language": "python", 528 | "name": "python3" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.7.4" 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 2 545 | } 546 | -------------------------------------------------------------------------------- /seminars/day4/AdaGram/readme.md: -------------------------------------------------------------------------------- 1 | ### Seminar on Adaptive skip-gram model 2 | -------------------------------------------------------------------------------- /seminars/day4/gp/BayesOpt/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from matplotlib import pyplot as plt 5 | 6 | 7 | def plot_1D_function(func, X, axis_bounds=([0, 1], [-6.5, 16.5])): 8 | """ Plot one dimensional function at given points 9 | 10 | Parameters 11 | ---------- 12 | func : callable 13 | 1D function to plot. It should take torch.tensor as input and 14 | return torch.tensor 15 | 16 | X : torch.tensor, shape=(n_samples, ) 17 | Training inputs 18 | 19 | axis_bounds : list 20 | list of length 4 that defines axis bounds. 
Default values correspond to 21 | Forrester function 22 | """ 23 | plt.figure(figsize=(8, 7)) 24 | x_grid = np.linspace(axis_bounds[0][0], axis_bounds[0][1], 300) 25 | plt.plot(x_grid, func(torch.from_numpy(x_grid)).numpy(), 26 | label='Forrester function') 27 | plt.scatter(X.cpu().numpy(), func(X).cpu().numpy(), s=50, 28 | label='Initial sample') 29 | 30 | plt.xlabel('x', fontsize=22) 31 | plt.ylabel('f(x)', fontsize=22) 32 | plt.legend(fontsize=18, loc='upper left') 33 | plt.xlim(axis_bounds[0]) 34 | plt.ylim(axis_bounds[1]) 35 | 36 | 37 | def plot_acquisition(acquisition, X, y, X_candidate): 38 | """ 39 | Parameters 40 | ---------- 41 | acquisition : botorch.acquisition.Acquisition 42 | 43 | X : torch.tensor, shape=(batch, 1, dim) 44 | Current design inputs 45 | 46 | y : .torch.tensor, shape=(n_samples, 1) 47 | Current design targets 48 | 49 | X_candidate : torch.tensor, shape=(n_candidates, 1) 50 | New candidate points 51 | """ 52 | 53 | x_grid = torch.linspace(0, 1, 200).reshape(-1, 1, 1).to(X) 54 | with torch.no_grad(): 55 | acqu = acquisition(x_grid).cpu().numpy() 56 | posterior = acquisition.model.posterior(x_grid) 57 | 58 | y_mean = posterior.mean.cpu().numpy().ravel() 59 | y_std = torch.sqrt(posterior.variance).numpy().ravel() 60 | lower = y_mean - 1.96 * y_std 61 | upper = y_mean + 1.96 * y_std 62 | 63 | if max(-acqu - min(-acqu)) > 0: 64 | acqu_normalized = (-acqu - min(-acqu)) / (max(-acqu - min(-acqu))) 65 | else: 66 | acqu_normalized = (-acqu - min(-acqu)) 67 | 68 | factor = max(upper) - min(lower) 69 | 70 | x_grid = x_grid.cpu().numpy().ravel() 71 | 72 | plt.plot(X.cpu().numpy(), y.cpu().numpy(), '.r', markersize=10) 73 | 74 | plt.plot(x_grid, 0.2 * factor * acqu_normalized 75 | - abs(min(lower)) - 0.25 * factor, 76 | '-r', lw=2, label='Acquisition') 77 | 78 | plt.plot(x_grid, y_mean, '-k', lw=1, alpha=0.6) 79 | plt.plot(x_grid, upper, '-k', alpha=0.2) 80 | plt.plot(x_grid, lower, '-k', alpha=0.2) 81 | 82 | color = 
plt.rcParams['axes.prop_cycle'].by_key()['color'][0] 83 | plt.fill_between(x_grid, lower.ravel(), upper.ravel(), color=color, 84 | alpha=0.1) 85 | 86 | plt.ylim(min(lower) - 0.25 * factor, 87 | max(upper) + 0.05 * factor) 88 | plt.axvline(x=X_candidate.cpu().numpy(), color='r') 89 | plt.xlabel('x', fontsize=14) 90 | plt.ylabel('f(x)', fontsize=14) 91 | plt.legend() 92 | plt.show() 93 | 94 | 95 | def plot_convergence(X, y, maximize=False): 96 | """ 97 | Plot convergence history: distance between consecutive x's and value of 98 | the best selected sample 99 | 100 | Parameters 101 | ---------- 102 | X : torch.tensor, shape=(n_samples, dim) 103 | History of evaluated input values 104 | 105 | y : torch.tensor, shape=(n_samples,) 106 | History of evaluated objective values 107 | 108 | Returns 109 | ------- 110 | 111 | """ 112 | fig, axes = plt.subplots(1, 2, figsize=(9, 4.5)) 113 | 114 | dist = torch.norm(X[1:] - X[:-1], dim=-1).cpu().numpy() 115 | if maximize: 116 | cum_best = np.maximum.accumulate(y.cpu().numpy()) 117 | else: 118 | cum_best = np.minimum.accumulate(y.cpu().numpy()) 119 | 120 | axes[0].plot(dist, '.-', c='r',) 121 | axes[0].set_xlabel('Iteration', fontsize=14) 122 | axes[0].set_ylabel(r"$d(x_i - x_{i - 1})$", fontsize=14) 123 | axes[0].set_title("Distance between consecutive x's", fontsize=14) 124 | axes[0].grid(True) 125 | 126 | 127 | axes[1].plot(cum_best, '.-') 128 | axes[1].set_xlabel('Iteration', fontsize=14) 129 | axes[1].set_ylabel('Best y', fontsize=14) 130 | axes[1].set_title('Value of the best selected sample', fontsize=14) 131 | axes[1].grid(True) 132 | 133 | fig.tight_layout() 134 | -------------------------------------------------------------------------------- /seminars/day4/gp/GP/EI_vs_logEI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day4/gp/GP/EI_vs_logEI.png 
-------------------------------------------------------------------------------- /seminars/day4/gp/GP/airline.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day4/gp/GP/airline.npz -------------------------------------------------------------------------------- /seminars/day4/gp/GP/airline_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day4/gp/GP/airline_result.png -------------------------------------------------------------------------------- /seminars/day4/gp/GP/gp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bayesgroup/deepbayes-2019/48114e19c926827df95662afbb2d27050344fbba/seminars/day4/gp/GP/gp.png -------------------------------------------------------------------------------- /seminars/day4/gp/GP/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def plot_model(model, xlim=None, scaler_x=None, scaler_y=None): 7 | """ 8 | Plot 1D GP model 9 | 10 | Parameters 11 | ---------- 12 | model : gpytorch.models.GP 13 | 14 | xlim : tuple(float, float) or None 15 | 16 | scaler_x : sklearn.preprocessing.StandardScaler 17 | 18 | scaler_y : sklearn.preprocessing.StandardScaler 19 | 20 | Returns 21 | ------- 22 | 23 | """ 24 | X = model.train_inputs[0].cpu().numpy() 25 | y = model.train_targets.cpu().numpy() 26 | 27 | if xlim is None: 28 | xmin = float(X.min()) 29 | xmax = float(X.max()) 30 | x_range = xmax - xmin 31 | xlim = [xmin - 0.05 * x_range, 32 | xmax + 0.05 * x_range] 33 | 34 | model_tensor_example = list(model.parameters())[0] 35 | 36 | x = torch.linspace(xlim[0], xlim[1], 
200).to(model_tensor_example) 37 | if scaler_x is not None: 38 | x = torch.tensor(scaler_x.transform(x.reshape(-1, 1))).squeeze() 39 | 40 | model.eval() 41 | predictive_distribution = model.predict(x) 42 | 43 | lower, upper = predictive_distribution.confidence_region() 44 | prediction = predictive_distribution.mean.cpu().numpy() 45 | 46 | if scaler_x is not None: 47 | X = scaler_x.inverse_transform(X) 48 | x = scaler_x.inverse_transform(x) 49 | else: 50 | x = x.numpy() 51 | 52 | if scaler_y is not None: 53 | y = scaler_y.inverse_transform(y.reshape(-1, 1)).ravel() 54 | lower = scaler_y.inverse_transform(lower) 55 | upper = scaler_y.inverse_transform(upper) 56 | prediction = scaler_y.inverse_transform(prediction) 57 | 58 | plt.scatter(X, y, marker='x', c='k') 59 | plt.plot(x, prediction) 60 | plt.fill_between(x, lower, upper, alpha=0.1) 61 | plt.xlabel('x', fontsize=14) 62 | plt.ylabel('y', fontsize=14) 63 | -------------------------------------------------------------------------------- /seminars/day4/gp/readme.md: -------------------------------------------------------------------------------- 1 | # Seminar on Gaussian processes 2 | 3 | * [Google Colab: GP_assignment](https://colab.research.google.com/github/yeahrmek/BayesOpt_tutorial/blob/master/GP/gp_practice.ipynb 4 | ) 5 | * [Google Colab: GP_solution](https://colab.research.google.com/github/yeahrmek/BayesOpt_tutorial/blob/master/GP/gp_solution.ipynb 6 | ) 7 | * [Google Colab: BayesOpt_assignment](https://colab.research.google.com/github/yeahrmek/BayesOpt_tutorial/blob/master/BayesOpt/bayesopt_practice.ipynb) 8 | * [Google Colab: BayesOpt_solution](https://colab.research.google.com/github/yeahrmek/BayesOpt_tutorial/blob/master/BayesOpt/bayesopt_solution.ipynb) 9 | -------------------------------------------------------------------------------- /seminars/day5/readme.md: -------------------------------------------------------------------------------- 1 | [Link to Google 
Colab](https://colab.research.google.com/drive/1D-Qwxkh4YPAEOlSbjlKtRsFlaKnHsnpi) 2 | -------------------------------------------------------------------------------- /seminars/day6/SparseVD-assignment-colab.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"colab":{"name":"SparseVD-assignment-colab.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"markdown","metadata":{"id":"ghreHi4mIxNL","colab_type":"text"},"source":["# Variational Dropout Sparsifies Deep Neural Networks\n","\n","Variational Dropout ([arXiv:1506.02557](https://arxiv.org/abs/1506.02557)) provides a Bayesian interpretation of the conventional dropout procedure. Later it was shown that Variational Dropout can be used for model sparsification (Sparse VD), and the effect can be achieved via optimization of the variational lower bound with respect to individual dropout rates for every weight of the model ([arXiv:1701.05369](https://arxiv.org/abs/1701.05369)).\n","\n","#### Sparse VD\n","\n","Sparse VD model optimizes VLB $\\mathcal{L}(\\phi)$ with respect to parameters $\\phi$ of a variational approximation $q_\\phi(w)$:\n","\n","#$$\\mathcal{L}(\\phi) = L_\\mathcal{D}(\\phi) - D_{KL}(q_\\phi(w)\\,\\|\\,p(w)) \\to\\max_{\\phi\\in\\Phi}$$\n","#$$L_\\mathcal{D}(\\phi) = \\sum_{n=1}^N \\mathrm{E}_{q_\\phi(w)}[\\log p(y_n\\,|\\,x_n, w)],$$\n","\n","where $p(w)$ is the log-uniform prior distribution, the variational approximation $q_\\phi(w)$ is a fully factorized Gaussian, the likelihood $p(y\\,|\\,x, w)$ is defined by a neural network with parameters $w$. 
The optimization is performed by stochastic optimization methods, e.g., Adam.\n","\n","For computational convenience, the KL divergence is approximated as follows:\n","#$$-D_{KL}(q(w_{ij}\\,|\\,\\theta_{ij}, \\alpha_{ij})\\,\\|\\,p(w_{ij})) \\approx$$\n","#$$ \\approx k_1\\sigma(k_2 + k_3\\log \\alpha_{ij}) - 0.5\\log(1+\\alpha_{ij}^{-1}) + \\mathrm{C}$$\n","#$$ k_1=0.63576 \\quad k_2=1.87320 \\quad k_3=1.48695$$\n","\n","\n","**Note:** In the paper two parametrizations of q are used. The first one is $\\phi_{ij}=\\{\\mu_{ij}, \\sigma_{ij}\\}$ that means $w_{ij} \\sim N(w_{ij} | \\mu_{ij}, \\sigma^2_{ij})$ and the second one is $\\phi_{ij}=\\{\\mu_{ij}, \\alpha_{ij}\\}$ that means $w_{ij} \\sim N(w_{ij} | \\mu_{ij}, \\alpha_{ij}\\mu^2_{ij})$. These two parametrizations are connected as $\\sigma^2_{ij} = \\alpha_{ij}\\mu^2_{ij}$. Do not be confused.\n","\n","![alt text](https://raw.githubusercontent.com/senya-ashukha/senya-ashukha.github.io/master/images/svd3.png)\n","\n","# In this assignment:\n","1. Implementation of fully-connected Sparse VD layer\n","2. Training Lenet-300-100 on MNIST dataset\n","3. Optional Research Assignment\n","\n","Additional information:\n","- If you have a problem with importing the logger, download local_logger.py, put the file in the same folder as the notebook and run the notebook from there\n","- You will need the following python packages: pytorch, numpy, sklearn, pylab (matplotlib), tabulate\n","- If you have an urgent question or find a typo or a mistake, send it to ars.ashuha@gmail.com. 
The title should include \"BDL Assignment 3, 2019\""]},{"cell_type":"code","metadata":{"id":"GeSEBQr7I30V","colab_type":"code","outputId":"9962a90d-671d-4b9f-e180-304101521474","executionInfo":{"status":"ok","timestamp":1565987699241,"user_tz":-180,"elapsed":12840,"user":{"displayName":"Arsenii Ashukha","photoUrl":"https://lh3.googleusercontent.com/-Y7_AyoCFcVE/AAAAAAAAAAI/AAAAAAAAuJo/jzY9NgqhShs/s64/photo.jpg","userId":"07795188509773231496"}},"colab":{"base_uri":"https://localhost:8080/","height":326}},"source":["!pip3 install tabulate\n","!pip3 install torch==1.1.0 torchvision==0.3.0 -f https://download.pytorch.org/whl/torch_stable.html\n","!wget https://raw.githubusercontent.com/senya-ashukha/senya-ashukha.github.io/master/assignments/local_logger.py "],"execution_count":0,"outputs":[{"output_type":"stream","text":["Requirement already satisfied: tabulate in /usr/local/lib/python3.6/dist-packages (0.8.3)\n","Looking in links: https://download.pytorch.org/whl/torch_stable.html\n","Requirement already satisfied: torch==1.1.0 in /usr/local/lib/python3.6/dist-packages (1.1.0)\n","Requirement already satisfied: torchvision==0.3.0 in /usr/local/lib/python3.6/dist-packages (0.3.0)\n","Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch==1.1.0) (1.16.4)\n","Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision==0.3.0) (4.3.0)\n","Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from torchvision==0.3.0) (1.12.0)\n","Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow>=4.1.1->torchvision==0.3.0) (0.46)\n","--2019-08-16 20:34:57-- https://raw.githubusercontent.com/senya-ashukha/senya-ashukha.github.io/master/assignments/local_logger.py\n","Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 
151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n","Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n","HTTP request sent, awaiting response... 200 OK\n","Length: 2423 (2.4K) [text/plain]\n","Saving to: ‘local_logger.py’\n","\n","local_logger.py 100%[===================>] 2.37K --.-KB/s in 0s \n","\n","2019-08-16 20:34:57 (47.5 MB/s) - ‘local_logger.py’ saved [2423/2423]\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"Y1KwSUNUIxNM","colab_type":"code","colab":{}},"source":["import math\n","import time\n","\n","import numpy as np\n","import torch\n","import torch.nn as nn\n","import torch.nn.functional as F\n","import torch.optim as optim\n","from torch.autograd import Variable\n","from torch.nn import Parameter\n","from torchvision import datasets, transforms\n","\n","from local_logger import Logger"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"YZgS1405IxNP","colab_type":"text"},"source":["## Implementation of Sparse VD layer"]},{"cell_type":"code","metadata":{"id":"stfYmmw2IxNQ","colab_type":"code","colab":{}},"source":["class LinearSVDO(nn.Module):\n"," def __init__(self, in_features, out_features, threshold, bias=True):\n"," super(LinearSVDO, self).__init__()\n"," \"\"\"\n"," in_features: int, a number of input features\n"," out_features: int, a number of neurons\n"," threshold: float, a threshold for clipping weights\n"," \"\"\"\n"," \n"," self.in_features = in_features\n"," self.out_features = out_features\n"," self.threshold = threshold\n","\n"," self.mu = # torch.nn.parameter.Parameter of size out_features x in_features\n"," self.log_sigma = # torch.nn.parameter.Parameter of size out_features x in_features\n"," self.bias = # torch.nn.parameter.Parameter of size 1 x out_features\n"," self.reset_parameters()\n"," \n"," def reset_parameters(self):\n"," self.bias.data.zero_()\n"," self.mu.data.normal_(0, 0.02)\n"," self.log_sigma.data.fill_(-5) \n"," \n"," 
def forward(self, x): \n"," # x is a torch.Tensor of shape (number_of_objects, in_features)\n"," # log_alpha is a torch.Tensor of shape (out_features, in_features)\n"," self.log_alpha = # Compute using self.log_sigma and self.mu\n"," # clipping for a numerical stability\n"," self.log_alpha = torch.clamp(self.log_alpha, -10, 10) \n"," \n"," if self.training:\n"," # LRT = local reparametrization trick\n"," # lrt_mean is a torch.Tensor of shape (x.shape[0], out_features)\n"," lrt_mean = # compute mean activation using LRT\n"," # lrt_std is a torch.Tensor of shape (x.shape[0], out_features)\n"," lrt_std = # compute std of activations unsig lrt, \n"," # do not forget use torch.sqrt(x + 1e-8) instead of torch.sqrt(x)\n"," # eps is a torch.Tensor of shape (x.shape[0], out_features)\n"," eps = # sample of noise for reparametrization\n"," return # sample of activation\n"," \n"," out = # compute the output of the layer\n"," # use weights W = E q = self.mu\n"," # clip all weight with log_alpha > threshold\n"," return out\n"," \n"," def kl_reg(self):\n"," k1, k2, k3 = torch.Tensor([0.63576]).cuda(), torch.Tensor([1.8732]).cuda(), torch.Tensor([1.48695]).cuda()\n"," # kl is a scalar torch.Tensor \n"," kl = # eval KL using the approximation\n"," return kl"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1eGNAiqIIxNS","colab_type":"text"},"source":["## Define LeNet-300-100"]},{"cell_type":"code","metadata":{"id":"F3GHJdsBIxNT","colab_type":"code","colab":{}},"source":["class Net(nn.Module):\n"," def __init__(self, threshold):\n"," super(Net, self).__init__()\n"," self.fc1 = LinearSVDO(28*28, 300, threshold)\n"," self.fc2 = LinearSVDO(300, 100, threshold)\n"," self.fc3 = LinearSVDO(100, 10, threshold)\n"," self.threshold=threshold\n","\n"," def forward(self, x):\n"," x = F.relu(self.fc1(x))\n"," x = F.relu(self.fc2(x))\n"," x = F.log_softmax(self.fc3(x), dim=1)\n"," return 
x"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Jx_r09UuIxNV","colab_type":"text"},"source":["## Function for loading MNIST"]},{"cell_type":"code","metadata":{"id":"mETTLjwNIxNW","colab_type":"code","colab":{}},"source":["def get_mnist(batch_size):\n"," trsnform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])\n"," train_loader = torch.utils.data.DataLoader(\n"," datasets.MNIST('../data', train=True, download=True,\n"," transform=trsnform), batch_size=batch_size, shuffle=True)\n"," test_loader = torch.utils.data.DataLoader(\n"," datasets.MNIST('../data', train=False, download=True,\n"," transform=trsnform), batch_size=batch_size, shuffle=True)\n","\n"," return train_loader, test_loader"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"CO1hxF-FIxNY","colab_type":"text"},"source":["## Create SGVLB loss"]},{"cell_type":"code","metadata":{"id":"1f0TJpalIxNZ","colab_type":"code","colab":{}},"source":["class SGVLB(nn.Module):\n"," def __init__(self, net, train_size):\n"," super(SGVLB, self).__init__()\n"," self.train_size = train_size # int, the len of dataset\n"," self.net = net # nn.Module\n"," \n"," def forward(self, input, target, kl_weight=1.0):\n"," \"\"\"\n"," input: is a torch.Tensor (a predictions of the model) \n"," target: is a torch.Tensor (a tensor of labels) \n"," \"\"\"\n"," assert not target.requires_grad\n"," kl = 0.0\n"," for module in self.net.children():\n"," if hasattr(module, 'kl_reg'):\n"," kl = kl + module.kl_reg()\n"," \n"," sgvlb_loss = # a scalar torch.Tensor, SGVLB loss\n"," return sgvlb_loss"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UiORIFm1IxNc","colab_type":"text"},"source":["## Define the model"]},{"cell_type":"code","metadata":{"id":"dDzfpK-6IxNc","colab_type":"code","colab":{}},"source":["model = Net(threshold=3).cuda()\n","optimizer = # optimizer\n","scheduler = # decrease learning rate by 
torch.optim.lr_scheduler\n","\n","logger = Logger('sparse_vd', fmt={\n"," 'tr_loss': '3.1e',\n"," 'te_loss': '3.1e',\n"," 'sp_0': '.3f',\n"," 'sp_1': '.3f',\n"," 'sp_2': '.3f',\n"," 'lr': '3.1e',\n"," 'kl': '.2f',\n"," 'time': '.2f',\n","})\n","\n","train_loader, test_loader = get_mnist(batch_size=100)\n","sgvlb = SGVLB(model, len(train_loader.dataset)).cuda()"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"JDE0oUHqIxNf","colab_type":"text"},"source":["## Train the model"]},{"cell_type":"code","metadata":{"id":"1g4rNHeEIxNf","colab_type":"code","colab":{}},"source":["kl_weight = 0.02\n","epochs = 100\n","\n","for epoch in range(1, epochs + 1):\n"," time_start = time.perf_counter()\n"," scheduler.step()\n"," model.train()\n"," train_loss, train_acc = 0, 0 \n"," kl_weight = min(kl_weight+0.02, 1)\n"," logger.add_scalar(epoch, 'kl', kl_weight)\n"," logger.add_scalar(epoch, 'lr', scheduler.get_lr()[0])\n"," for batch_idx, (data, target) in enumerate(train_loader):\n"," data = data.cuda()\n"," target = target.cuda()\n"," \n"," data = data.view(-1, 28*28)\n"," optimizer.zero_grad()\n"," \n"," output = model(data)\n"," pred = output.data.max(1)[1] \n"," loss = sgvlb(output, target, kl_weight)\n"," loss.backward()\n"," optimizer.step()\n"," \n"," train_loss += float(loss) \n"," train_acc += np.sum(pred.cpu().numpy() == target.cpu().data.numpy())\n","\n"," logger.add_scalar(epoch, 'tr_loss', train_loss / len(train_loader.dataset))\n"," logger.add_scalar(epoch, 'tr_acc', train_acc / len(train_loader.dataset) * 100)\n"," \n"," model.eval()\n"," test_loss, test_acc = 0, 0\n"," for batch_idx, (data, target) in enumerate(test_loader):\n"," data = data.cuda()\n"," target = target.cuda()\n"," data = data.view(-1, 28*28)\n"," output = model(data)\n"," test_loss += float(sgvlb(output, target, kl_weight))\n"," pred = output.data.max(1)[1] \n"," test_acc += np.sum(pred.cpu().numpy() == target.cpu().data.numpy())\n"," \n"," logger.add_scalar(epoch, 
'te_loss', test_loss / len(test_loader.dataset))\n"," logger.add_scalar(epoch, 'te_acc', test_acc / len(test_loader.dataset) * 100)\n"," \n"," for i, c in enumerate(model.children()):\n"," if hasattr(c, 'kl_reg'):\n"," logger.add_scalar(epoch, 'sp_%s' % i, (c.log_alpha.cpu().data.numpy() > model.threshold).mean())\n"," \n"," logger.add_scalar(epoch, 'time', time.perf_counter() - time_start)\n"," logger.iter_info()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"TjjJvGG4IxNi","colab_type":"code","colab":{}},"source":["all_w, kep_w = 0, 0\n","\n","for c in model.children():\n"," kep_w += (c.log_alpha.cpu().data.numpy() < model.threshold).sum()\n"," all_w += c.log_alpha.cpu().data.numpy().size\n","\n","# compression_ratio should be > 30\n","compression_ratio = all_w/kep_w\n","print('compression_ratio =', compression_ratio)\n","assert compression_ratio > 30"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"rIXMZn3dIxNl","colab_type":"text"},"source":["## Disk space"]},{"cell_type":"code","metadata":{"id":"TJTUgfC8IxNm","colab_type":"code","colab":{}},"source":["import scipy\n","import numpy as np\n","from scipy.sparse import csc_matrix, csc_matrix, coo_matrix, dok_matrix\n","\n","row, col, data = [], [], []\n","M = list(model.children())[0].mu.cpu().data.numpy()\n","LA = list(model.children())[0].log_alpha.cpu().data.numpy()\n","\n","for i in range(300):\n"," for j in range(28*28):\n"," if LA[i, j] < 3:\n"," row += [i]\n"," col += [j]\n"," data += [M[i, j]]\n","\n","Mcsr = csc_matrix((data, (row, col)), shape=(300, 28*28))\n","Mcsc = csc_matrix((data, (row, col)), shape=(300, 28*28))\n","Mcoo = coo_matrix((data, (row, col)), shape=(300, 28*28))"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"jOaGnsLdIxNo","colab_type":"code","colab":{}},"source":["np.savez_compressed('M_w', M)\n","scipy.sparse.save_npz('Mcsr_w', Mcsr)\n","scipy.sparse.save_npz('Mcsc_w', 
Mcsc)\n","scipy.sparse.save_npz('Mcoo_w', Mcoo)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"xyLEbDEcIxNq","colab_type":"code","colab":{}},"source":["!ls -lah | grep .npz "],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"SzpK__NFIxNs","colab_type":"text"},"source":["## Visualization"]},{"cell_type":"code","metadata":{"id":"nMadPxZ3IxNt","colab_type":"code","colab":{}},"source":["import matplotlib.pyplot as plt\n","%matplotlib inline\n","import matplotlib as mpl\n","\n","from matplotlib import rcParams\n","rcParams['figure.figsize'] = 16, 4\n","rcParams['figure.dpi'] = 200\n","\n","\n","mask = (model.fc1.log_alpha.cpu().detach().numpy() < 3).astype(np.float)\n","mu = model.fc1.mu.cpu().detach().numpy()\n","\n","# Normalize color map\n","max_val = np.max(np.abs(mask * mu))\n","norm = mpl.colors.Normalize(vmin=-max_val,vmax=max_val)\n","plt.imshow(mask * mu, cmap='RdBu', interpolation=None, norm=norm)\n","plt.colorbar()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"gcQ0j99_IxNv","colab_type":"code","colab":{}},"source":["s = 0\n","z = np.zeros((28*15, 28*15))\n","\n","for i in range(15):\n"," for j in range(15):\n"," s += 1\n"," z[i*28:(i+1)*28, j*28:(j+1)*28] = np.abs((mask * mu)[s].reshape(28, 28))\n","\n","plt.figure(figsize=(8, 5))\n","plt.imshow(z, cmap='hot_r')\n","plt.colorbar()\n","plt.axis('off')"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"R02Ko0KXIxNx","colab_type":"text"},"source":["# Optional Research Assignment (up to 2 points)"]},{"cell_type":"markdown","metadata":{"id":"-OXfBwhZIxNy","colab_type":"text"},"source":["1. Study the model: \n"," - How sparsity and accuracy depend on maximum of KL-multiplier (kl_weight)?\n"," - How quality depends on the initialization of log_sigma (log_sigma)?\n"," - Study the KL approximation: what if we use the reparametrization trick to obtain an unbiased MC estimate of KL?\n","2. 
Compression:\n"," - What can we do to obtain better compression results with small quality degradation?\n"," - Propose and eval several options.\n","3. Study the Local reparametrization trick: \n"," - Does it really accelerate convergence?\n"," - Does variance of gradient decrease?\n"," \n","You can do one out of three parts. You need to provide evidence for results e.g., plots, etc."]},{"cell_type":"code","metadata":{"id":"AEyJ_GZoJEa1","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} -------------------------------------------------------------------------------- /seminars/day6/SparseVD-assignment.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"colab":{"name":"SparseVD-assignment.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":[]}},"cells":[{"cell_type":"markdown","metadata":{"id":"gaNt3kQaJnSw","colab_type":"text"},"source":["# Assignment 3"]},{"cell_type":"markdown","metadata":{"id":"biSNbjH4JnSy","colab_type":"text"},"source":["# Variational Dropout Sparsifies Deep Neural Networks\n","\n","Variational Dropout ([arXiv:1506.02557](https://arxiv.org/abs/1506.02557)) provides a Bayesian interpretation of the conventional dropout procedure. 
Later it was shown that Variational Dropout can be used for model sparsification (Sparse VD), and the effect can be achieved via optimization of the variational lower bound (VLB) with respect to individual dropout rates for every weight of the model ([arXiv:1701.05369](https://arxiv.org/abs/1701.05369)).\n","\n","#### Sparse VD\n","\n","The Sparse VD model optimizes the VLB $\\mathcal{L}(\\phi)$ with respect to parameters $\\phi$ of a variational approximation $q_\\phi(w)$:\n","\n","#$$\\mathcal{L}(\\phi) = L_\\mathcal{D}(\\phi) - D_{KL}(q_\\phi(w)\\,\\|\\,p(w)) \\to\\max_{\\phi\\in\\Phi}$$\n","#$$L_\\mathcal{D}(\\phi) = \\sum_{n=1}^N \\mathrm{E}_{q_\\phi(w)}[\\log p(y_n\\,|\\,x_n, w)],$$\n","\n","where $p(w)$ is the log-uniform prior distribution, the variational approximation $q_\\phi(w)$ is a fully factorized Gaussian, the likelihood $p(y\\,|\\,x, w)$ is defined by a neural network with parameters $w$. The optimization is performed by stochastic optimization methods, e.g., Adam.\n","\n","For computational convenience, the KL divergence is approximated as follows:\n","#$$-D_{KL}(q(w_{ij}\\,|\\,\\mu_{ij}, \\alpha_{ij})\\,\\|\\,p(w_{ij})) \\approx$$\n","#$$ \\approx k_1\\sigma(k_2 + k_3\\log \\alpha_{ij}) - 0.5\\log(1+\\alpha_{ij}^{-1}) + \\mathrm{C}$$\n","#$$ k_1=0.63576 \\quad k_2=1.87320 \\quad k_3=1.48695$$\n","\n","\n","**Note:** In the paper two parametrizations of q are used. The first one is $\\phi_{ij}=\\{\\mu_{ij}, \\sigma_{ij}\\}$, which means $w_{ij} \\sim N(w_{ij} | \\mu_{ij}, \\sigma^2_{ij})$, and the second one is $\\phi_{ij}=\\{\\mu_{ij}, \\alpha_{ij}\\}$, which means $w_{ij} \\sim N(w_{ij} | \\mu_{ij}, \\alpha_{ij}\\mu^2_{ij})$. These two parametrizations are connected as $\\sigma^2_{ij} = \\alpha_{ij}\\mu^2_{ij}$. Do not be confused.\n","\n","![alt text](https://raw.githubusercontent.com/senya-ashukha/senya-ashukha.github.io/master/images/svd3.png)\n","\n","\n","# In this assignment:\n","1. Implementation of fully-connected Sparse VD layer\n","2. 
Training LeNet-300-100 on MNIST dataset\n","3. Optional Research Assignment\n","\n","Additional information:\n","- If you have a problem with importing logger, download logger.py (stored in this repository as local_logger.py), put it in the same folder as the notebook, and run the notebook from that folder\n","- You will need the following Python packages: pytorch, numpy, sklearn, pylab (matplotlib), tabulate\n","- If you have an urgent question or find a typo or a mistake, send it to ars.ashuha@gmail.com. The title should include \"BDL Assignment 3, 2019\""]},{"cell_type":"code","metadata":{"id":"NXhZgXYQJnSz","colab_type":"code","colab":{}},"source":["import math\n","import time\n","\n","import numpy as np\n","import torch\n","import torch.nn as nn\n","import torch.nn.functional as F\n","import torch.optim as optim\n","from torch.autograd import Variable\n","from torch.nn import Parameter\n","from torchvision import datasets, transforms\n","\n","from logger import Logger"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"R2V-uQDOJnS2","colab_type":"text"},"source":["## Implementation of Sparse VD layer"]},{"cell_type":"code","metadata":{"id":"GdxU0K10JnS3","colab_type":"code","colab":{}},"source":["class LinearSVDO(nn.Module):\n"," def __init__(self, in_features, out_features, threshold, bias=True):\n"," super(LinearSVDO, self).__init__()\n"," \"\"\"\n"," in_features: int, a number of input features\n"," out_features: int, a number of neurons\n"," threshold: float, a threshold for clipping weights\n"," \"\"\"\n"," \n"," self.in_features = in_features\n"," self.out_features = out_features\n"," self.threshold = threshold\n","\n"," self.mu = # torch.nn.parameter.Parameter of size out_features x in_features\n"," self.log_sigma = # torch.nn.parameter.Parameter of size out_features x in_features\n"," self.bias = # torch.nn.parameter.Parameter of size 1 x out_features\n"," self.reset_parameters()\n"," \n"," def reset_parameters(self):\n"," self.bias.data.zero_()\n"," self.mu.data.normal_(0, 0.02)\n"," 
self.log_sigma.data.fill_(-5) \n"," \n"," def forward(self, x): \n"," # x is a torch.Tensor of shape (number_of_objects, in_features)\n"," # log_alpha is a torch.Tensor of shape (out_features, in_features)\n"," self.log_alpha = # Compute using self.log_sigma and self.mu\n"," # clipping for a numerical stability\n"," self.log_alpha = torch.clamp(self.log_alpha, -10, 10) \n"," \n"," if self.training:\n"," # LRT = local reparametrization trick\n"," # lrt_mean is a torch.Tensor of shape (x.shape[0], out_features)\n"," lrt_mean = # compute mean activation using LRT\n"," # lrt_std is a torch.Tensor of shape (x.shape[0], out_features)\n"," lrt_std = # compute std of activations unsig lrt, \n"," # do not forget use torch.sqrt(x + 1e-8) instead of torch.sqrt(x)\n"," # eps is a torch.Tensor of shape (x.shape[0], out_features)\n"," eps = # sample of noise for reparametrization\n"," return # sample of activation\n"," \n"," out = # compute the output of the layer\n"," # use weights W = E q = self.mu\n"," # clip all weight with log_alpha > threshold\n"," return out\n"," \n"," def kl_reg(self):\n"," k1, k2, k3 = torch.Tensor([0.63576]), torch.Tensor([1.8732]), torch.Tensor([1.48695])\n"," # kl is a scalar torch.Tensor \n"," kl = # eval KL using the approximation\n"," return kl"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ps7_riOmJnS5","colab_type":"text"},"source":["## Define LeNet-300-100"]},{"cell_type":"code","metadata":{"id":"5w7rroIeJnS7","colab_type":"code","colab":{}},"source":["class Net(nn.Module):\n"," def __init__(self, threshold):\n"," super(Net, self).__init__()\n"," self.fc1 = LinearSVDO(28*28, 300, threshold)\n"," self.fc2 = LinearSVDO(300, 100, threshold)\n"," self.fc3 = LinearSVDO(100, 10, threshold)\n"," self.threshold=threshold\n","\n"," def forward(self, x):\n"," x = F.relu(self.fc1(x))\n"," x = F.relu(self.fc2(x))\n"," x = F.log_softmax(self.fc3(x), dim=1)\n"," return 
x"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"wlpR1Xr2JnS9","colab_type":"text"},"source":["## Function for loading MNIST"]},{"cell_type":"code","metadata":{"id":"6WWJNZYEJnS_","colab_type":"code","colab":{}},"source":["def get_mnist(batch_size):\n"," trsnform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])\n"," train_loader = torch.utils.data.DataLoader(\n"," datasets.MNIST('../data', train=True, download=True,\n"," transform=trsnform), batch_size=batch_size, shuffle=True)\n"," test_loader = torch.utils.data.DataLoader(\n"," datasets.MNIST('../data', train=False, download=True,\n"," transform=trsnform), batch_size=batch_size, shuffle=True)\n","\n"," return train_loader, test_loader"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"HeFnFEUdJnTB","colab_type":"text"},"source":["## Create SGVLB loss"]},{"cell_type":"code","metadata":{"id":"_3fVDkE1JnTC","colab_type":"code","colab":{}},"source":["class SGVLB(nn.Module):\n"," def __init__(self, net, train_size):\n"," super(SGVLB, self).__init__()\n"," self.train_size = train_size # int, the len of dataset\n"," self.net = net # nn.Module\n"," \n"," def forward(self, input, target, kl_weight=1.0):\n"," \"\"\"\n"," input: is a torch.Tensor (a predictions of the model) \n"," target: is a torch.Tensor (a tensor of labels) \n"," \"\"\"\n"," assert not target.requires_grad\n"," kl = 0.0\n"," for module in self.net.children():\n"," if hasattr(module, 'kl_reg'):\n"," kl = kl + module.kl_reg()\n"," \n"," sgvlb_loss = # a scalar torch.Tensor, SGVLB loss\n"," return sgvlb_loss"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"1i_Q00ESJnTF","colab_type":"text"},"source":["## Define the model"]},{"cell_type":"code","metadata":{"id":"2ROeFVbJJnTF","colab_type":"code","colab":{}},"source":["model = Net(threshold=3)\n","optimizer = # optimizer\n","scheduler = # decrease learning rate by 
torch.optim.lr_scheduler\n","\n","logger = Logger('sparse_vd', fmt={\n"," 'tr_loss': '3.1e',\n"," 'te_loss': '3.1e',\n"," 'sp_0': '.3f',\n"," 'sp_1': '.3f',\n"," 'sp_2': '.3f',\n"," 'lr': '3.1e',\n"," 'kl': '.2f',\n"," 'time': '.2f',\n","})\n","\n","train_loader, test_loader = get_mnist(batch_size=100)\n","sgvlb = SGVLB(model, len(train_loader.dataset))"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"iMR2MYENJnTI","colab_type":"text"},"source":["## Train the model"]},{"cell_type":"code","metadata":{"id":"1gIcWJ2zJnTI","colab_type":"code","colab":{}},"source":["# here is a cpu version of the code\n","# the solution/colab file have a gpu version that is faster 11s/epoch insted of 30s/epoch now\n","# if you use gpu version, be sure your sgvlb loss and model are on gpu \n","\n","kl_weight = 0.02\n","epochs = 100\n","\n","for epoch in range(1, epochs + 1):\n"," time_start = time.perf_counter()\n"," scheduler.step()\n"," model.train()\n"," train_loss, train_acc = 0, 0 \n"," kl_weight = min(kl_weight+0.02, 1)\n"," logger.add_scalar(epoch, 'kl', kl_weight)\n"," logger.add_scalar(epoch, 'lr', scheduler.get_lr()[0])\n"," for batch_idx, (data, target) in enumerate(train_loader):\n"," data = data.view(-1, 28*28)\n"," optimizer.zero_grad()\n"," \n"," output = model(data)\n"," pred = output.data.max(1)[1] \n"," loss = sgvlb(output, target, kl_weight)\n"," loss.backward()\n"," optimizer.step()\n"," \n"," train_loss += float(loss) \n"," train_acc += np.sum(pred.numpy() == target.data.numpy())\n","\n"," logger.add_scalar(epoch, 'tr_loss', train_loss / len(train_loader.dataset))\n"," logger.add_scalar(epoch, 'tr_acc', train_acc / len(train_loader.dataset) * 100)\n"," \n"," model.eval()\n"," test_loss, test_acc = 0, 0\n"," for batch_idx, (data, target) in enumerate(test_loader):\n"," data = data.view(-1, 28*28)\n"," output = model(data)\n"," test_loss += float(sgvlb(output, target, kl_weight))\n"," pred = output.data.max(1)[1] \n"," test_acc += 
np.sum(pred.numpy() == target.data.numpy())\n"," \n"," logger.add_scalar(epoch, 'te_loss', test_loss / len(test_loader.dataset))\n"," logger.add_scalar(epoch, 'te_acc', test_acc / len(test_loader.dataset) * 100)\n"," \n"," for i, c in enumerate(model.children()):\n"," if hasattr(c, 'kl_reg'):\n"," logger.add_scalar(epoch, 'sp_%s' % i, (c.log_alpha.data.numpy() > model.threshold).mean())\n"," \n"," logger.add_scalar(epoch, 'time', time.perf_counter() - time_start)\n"," logger.iter_info()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"1QxpeBe-JnTN","colab_type":"code","colab":{}},"source":["all_w, kep_w = 0, 0\n","\n","for c in model.children():\n"," kep_w += (c.log_alpha.data.numpy() < model.threshold).sum()\n"," all_w += c.log_alpha.data.numpy().size\n","\n","# compression_ratio should be > 30\n","compression_ratio = all_w/kep_w\n","print('compression_ratio =', compression_ratio)\n","assert compression_ratio > 30"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"-nOBQsM6JnTR","colab_type":"text"},"source":["## Disk space"]},{"cell_type":"code","metadata":{"id":"QNSNrt8UJnTT","colab_type":"code","colab":{}},"source":["import scipy\n","import numpy as np\n","from scipy.sparse import csc_matrix, csc_matrix, coo_matrix, dok_matrix\n","\n","row, col, data = [], [], []\n","M = list(model.children())[0].mu.data.numpy()\n","LA = list(model.children())[0].log_alpha.data.numpy()\n","\n","for i in range(300):\n"," for j in range(28*28):\n"," if LA[i, j] < 3:\n"," row += [i]\n"," col += [j]\n"," data += [M[i, j]]\n","\n","Mcsr = csc_matrix((data, (row, col)), shape=(300, 28*28))\n","Mcsc = csc_matrix((data, (row, col)), shape=(300, 28*28))\n","Mcoo = coo_matrix((data, (row, col)), shape=(300, 28*28))"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"t8PKeFPLJnTV","colab_type":"code","colab":{}},"source":["np.savez_compressed('M_w', M)\n","scipy.sparse.save_npz('Mcsr_w', 
Mcsr)\n","scipy.sparse.save_npz('Mcsc_w', Mcsc)\n","scipy.sparse.save_npz('Mcoo_w', Mcoo)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"jpgaUhgvJnTX","colab_type":"code","colab":{}},"source":["!ls -lah | grep .npz "],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"wKMPx_8sJnTZ","colab_type":"text"},"source":["## Visualization"]},{"cell_type":"code","metadata":{"id":"_aEHgR9UJnTb","colab_type":"code","colab":{}},"source":["import matplotlib.pyplot as plt\n","%matplotlib inline\n","import matplotlib as mpl\n","\n","from matplotlib import rcParams\n","rcParams['figure.figsize'] = 16, 4\n","rcParams['figure.dpi'] = 200\n","\n","\n","mask = (model.fc1.log_alpha.detach().numpy() < 3).astype(np.float)\n","W = model.fc1.mu.detach().numpy()\n","\n","# Normalize color map\n","max_val = np.max(np.abs(mask * W))\n","norm = mpl.colors.Normalize(vmin=-max_val,vmax=max_val)\n","\n","plt.imshow(mask * W, cmap='RdBu', interpolation=None, norm=norm)\n","plt.colorbar()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"eBY9gsrmJnTd","colab_type":"code","colab":{}},"source":["s = 0\n","z = np.zeros((28*15, 28*15))\n","\n","for i in range(15):\n"," for j in range(15):\n"," s += 1\n"," z[i*28:(i+1)*28, j*28:(j+1)*28] = np.abs((mask * mu)[s].reshape(28, 28))\n","\n","plt.figure(figsize=(8, 5))\n","plt.imshow(z, cmap='hot_r')\n","plt.colorbar()\n","plt.axis('off')"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"f_9cgWYVJnTg","colab_type":"text"},"source":["# Optional Research Assignment (up to 2 points)"]},{"cell_type":"markdown","metadata":{"id":"o9VHbVe8JnTh","colab_type":"text"},"source":["1. 
class Logger:
    """Collect scalar metrics per step, print them as table rows and save them to CSV.

    Everything printed through ``self.print`` is also appended to ``<path>.out``.
    """

    def __init__(self, name='name', fmt=None):
        """Create ``./logs`` if needed and derive the output file paths.

        name: str, a run label embedded in the log file names
        fmt: dict or None, maps a metric name to the float format handed to
             tabulate for that column; metrics without an entry use '.1f'
        """
        # True until the first iter_info() call, which prints the table header.
        self.handler = True
        # metric name -> list of (t, value) pairs, in insertion order
        self.scalar_metrics = OrderedDict()
        self.fmt = fmt if fmt else dict()

        base = './logs'
        # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
        os.makedirs(base, exist_ok=True)

        now = gmtime()
        # Three random lowercase letters keep runs started in the same minute apart.
        tag = ''.join(chr(random.randint(97, 122)) for _ in range(3))
        fname = '-'.join(sys.argv[0].split('/')[-3:])
        # NOTE(review): the timestamp contains ':', which is not allowed in Windows
        # file names; the format is kept unchanged for backward compatibility.
        self.path = '%s/%s-%s-%s-%s' % (base, fname, name, tag, strftime('%m-%d-%H:%M', now))

        self.logs = self.path + '.csv'
        self.output = self.path + '.out'
        # Path only; this class never writes the checkpoint file itself.
        self.checkpoint = self.path + '.cpt'

        def prin(*args):
            # Append the message to the .out file, then echo it to stdout.
            str_to_write = ' '.join(map(str, args))
            with open(self.output, 'a') as f:
                f.write(str_to_write + '\n')
                f.flush()

            print(str_to_write)
            sys.stdout.flush()

        self.print = prin

    def add_scalar(self, t, key, value):
        """Append the pair ``(t, value)`` to the history of metric ``key``."""
        if key not in self.scalar_metrics:
            self.scalar_metrics[key] = []
        self.scalar_metrics[key] += [(t, value)]

    def iter_info(self, order=None):
        """Print the latest value of every metric as one table row.

        order: optional sequence of metric names selecting and ordering the
               columns; by default all metrics are shown in insertion order.

        The first call prints the header together with the row; later calls
        print only the row. Nothing is printed when there are no metrics.
        """
        # Copy into a list so that a tuple `order` can be concatenated below.
        names = list(order) if order else list(self.scalar_metrics.keys())
        if not names:
            # np.max([]) would raise; there is nothing to report yet.
            return

        values = [self.scalar_metrics[name][-1][1] for name in names]
        t = int(np.max([self.scalar_metrics[name][-1][0] for name in names]))
        fmt = ['%s'] + [self.fmt[name] if name in self.fmt else '.1f' for name in names]

        if self.handler:
            self.handler = False
            self.print(tabulate([[t] + values], ['epoch'] + names, floatfmt=fmt))
        else:
            # With tablefmt='plain' line 0 is the header and line 1 the data row.
            self.print(tabulate([[t] + values], ['epoch'] + names, tablefmt='plain', floatfmt=fmt).split('\n')[1])

    def save(self):
        """Write all metrics to ``<path>.csv``, one column per metric, indexed by ``t``.

        Metrics are outer-joined on ``t``. When no metric has been recorded, a
        CSV without data rows is written instead of failing.
        """
        result = None
        for key, history in self.scalar_metrics.items():
            df = DataFrame(history, columns=['t', key]).set_index('t')
            result = df if result is None else result.join(df, how='outer')

        if result is None:
            # No metrics were added: previously this crashed on None.to_csv.
            result = DataFrame(columns=['t']).set_index('t')

        result.to_csv(self.logs)
        self.print('The log/output/model have been saved to: ' + self.path + ' + .csv/.out/.cpt')