├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── _config.yml
├── _data
    ├── options.yml
    └── social.yml
├── _includes
    ├── footer.html
    ├── head.html
    ├── header.html
    └── print-footer.html
├── _layouts
    ├── default.html
    ├── full-width.html
    ├── page.html
    └── post.html
├── _plugins
    ├── fullwidth.rb
    ├── main_column_img.rb
    ├── margin_figure.rb
    ├── marginnote.rb
    ├── mathjaxtag.rb
    ├── newthought.rb
    └── sidenote.rb
├── _sass
    ├── _fonts.scss
    ├── _settings.scss
    └── _syntax-highlighting.scss
├── autoregressive
    ├── autoregressive.png
    ├── fvsbn.png
    ├── index.md
    ├── index.tex
    └── nade.png
├── css
    ├── tufte.css
    ├── tufte.orginal.css
    └── tufte.scss
├── docs
    ├── LICENSE
    ├── Makefile
    ├── autoregressive
    │   ├── autoregressive.png
    │   ├── fvsbn.png
    │   ├── index.html
    │   ├── index.tex
    │   └── nade.png
    ├── css
    │   ├── tufte.css
    │   └── tufte.orginal.css
    ├── flow
    │   ├── flow-graphical.PNG
    │   ├── iaf.PNG
    │   ├── index.html
    │   └── maf.PNG
    ├── fonts
    │   ├── et-bembo
    │   │   ├── et-bembo-bold-line-figures
    │   │   │   ├── et-bembo-bold-line-figures.eot
    │   │   │   ├── et-bembo-bold-line-figures.svg
    │   │   │   ├── et-bembo-bold-line-figures.ttf
    │   │   │   └── et-bembo-bold-line-figures.woff
    │   │   ├── et-bembo-display-italic-old-style-figures
    │   │   │   ├── et-bembo-display-italic-old-style-figures.eot
    │   │   │   ├── et-bembo-display-italic-old-style-figures.svg
    │   │   │   ├── et-bembo-display-italic-old-style-figures.ttf
    │   │   │   └── et-bembo-display-italic-old-style-figures.woff
    │   │   ├── et-bembo-roman-line-figures
    │   │   │   ├── et-bembo-roman-line-figures.eot
    │   │   │   ├── et-bembo-roman-line-figures.svg
    │   │   │   ├── et-bembo-roman-line-figures.ttf
    │   │   │   └── et-bembo-roman-line-figures.woff
    │   │   ├── et-bembo-roman-old-style-figures
    │   │   │   ├── et-bembo-roman-old-style-figures.eot
    │   │   │   ├── et-bembo-roman-old-style-figures.svg
    │   │   │   ├── et-bembo-roman-old-style-figures.ttf
    │   │   │   └── et-bembo-roman-old-style-figures.woff
    │   │   └── et-bembo-semi-bold-old-style-figures
    │   │   │   ├── et-bembo-semi-bold-old-style-figures.eot
    │   │   │   ├── et-bembo-semi-bold-old-style-figures.svg
    │   │   │   ├── et-bembo-semi-bold-old-style-figures.ttf
    │   │   │   └── et-bembo-semi-bold-old-style-figures.woff
    │   ├── et-book
    │   │   ├── et-book-bold-line-figures
    │   │   │   ├── et-book-bold-line-figures.eot
    │   │   │   ├── et-book-bold-line-figures.svg
    │   │   │   ├── et-book-bold-line-figures.ttf
    │   │   │   └── et-book-bold-line-figures.woff
    │   │   ├── et-book-display-italic-old-style-figures
    │   │   │   ├── et-book-display-italic-old-style-figures.eot
    │   │   │   ├── et-book-display-italic-old-style-figures.svg
    │   │   │   ├── et-book-display-italic-old-style-figures.ttf
    │   │   │   └── et-book-display-italic-old-style-figures.woff
    │   │   ├── et-book-roman-line-figures
    │   │   │   ├── et-book-roman-line-figures.eot
    │   │   │   ├── et-book-roman-line-figures.svg
    │   │   │   ├── et-book-roman-line-figures.ttf
    │   │   │   └── et-book-roman-line-figures.woff
    │   │   ├── et-book-roman-old-style-figures
    │   │   │   ├── et-book-roman-old-style-figures.eot
    │   │   │   ├── et-book-roman-old-style-figures.svg
    │   │   │   ├── et-book-roman-old-style-figures.ttf
    │   │   │   └── et-book-roman-old-style-figures.woff
    │   │   └── et-book-semi-bold-old-style-figures
    │   │   │   ├── et-book-semi-bold-old-style-figures.eot
    │   │   │   ├── et-book-semi-bold-old-style-figures.svg
    │   │   │   ├── et-book-semi-bold-old-style-figures.ttf
    │   │   │   └── et-book-semi-bold-old-style-figures.woff
    │   ├── icomoon.eot
    │   ├── icomoon.svg
    │   ├── icomoon.ttf
    │   └── icomoon.woff
    ├── gan
    │   ├── cyclegan_gendisc.png
    │   ├── gan.png
    │   ├── index.html
    │   └── index.tex
    ├── index.html
    ├── introduction
    │   ├── index.html
    │   ├── learning.png
    │   ├── learning_1.png
    │   └── learning_2.png
    └── vae
    │   ├── index.html
    │   ├── klgap.png
    │   └── vae.png
├── flow
    ├── flow-graphical.PNG
    ├── flow-graphical.png
    ├── iaf.PNG
    ├── iaf.png
    ├── index.md
    ├── maf.PNG
    └── maf.png
├── fonts
    ├── et-bembo
    │   ├── et-bembo-bold-line-figures
    │   │   ├── et-bembo-bold-line-figures.eot
    │   │   ├── et-bembo-bold-line-figures.svg
    │   │   ├── et-bembo-bold-line-figures.ttf
    │   │   └── et-bembo-bold-line-figures.woff
    │   ├── et-bembo-display-italic-old-style-figures
    │   │   ├── et-bembo-display-italic-old-style-figures.eot
    │   │   ├── et-bembo-display-italic-old-style-figures.svg
    │   │   ├── et-bembo-display-italic-old-style-figures.ttf
    │   │   └── et-bembo-display-italic-old-style-figures.woff
    │   ├── et-bembo-roman-line-figures
    │   │   ├── et-bembo-roman-line-figures.eot
    │   │   ├── et-bembo-roman-line-figures.svg
    │   │   ├── et-bembo-roman-line-figures.ttf
    │   │   └── et-bembo-roman-line-figures.woff
    │   ├── et-bembo-roman-old-style-figures
    │   │   ├── et-bembo-roman-old-style-figures.eot
    │   │   ├── et-bembo-roman-old-style-figures.svg
    │   │   ├── et-bembo-roman-old-style-figures.ttf
    │   │   └── et-bembo-roman-old-style-figures.woff
    │   └── et-bembo-semi-bold-old-style-figures
    │   │   ├── et-bembo-semi-bold-old-style-figures.eot
    │   │   ├── et-bembo-semi-bold-old-style-figures.svg
    │   │   ├── et-bembo-semi-bold-old-style-figures.ttf
    │   │   └── et-bembo-semi-bold-old-style-figures.woff
    ├── et-book
    │   ├── et-book-bold-line-figures
    │   │   ├── et-book-bold-line-figures.eot
    │   │   ├── et-book-bold-line-figures.svg
    │   │   ├── et-book-bold-line-figures.ttf
    │   │   └── et-book-bold-line-figures.woff
    │   ├── et-book-display-italic-old-style-figures
    │   │   ├── et-book-display-italic-old-style-figures.eot
    │   │   ├── et-book-display-italic-old-style-figures.svg
    │   │   ├── et-book-display-italic-old-style-figures.ttf
    │   │   └── et-book-display-italic-old-style-figures.woff
    │   ├── et-book-roman-line-figures
    │   │   ├── et-book-roman-line-figures.eot
    │   │   ├── et-book-roman-line-figures.svg
    │   │   ├── et-book-roman-line-figures.ttf
    │   │   └── et-book-roman-line-figures.woff
    │   ├── et-book-roman-old-style-figures
    │   │   ├── et-book-roman-old-style-figures.eot
    │   │   ├── et-book-roman-old-style-figures.svg
    │   │   ├── et-book-roman-old-style-figures.ttf
    │   │   └── et-book-roman-old-style-figures.woff
    │   └── et-book-semi-bold-old-style-figures
    │   │   ├── et-book-semi-bold-old-style-figures.eot
    │   │   ├── et-book-semi-bold-old-style-figures.svg
    │   │   ├── et-book-semi-bold-old-style-figures.ttf
    │   │   └── et-book-semi-bold-old-style-figures.woff
    ├── icomoon.eot
    ├── icomoon.svg
    ├── icomoon.ttf
    └── icomoon.woff
├── gan
    ├── cyclegan_gendisc.png
    ├── gan.png
    ├── index.md
    └── index.tex
├── index.md
├── introduction
    ├── index.md
    ├── learning.png
    ├── learning_1.png
    └── learning_2.png
└── vae
    ├── index.md
    ├── klgap.png
    └── vae.png


/.gitignore:
--------------------------------------------------------------------------------
1 | _site
2 | .sass-cache
3 | .DS\_Store
4 | config.codekit
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2018 Aditya Grover, Stefano Ermon
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | TEMPDIR := $(shell mktemp -d -t tmp.XXX)
 2 | 
 3 | publish:
 4 | 	echo 'hmmm'
 5 | 	cp -r ./_site/* $(TEMPDIR)
 6 | 	cd $(TEMPDIR) && \
 7 | 	ls -a  && \
 8 | 	git init && \
 9 | 	git add . && \
10 | 	git commit -m 'publish site' && \
11 | 	git remote add origin https://github.com/deepgenerativemodels/notes.git && \
12 | 	git push origin master:refs/heads/gh-pages --force
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Notes on Deep Generative Models
 2 | 
 3 | These notes form a concise introductory course on deep generative models. They are based on Stanford [CS236](https://deepgenerativemodels.github.io/), taught by [Aditya Grover](http://aditya-grover.github.io/) and [Stefano Ermon](http://cs.stanford.edu/~ermon/), and have been written by [Aditya Grover](http://aditya-grover.github.io/), with the [help](https://github.com/deepgenerativemodels/notes/commits/master) of many students and course staff.
 4 | 
 5 | The compiled version is available [here](https://deepgenerativemodels.github.io/notes/index.html).
 6 | 
 7 | ## Contributing
 8 | 
 9 | This material is under construction! Although we have written up most of it, you will probably find several typos. If you do, please let us know, or submit a pull request with your fixes via Github.
10 | 
11 | 
12 | The notes are written in Markdown and are compiled into HTML using Jekyll. Please add your changes directly to the Markdown source code. In order to install jekyll, you can follow the instructions posted on their website (https://jekyllrb.com/docs/installation/). 
13 | 
14 | Note that jekyll is only supported on GNU/Linux, Unix, or macOS. Thus, if you run Windows 10 on your local machine, you will have to install Bash on Ubuntu on Windows. Windows gives instructions on how to do that <a href="https://docs.microsoft.com/en-us/windows/wsl/install-win10">here</a> and Jekyll's <a href="https://jekyllrb.com/docs/windows/">website</a> offers helpful instructions on how to proceed through the rest of the process.
15 | 
16 | To compile Markdown to HTML (i.e. after you have made changes to markdown and want them to be accessible to students viewing the docs), 
17 | run the following commands from the root of your cloned version of the https://github.com/deepgenerativemodels/notes repo:
18 | 1) rm -r docs/
19 | 2) jekyll serve  # This should create a folder called _site. Note: This creates a running server; press Ctrl-C to stop the server before proceeding
20 | 3) mv _site docs  # Change the name of the _site folder to "docs". This won't work if the server is still running.
21 | 4) git add file_names
22 | 5) git commit -am "your commit message describing what you did"
23 | 6) git push origin master
24 | 
25 | Note that if you cloned the ermongroup/cs228-notes repo directly onto your local machine (instead of forking it) then you may see an error like "remote: Permission to ermongroup/cs228-notes.git denied to userjanedoe". If that is the case, then you need to fork their repo first. Then, if your github profile were userjanedoe, you would need to first push your local updates to your forked repo like so:
26 | 
27 | git push https://github.com/deepgenerativemodels/notes.git master
28 | 
29 | And then you could go and submit the pull request through the GitHub website.
30 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
 1 | baseurl: /notes
 2 | title: Deep Generative Models
 3 | subtitle: Lecture notes
 4 | author: Aditya Grover
 5 | simple_search: http://google.com/search
 6 | description: Lecture notes for Deep Generative Models.
 7 | name: notes
 8 | markdown_ext: "markdown,mkdown,mkdn,mkd,md"
 9 | permalink: /articles/:short_year/:title
10 | timezone: America/New_York
11 | excerpt_separator: <!--more-->  # you can specify your own separator, of course.
12 | exclude: ['Gemfile', 'Gemfile.lock', 'Rakefile', 'README.md']
13 | destination: docs
14 | google_analytics: UA-129020129-1
15 | post:
16 |   template: _post.txt
17 |   extension: md
18 | page:
19 |   template: _page.txt
20 |   extension: md
21 | editor: gvim
22 | git:
23 |   branch: master
24 | transfer:
25 |   command: rsync
26 |   settings: -av
27 |   source: _site/


--------------------------------------------------------------------------------
/_data/options.yml:
--------------------------------------------------------------------------------
1 | mathjax: true
2 | lato_font_load: true


--------------------------------------------------------------------------------
/_data/social.yml:
--------------------------------------------------------------------------------
 1 | - link: //www.twitter.com/twitter_handle 
 2 |   icon: icon-twitter
 3 | - link: //plus.google.com/+googlePlusName 
 4 |   icon: icon-googleplus
 5 | - link: //github.com/GithubHandle 
 6 |   icon: icon-github
 7 | - link: //www.flickr.com/photos/FlickrUserID 
 8 |   icon: icon-flickr
 9 | - link: /feed 
10 |   icon: icon-feed


--------------------------------------------------------------------------------
/_includes/footer.html:
--------------------------------------------------------------------------------
 1 | <footer>
 2 |   <hr class="slender">
 3 |   <!-- <ul class="footer&#45;links"> -->
 4 |   <!--   <li><a href="mailto:hate@spam.net"><span class="icon&#45;mail"></span></a></li>     -->
 5 |   <!--   {% for entry in site.data.social %} -->
 6 |   <!--     <li> -->
 7 |   <!--       <a href="{{ entry.link }}"><span class="{{ entry.icon }}"></span></a> -->
 8 |   <!--     </li> -->
 9 |   <!--   {% endfor %}   -->
10 |   <!-- </ul> -->
11 | <div class="credits">
12 | <!-- <span>&#38;copy; {{ site.time | date: '%Y' }} <!&#45;&#45; &#38;#38;nbsp;&#38;#38;nbsp;{{site.author | upcase }} &#45;&#45;></span></br> <br> -->
13 | <span>Site created with <a href="//jekyllrb.com">Jekyll</a> using the <a href="//github.com/clayh53/tufte-jekyll">Tufte theme</a>. &copy; {{ site.time | date: '%Y' }}</span> 
14 | </div>  
15 | </footer>
16 | 


--------------------------------------------------------------------------------
/_includes/head.html:
--------------------------------------------------------------------------------
 1 | <head>
 2 |   <meta charset="utf-8">
 3 |   <meta http-equiv="X-UA-Compatible" content="IE=edge">
 4 |   <meta name="viewport" content="width=device-width, initial-scale=1">
 5 | 
 6 |   <title>{% if page.title %}{{ page.title }}{% else %}{{ site.title }}{% endif %}</title>
 7 |   <meta name="description" content="{% if page.excerpt %}{{ page.excerpt | strip_html | strip_newlines | truncate: 160 }}{% else %}{{ site.description }}{% endif %}">
 8 | 
 9 | 
10 |   <link rel="stylesheet" href="{{ "/css/tufte.css" | prepend: site.baseurl }}">	
11 |   
12 | 
13 |   <!-- Google Fonts loaded here depending on setting in _data/options.yml true loads font, blank does not-->
14 |   {% if site.data.options.lato_font_load %}
15 |     <link href='//fonts.googleapis.com/css?family=Lato:400,400italic' rel='stylesheet' type='text/css'>
16 |   {% endif %}
17 |   <!-- Load up MathJax script if needed ... specify in /_data/options.yml file-->
18 |   {% if site.data.options.mathjax %}
19 |     <script type="text/javascript" src="//cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
20 |   {% endif %}
21 | 
22 |   <script>
23 |   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
24 |   (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
25 |   m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
26 |   })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
27 | 
28 |   ga('create', '{{ site.google_analytics }}', 'auto');
29 |   ga('send', 'pageview');
30 | 
31 |   </script>
32 | 
33 |   <link rel="canonical" href="{{ page.url | replace:'index.html','' | prepend: site.baseurl | prepend: site.url }}">
34 |   <link rel="alternate" type="application/rss+xml" title="{{ site.title }}" href="{{ "/feed.xml" | prepend: site.baseurl | prepend: site.url }}" />
35 | </head>
36 | 


--------------------------------------------------------------------------------
/_includes/header.html:
--------------------------------------------------------------------------------
1 | <!--- Header and nav template site-wide -->
2 | <header>
3 |     <nav class="group">
4 |         <a href="{{ site.baseurl }}/">Contents</a>
5 | 	<a href="http://deepgenerativemodels.github.io">Class</a>
6 | 	<a href="http://github.com/deepgenerativemodels/notes">Github</a>
7 | 	</nav>
8 | </header>
9 | 


--------------------------------------------------------------------------------
/_includes/print-footer.html:
--------------------------------------------------------------------------------
1 | <span class="print-footer">{% if page.date %}{{ page.title }} - {{ page.date | date: "%B %-d, %Y" }} - {{site.author}}{% else %}{{ page.title }} - {{site.author}}{% endif %}</span>


--------------------------------------------------------------------------------
/_layouts/default.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   {% include head.html %}
 4 |   <body>
 5 |     {% include header.html %}
 6 |     <article class="group">
 7 |       {{ content }}
 8 |     </article>
 9 |     {% include print-footer.html %}
10 |     {% include footer.html %}
11 |   </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/_layouts/full-width.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 |   {% include head.html %}
 4 |   <body class="full-width">
 5 |     {% include header.html %}
 6 |     <article>
 7 |       {{ content }}
 8 |     </article>
 9 |     {% include print-footer.html %}
10 |     {% include footer.html %}
11 |   </body>
12 | </html>
13 | 


--------------------------------------------------------------------------------
/_layouts/page.html:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | ---
4 | <h1>{{ page.title | capitalize }}</h1>
5 | <p class="subtitle">{{ page.date | date: "%B %-d, %Y" }}</p>
6 | 
7 | 
8 | {{ content }}
9 | 


--------------------------------------------------------------------------------
/_layouts/post.html:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: default
 3 | ---
 4 | <h1>{{ page.title | capitalize }}</h1>
 5 | <p class="subtitle">{{ page.date | date: "%B %-d, %Y" }}</p>
 6 | 
 7 | 
 8 | <script type="text/x-mathjax-config">
 9 |   MathJax.Hub.Config({
10 |     TeX: {
11 |       Macros: {
12 |         e: "\\epsilon",
13 |         xti: "x^{(i)}",
14 |         yti: "y^{(i)}",
15 |         bfy: "{\\bf y}",
16 |         bfx: "{\\bf x}",
17 |         bfg: "{\\bf g}",
18 |         bfbeta: "{\\bf \\beta}",
19 |         tp: "\\tilde p",
20 |         pt: "p_\\theta",
21 |         Exp: "{\\mathbb{E}}",
22 |         Ind: "{\\mathbb{I}}",
23 |         KL: "{\\mathbb{KL}}",
24 |         Dc: "{\\mathcal{D}}",
25 |         Tc: "{\\mathcal{T}}",
26 |         Xc: "{\\mathcal{X}}",
27 |         note: ["\\textcolor{blue}{[NOTE: #1]}",1]
28 |       }
29 |     }
30 |   });
31 | </script>
32 | 
33 | 
34 | {{ content }}
35 | 
36 | 


--------------------------------------------------------------------------------
/_plugins/fullwidth.rb:
--------------------------------------------------------------------------------
 1 | ## This has a fairly harmless hack that wraps the img tag in a div to prevent it from being
 2 | ## wrapped in a paragraph tag instead, which would totally fuck things up layout-wise
 3 | ## Usage {% fullwidth 'path/to/image' 'caption goes here in quotes' %}
 4 | #
 5 | module Jekyll
 6 |   class RenderFullWidthTag < Liquid::Tag
 7 |     
 8 |     require "shellwords"
 9 | 
10 |     def initialize(tag_name, text, tokens)
11 |       super
12 |       @text = text.shellsplit
13 |     end
14 | 
15 |     def render(context)
16 |       baseurl = context.registers[:site].config['baseurl']
17 |       if @text[0].start_with?('http://', 'https://','//')
18 |         "<figure class='fullwidth'><img src='#{@text[0]}'/>"+
19 |         "<figcaption>#{@text[1]}</figcaption></figure>"
20 |       else
21 |         "<figure class='fullwidth'><img src='#{baseurl}/#{@text[0]}'/>"+
22 |         "<figcaption>#{@text[1]}</figcaption></figure>"
23 |       end
24 |     end
25 |   end
26 | end
27 | 
28 | Liquid::Template.register_tag('fullwidth', Jekyll::RenderFullWidthTag)
29 | 


--------------------------------------------------------------------------------
/_plugins/main_column_img.rb:
--------------------------------------------------------------------------------
 1 | ## Liquid tag 'maincolumn-figure' used to add image data that fits within the
 2 | ## main column area of the layout
 3 | ## Usage {% maincolumn 'path/to/image' 'This is the caption' %}
 4 | #
 5 | module Jekyll
 6 |   class RenderMainColumnTag < Liquid::Tag
 7 | 
 8 |   	require "shellwords"
 9 | 
10 |     def initialize(tag_name, text, tokens)
11 |       super
12 |       @text = text.shellsplit
13 |     end
14 | 
15 |     def render(context)
16 |       baseurl = context.registers[:site].config['baseurl']
17 |       if @text[0].start_with?('http://', 'https://','//')
18 |         "<figure><figcaption>#{@text[1]}</figcaption><img src='#{@text[0]}'/></figure>"
19 |       else
20 |         "<figure><figcaption>#{@text[1]}</figcaption><img src='#{baseurl}/#{@text[0]}'/></figure>"
21 |       end
22 |     end
23 |   end
24 | end
25 | 
26 | Liquid::Template.register_tag('maincolumn', Jekyll::RenderMainColumnTag)
27 | 


--------------------------------------------------------------------------------
/_plugins/margin_figure.rb:
--------------------------------------------------------------------------------
 1 | ## Liquid tag 'maincolumn' used to add image data that fits within the main
 2 | ## column area of the layout
 3 | ## Usage {% marginfigure 'margin-id-whatever' 'path/to/image' 'This is the caption' %}
 4 | #
 5 | module Jekyll
 6 |   class RenderMarginFigureTag < Liquid::Tag
 7 | 
 8 |   	require "shellwords"
 9 | 
10 |     def initialize(tag_name, text, tokens)
11 |       super
12 |       @text = text.shellsplit
13 |     end
14 | 
15 |     def render(context)
16 |       baseurl = context.registers[:site].config['baseurl']
17 |       if @text[1].start_with?('http://', 'https://', '//')
18 |         "<label for='#{@text[0]}' class='margin-toggle'>&#8853;</label>"+
19 |         "<input type='checkbox' id='#{@text[0]}' class='margin-toggle'/>"+
20 |         "<span class='marginnote'><img class='fullwidth' src='#{@text[1]}'/><br>#{@text[2]}</span>"
21 |       else
22 |         "<label for='#{@text[0]}' class='margin-toggle'>&#8853;</label>"+
23 |         "<input type='checkbox' id='#{@text[0]}' class='margin-toggle'/>"+
24 |         "<span class='marginnote'><img class='fullwidth' src='#{baseurl}/#{@text[1]}'/><br>#{@text[2]}</span>"
25 |       end
26 |     end
27 |   end
28 | end
29 | 
30 | Liquid::Template.register_tag('marginfigure', Jekyll::RenderMarginFigureTag)
31 | 


--------------------------------------------------------------------------------
/_plugins/marginnote.rb:
--------------------------------------------------------------------------------
 1 | module Jekyll
 2 |   class RenderMarginNoteTag < Liquid::Tag
 3 | 
 4 |     require "shellwords"
 5 | 
 6 |     def initialize(tag_name, text, tokens)
 7 |       super
 8 |       @text = text.shellsplit
 9 |     end
10 | 
11 |     def render(context)
12 |       "<label for='#{@text[0]}' class='margin-toggle'> &#8853;</label><input type='checkbox' id='#{@text[0]}' class='margin-toggle'/><span class='marginnote'>#{@text[1]} </span>"
13 |     end
14 |   end
15 | end
16 | 
17 | Liquid::Template.register_tag('marginnote', Jekyll::RenderMarginNoteTag)
18 | 
19 | 


--------------------------------------------------------------------------------
/_plugins/mathjaxtag.rb:
--------------------------------------------------------------------------------
 1 | module Jekyll
 2 |   class MathJaxBlockTag < Liquid::Tag
 3 |     def render(context)
 4 |       '<div class="mathblock"><script type="math/tex; mode=display">'
 5 |     end
 6 |   end
 7 | class MathJaxInlineTag < Liquid::Tag
 8 |     def render(context)
 9 |       '<span>&#8203;<script type="math/tex">'
10 |     end
11 |   end
12 | class MathJaxEndBlockTag < Liquid::Tag
13 |     def render(context)
14 |       '</script></div>'
15 |     end
16 |   end
17 | class MathJaxEndInlineTag < Liquid::Tag
18 |     def render(context)
19 |       '</script></span>'
20 |     end
21 |   end  
22 | end
23 |  
24 | Liquid::Template.register_tag('math', Jekyll::MathJaxBlockTag)
25 | Liquid::Template.register_tag('m', Jekyll::MathJaxInlineTag)
26 | Liquid::Template.register_tag('endmath', Jekyll::MathJaxEndBlockTag)
27 | Liquid::Template.register_tag('em', Jekyll::MathJaxEndInlineTag)


--------------------------------------------------------------------------------
/_plugins/newthought.rb:
--------------------------------------------------------------------------------
 1 | ## Newthought tag will render anything in the tag with small caps
 2 | ## Usage {% newthought Your text string here} will render a span
 3 | ## YOUR TEXT STRING HERE (sort of, you know, small caps) if you are using the tufte.css file
 4 | 
 5 | module Jekyll
 6 |   class RenderNewThoughtTag < Liquid::Tag
 7 | 
 8 | require "shellwords"
 9 | 
10 |     def initialize(tag_name, text, tokens)
11 |       super
12 |       @text = text.shellsplit
13 |     end
14 | 
15 | 
16 |     def render(context)
17 |       "<span class='newthought'>#{@text[0]}</span> "
18 |     end
19 |   end
20 | end
21 | 
22 | Liquid::Template.register_tag('newthought', Jekyll::RenderNewThoughtTag)


--------------------------------------------------------------------------------
/_plugins/sidenote.rb:
--------------------------------------------------------------------------------
 1 | module Jekyll
 2 |   class RenderSideNoteTag < Liquid::Tag
 3 | 
 4 |     require "shellwords"
 5 | 
 6 |     def initialize(tag_name, text, tokens)
 7 |       super
 8 |       @text = text.shellsplit
 9 |     end
10 | 
11 |     def render(context)
12 |       "<label for='#{@text[0]}' class='margin-toggle sidenote-number'></label><input type='checkbox' id='#{@text[0]}' class='margin-toggle'/><span class='sidenote'>#{@text[1]} </span>"
13 |     end
14 |   end
15 | end
16 | 
17 | Liquid::Template.register_tag('sidenote', Jekyll::RenderSideNoteTag)
18 | 
19 | 


--------------------------------------------------------------------------------
/_sass/_fonts.scss:
--------------------------------------------------------------------------------
 1 | // Font imports file. If you don't want these fonts, comment out these and add your own into the fonts directory 
 2 | // and point the src attribute to the file.
 3 | // 
 4 | 
 5 | @charset "UTF-8";
 6 | //
 7 | // @font-face {
 8 | //   font-family: ETBembo;
 9 | //   src: url("../fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.eot");
10 | //   src: url("../fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.woff") format("woff"), url("../fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.ttf") format("truetype"), url("../fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.svg#etbemboromanosf") format("svg");
11 | //   font-weight: normal;
12 | //   font-style: normal
13 | // }
14 | //
15 | // @font-face {
16 | //   font-family: ETBembo;
17 | //   src: url("../fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.eot");
18 | //   src: url("../fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.woff") format("woff"), url("../fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.ttf") format("truetype"), url("../fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.svg#etbemboromanosf") format("svg");
19 | //   font-weight: normal;
20 | //   font-style: italic
21 | // }
22 | //
23 | // @font-face {
24 | //   font-family: ETBembo;
25 | //   src: url("../fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.eot");
26 | //   src: url("../fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.woff") format("woff"), url("../fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.ttf") format("truetype"), url("../fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.svg#etbemboromanosf") format("svg");
27 | //   font-weight: bold;
28 | //   font-style: normal
29 | // }
30 | //
31 | // @font-face {
32 | //   font-family: ETBemboRomanOldStyle;
33 | //   src: url("../fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.eot");
34 | //   src: url("../fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.woff") format("woff"), url("../fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.ttf") format("truetype"), url("../fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.svg#etbemboromanosf") format("svg");
35 | //   font-weight: normal;
36 | //   font-style: normal;
37 | // }
38 | 
39 | 
40 | @font-face {
41 |   font-family: "et-book";
42 |   src: url("../fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot");
43 |   src: url("../fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff") format("woff"), url("../fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf") format("truetype"), url("../fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.svg#etbookromanosf") format("svg");
44 |   font-weight: normal;
45 |   font-style: normal
46 | }
47 | 
48 | @font-face {
49 |   font-family: "et-book";
50 |   src: url("../fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot");
51 |   src: url("../fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff") format("woff"), url("../fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf") format("truetype"), url("../fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.svg#etbookromanosf") format("svg");
52 |   font-weight: normal;
53 |   font-style: italic
54 | }
55 | 
56 | @font-face {
57 |   font-family: "et-book";
58 |   src: url("../fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot");
59 |   src: url("../fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff") format("woff"), url("../fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf") format("truetype"), url("../fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.svg#etbookromanosf") format("svg");
60 |   font-weight: bold;
61 |   font-style: normal
62 | }
63 | 
64 | @font-face {
65 |   font-family: "et-book-roman-old-style";
66 |   src: url("../fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot");
67 |   src: url("../fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot?#iefix") format("embedded-opentype"), url("../fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff") format("woff"), url("../fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf") format("truetype"), url("../fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.svg#etbookromanosf") format("svg");
68 |   font-weight: normal;
69 |   font-style: normal;
70 | }
71 | 


--------------------------------------------------------------------------------
/_sass/_settings.scss:
--------------------------------------------------------------------------------
 1 | /* This file contains all the constants for colors and font styles */
 2 | 
 3 | $body-font:   et-book, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif;
 4 | // $body-font:   ETBembo, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif;
 5 | // Note that Gill Sans is the top of the stack and corresponds to what is used in Tufte's books
 6 | // However, it is not a free font, so if it is not present on the computer that is viewing the webpage
 7 | // The free Google 'Lato' font is used instead. It is similar.
 8 | $sans-font:  "Gill Sans", "Gill Sans MT", "Lato", Calibri, sans-serif;
 9 | $code-font: Consolas, "Liberation Mono", Menlo, Courier, monospace;
10 | $url-font: "Lucida Console", "Lucida Sans Typewriter", Monaco, "Bitstream Vera Sans Mono", monospace;
11 | $text-color: #111;
12 | $bg-color: #fffff8;
13 | $contrast-color: #a00000;
14 | $border-color: #333333;
15 | $link-style: underline; // choices are 'color' or 'underline'. Default is color using $contrast-color set above
16 | 
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/_sass/_syntax-highlighting.scss:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Syntax highlighting styles
 3 |  */
 4 | $spacing-unit:     30px;
 5 | %vertical-rhythm {
 6 |     margin-bottom: $spacing-unit / 2;
 7 | }
 8 | 
 9 | .highlight {
10 |     background: #fffff8;
11 |     @extend %vertical-rhythm;
12 | 
13 |     .c     { color: #998; font-style: italic } // Comment
14 |     .err   { color: #a61717; background-color: #e3d2d2 } // Error
15 |     .k     { font-weight: bold } // Keyword
16 |     .o     { font-weight: bold } // Operator
17 |     .cm    { color: #998; font-style: italic } // Comment.Multiline
18 |     .cp    { color: #999; font-weight: bold } // Comment.Preproc
19 |     .c1    { color: #998; font-style: italic } // Comment.Single
20 |     .cs    { color: #999; font-weight: bold; font-style: italic } // Comment.Special
21 |     .gd    { color: #000; background-color: #fdd } // Generic.Deleted
22 |     .gd .x { color: #000; background-color: #faa } // Generic.Deleted.Specific
23 |     .ge    { font-style: italic } // Generic.Emph
24 |     .gr    { color: #a00 } // Generic.Error
25 |     .gh    { color: #999 } // Generic.Heading
26 |     .gi    { color: #000; background-color: #dfd } // Generic.Inserted
27 |     .gi .x { color: #000; background-color: #afa } // Generic.Inserted.Specific
28 |     .go    { color: #888 } // Generic.Output
29 |     .gp    { color: #555 } // Generic.Prompt
30 |     .gs    { font-weight: bold } // Generic.Strong
31 |     .gu    { color: #aaa } // Generic.Subheading
32 |     .gt    { color: #a00 } // Generic.Traceback
33 |     .kc    { font-weight: bold } // Keyword.Constant
34 |     .kd    { font-weight: bold } // Keyword.Declaration
35 |     .kp    { font-weight: bold } // Keyword.Pseudo
36 |     .kr    { font-weight: bold } // Keyword.Reserved
37 |     .kt    { color: #458; font-weight: bold } // Keyword.Type
38 |     .m     { color: #099 } // Literal.Number
39 |     .s     { color: #d14 } // Literal.String
40 |     .na    { color: #008080 } // Name.Attribute
41 |     .nb    { color: #0086B3 } // Name.Builtin
42 |     .nc    { color: #458; font-weight: bold } // Name.Class
43 |     .no    { color: #008080 } // Name.Constant
44 |     .ni    { color: #800080 } // Name.Entity
45 |     .ne    { color: #900; font-weight: bold } // Name.Exception
46 |     .nf    { color: #900; font-weight: bold } // Name.Function
47 |     .nn    { color: #555 } // Name.Namespace
48 |     .nt    { color: #000080 } // Name.Tag
49 |     .nv    { color: #008080 } // Name.Variable
50 |     .ow    { font-weight: bold } // Operator.Word
51 |     .w     { color: #bbb } // Text.Whitespace
52 |     .mf    { color: #099 } // Literal.Number.Float
53 |     .mh    { color: #099 } // Literal.Number.Hex
54 |     .mi    { color: #099 } // Literal.Number.Integer
55 |     .mo    { color: #099 } // Literal.Number.Oct
56 |     .sb    { color: #d14 } // Literal.String.Backtick
57 |     .sc    { color: #d14 } // Literal.String.Char
58 |     .sd    { color: #d14 } // Literal.String.Doc
59 |     .s2    { color: #d14 } // Literal.String.Double
60 |     .se    { color: #d14 } // Literal.String.Escape
61 |     .sh    { color: #d14 } // Literal.String.Heredoc
62 |     .si    { color: #d14 } // Literal.String.Interpol
63 |     .sx    { color: #d14 } // Literal.String.Other
64 |     .sr    { color: #009926 } // Literal.String.Regex
65 |     .s1    { color: #d14 } // Literal.String.Single
66 |     .ss    { color: #990073 } // Literal.String.Symbol
67 |     .bp    { color: #999 } // Name.Builtin.Pseudo
68 |     .vc    { color: #008080 } // Name.Variable.Class
69 |     .vg    { color: #008080 } // Name.Variable.Global
70 |     .vi    { color: #008080 } // Name.Variable.Instance
71 |     .il    { color: #099 } // Literal.Number.Integer.Long
72 | }
73 | 


--------------------------------------------------------------------------------
/autoregressive/autoregressive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/autoregressive/autoregressive.png


--------------------------------------------------------------------------------
/autoregressive/fvsbn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/autoregressive/fvsbn.png


--------------------------------------------------------------------------------
/autoregressive/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: post
  3 | title: Autoregressive Models
  4 | ---
  5 | 
  6 | We begin our study into generative modeling with autoregressive models. As before, we assume we are given access to a dataset $$\mathcal{D}$$ of $$n$$-dimensional datapoints $$\mathbf{x}$$. For simplicity, we assume the datapoints are binary, i.e., $$\mathbf{x} \in \{0,1\}^n$$.
  7 | 
  8 | Representation
  9 | ==============
 10 | 
 11 | By the chain rule of probability, we can factorize the joint distribution over the $$n$$-dimensions as 
 12 | 
 13 | {% math %}
 14 | p(\mathbf{x}) = \prod\limits_{i=1}^{n}p(x_i \vert x_1, x_2, \ldots, x_{i-1}) = 
 15 | \prod\limits_{i=1}^{n} p(x_i \vert \mathbf{x}_{< i } )
 16 | {% endmath %}
 17 | 
 18 | where $$\mathbf{x}_{< i}=[x_1, x_2, \ldots, x_{i-1}]$$ denotes the vector of random variables with index less than $$i$$. 
 19 | 
 20 | The chain rule factorization can be expressed graphically as a Bayesian network.
 21 | 
 22 | 
 23 | <figure>
 24 | <img src="autoregressive.png" alt="drawing" width="400" class="center"/>
 25 | <figcaption>
 26 | Graphical model for an autoregressive Bayesian network with no conditional independence assumptions.
 27 |  </figcaption>
 28 | </figure>
 29 | 
 30 | Such a Bayesian network that makes no conditional independence assumptions is said to obey the *autoregressive* property.
 31 | The term *autoregressive* originates from the literature on time-series models where observations from the previous time-steps are used to predict the value at the current time step. Here, we fix an ordering of the variables $$x_1, x_2, \ldots, x_n$$ and the distribution for the $$i$$-th random variable depends on the values of all the preceding random variables in the chosen ordering $$x_1, x_2, \ldots, x_{i-1}$$.
 32 | 
 33 | If we allow for every conditional $$p(x_i \vert \mathbf{x}_{< i})$$ to be specified in a tabular form, then such a representation is fully general and can represent any possible distribution over $$n$$ random variables. However, the space complexity for such a representation grows exponentially with $$n$$.
 34 | 
 35 | To see why, let us consider the conditional for the last dimension, given by $$p(x_n \vert \mathbf{x}_{< n})$$. In order to fully specify this conditional, we need to specify a probability distribution for each of the $$2^{n-1}$$ configurations of the variables $$x_1, x_2, \ldots, x_{n-1}$$. For any one of the $$2^{n-1}$$ possible configurations of the variables, the probabilities should sum to one. Therefore, we need only one parameter for each configuration, so the total number of parameters for specifying this conditional is given by $$2^{n-1}$$. Hence, a tabular representation for the conditionals is impractical for learning the joint distribution factorized via chain rule.
 36 | 
 37 | In an *autoregressive generative model*, the conditionals are specified as parameterized functions with a fixed number of parameters. That is, we assume the conditional distributions $$p(x_i \vert \mathbf{x}_{< i})$$ to correspond to a Bernoulli random variable and learn a function that maps the preceding random variables $$x_1, x_2, \ldots, x_{i-1}$$ to the
 38 | mean of this distribution. Hence, we have
 39 | {% math %}
 40 | p_{\theta_i}(x_i \vert \mathbf{x}_{< i}) = \mathrm{Bern}(f_i(x_1, x_2, \ldots, x_{i-1}))
 41 | {% endmath %}
 42 | where $$\theta_i$$ denotes the set of parameters used to specify the mean
 43 | function $$f_i: \{0,1\}^{i-1}\rightarrow [0,1]$$. 
 44 | 
 45 | 
 46 | The number of parameters of an autoregressive generative model are given by $$\sum_{i=1}^n \vert \theta_i \vert$$. As we shall see in the examples below, the number of parameters are much fewer than the tabular setting considered previously. Unlike the tabular setting however, an autoregressive generative model cannot represent all possible distributions. Its expressiveness is limited by the fact that we are limiting the conditional distributions to correspond to a Bernoulli random variable with the mean specified via a restricted class of parameterized functions.
 47 | 
 48 | <figure>
 49 | <img src="fvsbn.png" alt="drawing" width="200" class="center"/>
 50 | <figcaption>
 51 |  A fully visible sigmoid belief network over four variables. The conditionals are denoted by \(\widehat{x}_1, \widehat{x}_2, \widehat{x}_3, \widehat{x}_4\) respectively.
 52 |  </figcaption>
 53 | </figure>
 54 | In the simplest case, we can specify the function as a linear combination of the input elements followed by a sigmoid non-linearity (to restrict the output to lie between 0 and 1). This gives us the formulation of a *fully-visible sigmoid belief network* ([FVSBN](https://papers.nips.cc/paper/1153-does-the-wake-sleep-algorithm-produce-good-density-estimators.pdf)).
 55 | 
 56 | {% math %}
 57 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\alpha^{(i)}_0 + \alpha^{(i)}_1 x_1 + \ldots + \alpha^{(i)}_{i-1} x_{i-1})
 58 | {% endmath %} 
 59 | 
 60 | where $$\sigma$$ denotes the sigmoid function and $$\theta_i=\{\alpha^{(i)}_0,\alpha^{(i)}_1, \ldots, \alpha^{(i)}_{i-1}\}$$ denote the parameters of the mean function. The conditional for variable $$i$$ requires $$i$$ parameters, and hence the total number of parameters in the model is given by $$\sum_{i=1}^ni= O(n^2)$$. Note that the number of parameters are much fewer than the exponential complexity of the tabular case.
 61 | 
 62 | A natural way to increase the expressiveness of an autoregressive generative model is to use more flexible parameterizations for the mean function e.g., multi-layer perceptrons (MLP). For example, consider the case of a neural network with 1 hidden layer. The mean function for variable $$i$$ can be expressed as
 63 | 
 64 | {% math %}
 65 | \mathbf{h}_i = \sigma(A_i \mathbf{x_{< i}} + \mathbf{c}_i)\\
 66 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )
 67 | {% endmath %}
 68 | 
 69 | where $$\mathbf{h}_i \in \mathbb{R}^d$$ denotes the hidden layer activations for the MLP and $$\theta_i = \{A_i \in \mathbb{R}^{d\times (i-1)},  \mathbf{c}_i \in \mathbb{R}^d, \boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d, b_i \in \mathbb{R}\}$$
 70 | are the set of parameters for the mean function $$\mu_i(\cdot)$$. The total number of parameters in this model is dominated by the matrices $$A_i$$ and given by $$O(n^2 d)$$.
 71 | 
 72 | 
 73 | <figure>
 74 | <img src="nade.png" alt="drawing" width="200" class="center"/>
 75 | <figcaption>
 76 |  A neural autoregressive density estimator over four variables. The conditionals are denoted by \(\widehat{x}_1, \widehat{x}_2, \widehat{x}_3, \widehat{x}_4\) respectively. The blue connections denote the tied weights \(W[., i]\) used for computing the hidden layer activations.
 77 |  </figcaption>
 78 | </figure>
 79 | 
 80 | The *Neural Autoregressive Density Estimator* ([NADE](http://proceedings.mlr.press/v15/larochelle11a/larochelle11a.pdf)) provides an alternate MLP-based parameterization that is more statistically and computationally efficient than the vanilla approach. In NADE, parameters are shared across the functions used for evaluating the conditionals. In particular, the hidden layer activations are specified as
 81 | 
 82 | {% math %}
 83 | \mathbf{h}_i = \sigma(W_{., < i} \mathbf{x_{< i}} + \mathbf{c})\\
 84 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )
 85 | {% endmath %}
 86 | where $$\theta=\{W\in \mathbb{R}^{d\times n}, \mathbf{c} \in \mathbb{R}^d, \{\boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d\}^n_{i=1}, \{b_i \in \mathbb{R}\}^n_{i=1}\}$$is
 87 | the full set of parameters for the mean functions $$f_1(\cdot), f_2(\cdot), \ldots, f_n(\cdot)$$. The weight matrix $$W$$ and the bias vector $$\mathbf{c}$$ are shared across the conditionals. Sharing parameters offers two benefits:
 88 | 
 89 | 1.  The total number of parameters gets reduced from $$O(n^2 d)$$ to $$O(nd)$$ \[readers are encouraged to check!\].
 90 | 
 91 | 2.  The hidden unit activations can be evaluated in $$O(nd)$$ time via the following recursive strategy:
 92 |     {% math %}
 93 |     \mathbf{h}_i = \sigma(\mathbf{a}_i)\\
 94 |     \mathbf{a}_{i+1} = \mathbf{a}_{i} + W[., i]x_i
 95 |     {% endmath %}
 96 |     with the base case given by $$\mathbf{a}_1=\mathbf{c}$$.
 97 | 
 98 | 
 99 | ###  Extensions to NADE
100 | 
101 | The [RNADE](https://arxiv.org/abs/1306.0186) algorithm extends NADE to learn generative models over real-valued data. Here, the conditionals are modeled via a continuous distribution such as a equi-weighted mixture of $$K$$ Gaussians. Instead of learning a mean function, we now learn the means $$\mu_{i,1}, \mu_{i,2},\ldots, \mu_{i,K}$$ and variances $$\Sigma_{i,1}, \Sigma_{i,2},\ldots, \Sigma_{i,K}$$ of the $$K$$ Gaussians for every conditional. For statistical and computational efficiency, a single function $$g_i: \mathbb{R}^{i-1}\rightarrow\mathbb{R}^{2K}$$ outputs all the means and variances of the $$K$$ Gaussians for the $$i$$-th conditional distribution.
102 | 
103 | Notice that NADE requires specifying a single, fixed ordering of the variables. The choice of ordering can lead to different models. The [EoNADE](https://arxiv.org/abs/1310.1757) algorithm allows training an ensemble of NADE models with different orderings.
104 | 
105 | Learning and inference
106 | ======================
107 | 
108 | Recall that learning a generative model involves optimizing the closeness between the data and model distributions. One commonly used notion of closeness in the KL divergence between the data and the model distributions.
109 | 
110 | {% math %}
111 | \min_{\theta\in \mathcal{M}}d_{KL}
112 | (p_{\mathrm{data}}, p_{\theta}) = \mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\mathrm{data}}(\mathbf{x}) - \log p_{\theta}(\mathbf{x})\right]
113 | {% endmath %}
114 | 
115 | Before moving any further, we make two comments about the KL divergence. First, we note that the KL divergence between any two distributions is asymmetric. As we navigate through this chapter, the reader is encouraged to think what could go wrong if we decided to optimize the reverse KL divergence instead. Secondly, the KL divergences heavily penalizes any model distribution $$p_\theta$$ which assigns low probability to a datapoint that is likely to be sampled under $$p_{\mathrm{data}}$$. In the extreme case, if the density $$p_\theta(\mathbf{x})$$ evaluates to zero for a datapoint sampled from $$p_{\mathrm{data}}$$, the objective evaluates to $$+\infty$$.
116 | 
117 | Since $$p_{\mathrm{data}}$$ does not depend on $$\theta$$, we can equivalently recover the optimal parameters via maximizing likelihood estimation.
118 | 
119 | {% math %}
120 | \max_{\theta\in \mathcal{M}}\mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\theta}(\mathbf{x})\right].
121 | {% endmath %}
122 | 
123 | Here, $$\log p_{\theta}(\mathbf{x})$$ is referred to as the log-likelihood of the datapoint $$\mathbf{x}$$ with respect to the model distribution $$p_\theta$$.
124 | 
125 | To approximate the expectation over the unknown $$p_{\mathrm{data}}$$, we make an assumption: points in the dataset $$\mathcal{D}$$ are sampled i.i.d. from $$p_{\mathrm{data}}$$. This allows us to obtain an unbiased Monte Carlo estimate of the objective as
126 | 
127 | {% math %}
128 | \max_{\theta\in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\log p_{\theta}(\mathbf{x}) = \mathcal{L}(\theta \vert \mathcal{D}).
129 |  {% endmath %}
130 | 
131 | 
132 | The maximum likelihood estimation (MLE) objective has an intuitive interpretation: pick the model parameters $$\theta \in \mathcal{M}$$ that maximize the log-probability of the observed datapoints in $$\mathcal{D}$$.
133 | 
134 | In practice, we optimize the MLE objective using mini-batch gradient ascent. The algorithm operates in iterations. At every iteration $$t$$, we sample a mini-batch $$\mathcal{B}_t$$ of datapoints sampled randomly from the dataset ($$\vert \mathcal{B}_t\vert < \vert \mathcal{D} \vert$$) and compute gradients of the objective evaluated for the mini-batch. These parameters at iteration $$t+1$$ are then given via the following update rule
135 | {% math %}
136 | \theta^{(t+1)} = \theta^{(t)} + r_t \nabla_\theta\mathcal{L}(\theta^{(t)} \vert \mathcal{B}_t)
137 | {% endmath %}
138 | 
139 | where $$\theta^{(t+1)}$$ and $$\theta^{(t)}$$ are the parameters at iterations $$t+1$$ and $$t$$ respectively, and $$r_t$$ is the learning rate at iteration $$t$$. Typically, we only specify the initial learning rate $$r_1$$ and update the rate based on a schedule. [Variants](http://cs231n.github.io/optimization-1/) of stochastic gradient ascent, such as RMS prop and Adam, employ modified update rules that work slightly better in practice.
140 | 
141 | From a practical standpoint, we must think about how to choose hyperparameters (such as the initial learning rate) and a stopping criteria for the gradient descent. For both these questions, we follow the standard practice in machine learning of monitoring the objective on a validation dataset. Consequently, we choose the hyperparameters with the best performance on the validation dataset and stop updating the parameters when the validation log-likelihoods cease to improve[^1].
142 | 
143 | Now that we have a well-defined objective and optimization procedure, the only remaining task is to evaluate the objective in the context of an autoregressive generative model. To this end, we substitute the factorized joint distribution of an autoregressive model in the MLE objective to get
144 | 
145 | {% math %}
146 | \max_{\theta \in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\sum_{i=1}^n\log p_{\theta_i}(x_i \vert \mathbf{x}_{< i})
147 | {% endmath %}
148 | 
149 | where $$\theta = \{\theta_1, \theta_2, \ldots, \theta_n\}$$ now denotes the
150 | collective set of parameters for the conditionals.
151 | 
152 | Inference in an autoregressive model is straightforward. For density estimation of an arbitrary point $$\mathbf{x}$$, we simply evaluate the log-conditionals $$\log p_{\theta_i}(x_i \vert \mathbf{x}_{< i})$$ for each $$i$$ and add these up to obtain the log-likelihood assigned by the model to $$\mathbf{x}$$. Since we know conditioning vector $$\mathbf{x}$$, each of the conditionals can be evaluated in parallel. Hence, density estimation is efficient on modern hardware.
153 | 
154 | Sampling from an autoregressive model is a sequential procedure. Here, we first sample $$x_1$$, then we sample $$x_2$$ conditioned on the sampled $$x_1$$, followed by $$x_3$$ conditioned on both $$x_1$$ and $$x_2$$ and so on until we sample $$x_n$$ conditioned on the previously sampled $$\mathbf{x}_{< n}$$. For applications requiring real-time generation of high-dimensional data such as audio synthesis, the sequential sampling can be an expensive process. Later in this course, we will discuss how parallel WaveNet, an autoregressive model sidesteps this expensive sampling process.
155 | 
156 | <!-- TODO: add NADE samples figure -->
157 | 
158 | Finally, an autoregressive model does not directly learn unsupervised representations of the data. In the next few set of lectures, we will look at latent variable models (e.g., variational autoencoders) which explicitly learn latent representations of the data.
159 | 
160 | <!-- 
161 | 
162 | Additional parameterizations
163 | ==============
164 | Coming soon: MADE, Char-RNN, Pixel-CNN, Wavenet -->
165 | 
166 | Footnotes
167 | ==============
168 | 
169 | [^1]: Given the non-convex nature of such problems, the optimization procedure can get stuck in local optima. Hence, early stopping will generally not be optimal but is a very practical strategy.
170 | 


--------------------------------------------------------------------------------
/autoregressive/index.tex:
--------------------------------------------------------------------------------
  1 | \section{Autoregressive Models}
  2 | 
  3 | We begin our study with the autoregressive generative models. As before, we assume we are given access to a dataset $\mathcal{D}$  of $n$-dimensional datapoints $\mathbf{x}$. For simplicity, we assume the datapoints are binary, i.e.,  $\mathbf{x} \in \{0,1\}^n$.
  4 | 
  5 | \section{Representation}
  6 | 
  7 | By the chain rule of probability, we can factorize the joint distribution over the $n$-dimensions as:
  8 | \[
  9 | \begin{equation}
 10 | p(\mathbf{x}) = \prod\limits_{i=1}^{n}p(x_i \vert x_1, x_2, \ldots, x_{i-1}) = \prod\limits_{i=1}^{n} p(x_i \vert \mathbf{x}_{<i})
 11 | \end{equation}
 12 | \label{eq:chain_rule}
 13 | \]
 14 | where $\mathbf{x}_{<i}=[x_1, x_2, \ldots, x_{i-1}]$ denotes the vector of random variables with index less than $i$. If we allow for every conditional $p(x_i \vert \mathbf{x}_{<i})$ to be specified in a tabular form, then such a representation is fully general and can represent any possible distribution over $n$ random variables. However, the space complexity for such a representation grows exponentially with $n$. 
 15 | 
 16 | To see why, let us consider the conditional for the last dimension, given by $p(x_n \vert \mathbf{x}_{<n})$. In order to fully specify this conditional, we need to specify a probability for $2^{n-1}$ configurations of the variables $x_1, x_2, \ldots, x_{n-1}$.  Since $x_n$ is a binary variable we need to specify one parameter per configuration $x_1, x_2, \ldots, x_{n-1}$ : namely $p(x_n = 1 | x_1, x_2, \ldots, x_{n-1})$. In total, the number of parameters needed to specify this conditional is given by $2^{n-1}$. Hence, a tabular representation for the conditionals is impractical for learning the joint distribution in (\ref{eq:chain_rule}) . 
 17 | 
 18 | In an \textit{autoregressive generative model}, the conditionals are specified as parameterized functions with a fixed number of parameters. That is, we assume the conditional distributions $p(x_i \vert \mathbf{x}_{<i})$ to correspond to a Bernoulli random variable and learn a function that maps the preceeding random variables $x_1, x_2, \ldots, x_{i-1}$ to the mean of this distribution. Hence, we have:
 19 | \[
 20 | p_{\theta_i}(x_i \vert \mathbf{x}_{<i}) = \mathrm{Bern}(f_i(x_1, x_2, \ldots, x_{i-1}))
 21 | \]
 22 | where $\theta_i$ denotes the set of parameters used to specify the mean function $f_i: \{0,1\}^{i-1}\rightarrow [0,1]$.  The term \textit{autoregressive} originates from the literature on time-series models where observations from the previous time-steps are used to predict the value at the current time step. Here, we are predicting the distribution for the $i$-th random variable using the values of the preceeding random variables in the sequence $x_1, x_2, \ldots, x_n$.
 23 | 
 24 | The number of parameters of an autoregressive generative model are given by $\sum_{i=1}^n \vert \theta_i \vert$. As we shall see in the examples below, the number of parameters are much fewer than the tabular setting considered previously. Unlike the tabular setting however, an autoregressive generative model cannot represent all possible distributions. Its expressiveness is limited by the fact that we are limiting the conditional distributions to correspond to a Bernoulli random variable with a restricted class of parameterized functions specifying the mean.
 25 | 
 26 | In the simplest case, we can specify the function as a linear combination of the input elements followed by a sigmoid non-linearity (to restrict the output to lie between 0 and 1). This gives us the formulation of a \textit{fully-visible sigmoid belief network} (FVSBN):
 27 | \[
 28 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\alpha^{(i)}_0 + \alpha^{(i)}_1 x_1 + \ldots + \alpha^{(i)}_{i-1} x_{i-1})  
 29 | \]
 30 | where $\sigma$ denotes the sigmoid function and $\theta_i=\{\alpha^{(i)}_0,\alpha^{(i)}_1, \ldots, \alpha^{(i)}_{i-1}\}$ denote the parameters of the mean function. The conditional for variable $i$ requires $i$
 31 |  parameters, and hence the total number of parameters in the model is given by $\sum_{i=1}^ni= O(n^2)$.  Note that the number of parameters are much fewer than the exponential parameters required in the tabular case.
 32 | 
 33 | A natural way to increase the expressiveness of an autoregressive generative model is to use more flexible parameterizations for the mean function e.g., multi-layer perceptrons (MLP). In the case of 1-hidden layer neural networks, the mean function for variable $i$ can be expressed as:
 34 | \[
 35 | \mathbf{h}_i = \sigma(A_i \mathbf{x_{<i}} + \mathbf{c}_i)\\
 36 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 37 | \]
 38 | where $\mathbf{h}_i \in \mathbb{R}^d$ denotes the hidden layer activations for the MLP and$\theta_i = \{A_i \in \mathbb{R}^{d\times (i-1)},  \mathbf{c}_i \in \mathbb{R}^d, \boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d, b_i \in \mathbb{R}\}$ are the set of parameters for the mean function $f_i(\cdot)$.  The total number of parameters in this model is dominated by the matrices $A_i$ and given by $O(n^2 d)$. 
 39 | 
 40 | The Neural Autoregressive Density Estimation (NADE) provides an efficient MLP parameterization that shares parameters used for evaluating the hidden layer activations.
 41 | \[
 42 | \mathbf{h}_i = \sigma(W_{., <i} \mathbf{x_{<i}} + \mathbf{c})\\
 43 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 44 | \]
 45 | where $\theta=\{W\in \mathbb{R}^{d\times n}, \mathbf{c} \in \mathbb{R}^d, \{\boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d\}^n_{i=1}, \{b_i \in \mathbb{R}\}^n_{i=1}\}$is the full set of parameters for the mean functions $f_1(\cdot), f_2(\cdot), \ldots, f_n(\cdot)$. The weight matrix $W$ and the bias vector $\mathbf{c}$ are shared across the conditionals. Sharing parameters offers two benefits:
 46 | \begin{enumerate}
 47 | \item The total number of parameters from $O(n^2 d)$ to $O(nd)$ [readers are encouraged to check!].
 48 | \item The hidden unit activations can be evaluated in $O(nd)$ time via the following recursive strategy:
 49 | \[
 50 | \mathbf{h}_i = \sigma(\mathbf{a}_i)\\
 51 | \mathbf{a}_{i+1} = \mathbf{a}_{i} + W[., i]x_i
 52 | \]
 53 | with the base case given by $\mathbf{a}_1=\mathbf{c}$.
 54 | \end{enumerate}
 55 | 
 56 | 
 57 | \section{Learning and inference}
 58 | 
 59 | Recall that learning a generative model involves optimizing the closeness between the data and model distributions. One commonly used notion of closeness is the KL divergence between the data and the model distributions.
 60 | 
 61 | $$
 62 | \begin{align*}
 63 | \min_{\theta\in \mathcal{M}}d_{KL}(p_{\mathrm{data}}, p_{\theta}) &= \mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\mathrm{data}}(\mathbf{x}) - \log p_{\theta}(\mathbf{x})\right].
 64 | \end{align*}
 65 | $$
 66 | Before moving any further, we make two comments about the KL divergence. First, we note that the KL divergence between any two distributions is asymmetric. As we navigate through this chapter, the reader is encouraged to think what could go wrong if we decided to optimize the reverse KL divergence instead. Secondly, the KL divergences heavily penalizes model distribution $p_\theta$ which place little mass on any datapoint that has a non-zero probability under $p_{\mathrm{data}}$. In the extreme case, if the density $p_\theta(\mathbf{x})$ evaluates to zero for a datapoint sampled from $p_{\mathrm{data}}$, the objective evaluates to $+\infty$. 
 67 | 
 68 | Since $p_{\mathrm{data}}$ does not depend on $\theta$, we can equivalently recover the optimal parameters via maximizing likelihood estimation:
 69 | 
 70 | $$
 71 | \begin{align*}
 72 | \max_{\theta\in \mathcal{M}}\mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\theta}(\mathbf{x})\right].
 73 | \end{align*}
 74 | $$
 75 | Here, $\log p_{\theta}(\mathbf{x})$ is referred to as the log-likelihood of the datapoint $\mathbf{x}$ with respect to the model distribution $p_\theta$. 
 76 | 
 77 | To approximate the expectation over the unknown $p_{\mathrm{data}}$, we make an assumption: points in the dataset $\mathcal{D}$ are sampled i.i.d. from $p_{\mathrm{data}}$. This allows us to obtain an unbiased Monte Carlo estimate of the objective:
 78 | 
 79 | $$
 80 | \begin{align}
 81 | \max_{\theta\in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\log p_{\theta}(\mathbf{x}) = \mathcal{L}(\theta \vert \mathcal{D}).
 82 | \end{align}
 83 | \label{eq:mle}
 84 | \tag{2}
 85 | $$
 86 | 
 87 | The maximum likelihood estimation (MLE) objective has an intuitive interpretation: pick the model parameters $\theta \in \mathcal{M}$ that maximize the log-probability of the observed datapoints in $\mathcal{D}$. 
 88 | 
 89 | In practice, we optimize the MLE objective using mini-batch gradient ascent. The algorithm operates in iterations. At every iteration $t$, we sample a mini-batch $\mathcal{B}_t$  of datapoints sampled randomly from the dataset ($\vert \mathcal{B}_t\vert < \vert \mathcal{D} \vert$) and compute gradients of the objective evaluated for the mini-batch. These parameters at iteration $t+1$ are then given via the following update rule:
 90 | \[
 91 | \theta^{(t+1)} = \theta^{(t)} + r_t \nabla_\theta\mathcal{L}(\theta^{(t)} \vert \mathcal{B}_t)
 92 | \]
 93 | where $\theta^{(t+1)}$ and $\theta^{(t)}$ are the parameters at iterations $t+1$ and $t$ respectively, and $r_t$ is the learning rate at iteration $t$.  Typically, we only specify the initial learning rate $r_1$ and update the rate based on a schedule.  [Variants](http://cs231n.github.io/optimization-1/) of stochastic gradient ascent, such as RMS prop and Adam, employ modified update rules that work slightly better in practice. 
 94 | 
 95 | From a practical standpoint, we must think about how to choose hyperaparameters (such as the initial learning rate) and a stopping criteria for the gradient descent. For both these questions, we follow the standard practice in machine learning of monitoring the objective on a validation dataset. Consequently, we choose the hyperparameters with the best performance on the validation dataset and stop updating the parameters when the validation log-likelihoods stop improving.
 96 | 
 97 | Now that we have a well-defined objective and optimization procedure, the only remaining task is to evaluate the objective in the context of an autoregressive generative model. To this end, we substitute the factorization of the joint distribution in Eq.$~\ref{eq:chain_rule}$
 98 | in the MLE objective in Eq.$~\ref{eq:mle}$ to get:
 99 | \[
100 | \max_{\theta \in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\sum_{i=1}^n\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})
101 | \]where $\theta = \{\theta_1, \theta_2, \ldots, \theta_n\}$ now denotes the collective set of parameters for the conditionals.
102 | 
103 | Inference in an autoregressive model is straightforward. For density estimation of an arbitrary point $\mathbf{x}$, we simply evaluate the log-conditionals $\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})$ for each $i$ and add these up to obtain the log-likelihood assigned by the model to $\mathbf{x}$. Since we know conditioning vector $\mathbf{x}$, each of the conditionals can be evaluated in parallel. Hence, density estimation is efficient on modern hardware.
104 | 
105 | Sampling from an autoregressive model is a sequential procedure. Here, we first sample $x_1$, then we sample $x_2$ conditioned on the sampled $x_1$, followed by $x_3$ conditioned on both $x_1$ and $x_2$ and so on until we sample $x_n$ conditioned on the previously sampled $\mathbf{x}_{<n}$. For applications requiring real-time generation of high-dimensional data such as audio synthesis, the sequential sampling can be an expensive process.
106 | 
107 | 
108 | TODO: add NADE samples figure
109 | 
110 | Finally, an autoregressive model does not directly learn unsupervised representations of the data. In the next few set of lectures, we will look at latent variable models (e.g., variational autoencoders) which explicitly learn latent representations of the data.
111 | 
112 | 
113 | TODO: Autoregressive generative models based on Autoencoders, RNNs, and CNNs.
114 | MADE, Char-RNN, Pixel-CNN, Wavenet
115 | 


--------------------------------------------------------------------------------
/autoregressive/nade.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/autoregressive/nade.png


--------------------------------------------------------------------------------
/css/tufte.css:
--------------------------------------------------------------------------------
  1 | ---
  2 | # this ensures Jekyll reads the file to be transformed into CSS later
  3 | # only Main files contain this front matter, not partials.
  4 | nav_exclude: true
  5 | ---
  6 | /*****************************************************************************
  7 | /*
  8 | /* Tufte Jekyll blog theme
  9 | /* Based on Tufte CSS by Dave Liepmann ( https://github.com/edwardtufte/tufte-
 10 | /*
 11 | /* The README.md will show you how to set up your site along with other goodie
 12 | /*****************************************************************************/
 13 | 
 14 | // Imports to create final
 15 | 
 16 | @import "../_sass/fonts";
 17 | @import "../_sass/settings";
 18 | @import "../_sass/syntax-highlighting";
 19 | 
 20 | /* Tufte CSS styles */
 21 | html { font-size: 15px; }
 22 | 
 23 | body { width: 87.5%;
 24 |        margin-left: auto;
 25 |        margin-right: auto;
 26 |        padding-left: 12.5%;
 27 |        font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif;
 28 |        background-color: #fffff8;
 29 |        color: #111;
 30 |        max-width: 1400px;
 31 |        counter-reset: sidenote-counter; }
 32 | 
 33 | h1 { font-weight: 400;
 34 |      margin-top: 4rem;
 35 |      margin-bottom: 1.5rem;
 36 |      font-size: 3.2rem;
 37 |      line-height: 1; }
 38 | 
 39 | h2 { font-style: italic;
 40 |      font-weight: 400;
 41 |      margin-top: 2.1rem;
 42 |      margin-bottom: 0;
 43 |      font-size: 2.2rem;
 44 |      line-height: 1; }
 45 | 
 46 | h3 { font-style: italic;
 47 |      font-weight: 400;
 48 |      font-size: 1.7rem;
 49 |      margin-top: 2rem;
 50 |      margin-bottom: 0;
 51 |      line-height: 1; }
 52 | 
 53 | p.subtitle { font-style: italic;
 54 |              margin-top: 1rem;
 55 |              margin-bottom: 1rem;
 56 |              font-size: 1.8rem;
 57 |              display: block;
 58 |              line-height: 1; }
 59 | 
 60 | .numeral { font-family: et-book-roman-old-style; }
 61 | 
 62 | .danger { color: red; }
 63 | 
 64 | article { position: relative;
 65 |           padding: 5rem 0rem; }
 66 | 
 67 | section { padding-top: 1rem;
 68 |           padding-bottom: 1rem; }
 69 | 
 70 | p, ol, ul { font-size: 1.4rem; }
 71 | 
 72 | p { line-height: 2rem;
 73 |     margin-top: 1.4rem;
 74 |     margin-bottom: 1.4rem;
 75 |     padding-right: 0;
 76 |     vertical-align: baseline; }
 77 | 
 78 | /* Chapter Epigraphs */
 79 | div.epigraph { margin: 5em 0; }
 80 | 
 81 | div.epigraph > blockquote { margin-top: 3em;
 82 |                             margin-bottom: 3em; }
 83 | 
 84 | div.epigraph > blockquote, div.epigraph > blockquote > p { font-style: italic; }
 85 | 
 86 | div.epigraph > blockquote > footer { font-style: normal; }
 87 | 
 88 | div.epigraph > blockquote > footer > cite { font-style: italic; }
 89 | 
 90 | /* end chapter epigraphs styles */
 91 | 
 92 | blockquote { font-size: 1.4rem; }
 93 | 
 94 | blockquote p { width: 50%; }
 95 | 
 96 | blockquote footer { width: 50%;
 97 |                     font-size: 1.1rem;
 98 |                     text-align: right; }
 99 | 
100 | ol, ul { width: 45%;
101 |          -webkit-padding-start: 5%;
102 |          -webkit-padding-end: 5%; }
103 | 
104 | li { padding: 0.5rem 0; }
105 | 
106 | figure { padding: 0;
107 |          border: 0;
108 |          font-size: 100%;
109 |          font: inherit;
110 |          vertical-align: baseline;
111 |          max-width: 55%;
112 |          -webkit-margin-start: 0;
113 |          -webkit-margin-end: 0;
114 |          margin: 0 0 3em 0; }
115 | 
116 | figcaption { float: right;
117 |              clear: right;
118 |              margin-right: -48%;
119 |              margin-top: 0;
120 |              margin-bottom: 0;
121 |              font-size: 1.1rem;
122 |              line-height: 1.6;
123 |              vertical-align: baseline;
124 |              position: relative;
125 |              max-width: 40%; }
126 | 
127 | figure.fullwidth figcaption { margin-right: 24%; }
128 | 
129 | /* Links: replicate underline that clears descenders */
130 | a:link, a:visited { color: inherit; }
131 | 
132 | a:link { text-decoration: none;
133 |          background: -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#333, #333);
134 |          background: linear-gradient(#fffff8, #fffff8), linear-gradient(#fffff8, #fffff8), linear-gradient(#333, #333);
135 |          -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
136 |          -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
137 |          background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
138 |          background-repeat: no-repeat, no-repeat, repeat-x;
139 |          text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, -0.15em 0 #fffff8;
140 |          background-position: 0% 93%, 100% 93%, 0% 93%; }
141 | 
142 | @media screen and (-webkit-min-device-pixel-ratio: 0) { a:link { background-position-y: 87%, 87%, 87%; } }
143 | 
144 | a:link::selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe;
145 |                     background: #b4d5fe; }
146 | 
147 | a:link::-moz-selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe;
148 |                          background: #b4d5fe; }
149 | 
150 | /* Sidenotes, margin notes, figures, captions */
151 | img { max-width: 100%; }
152 | 
153 | .sidenote, .marginnote { float: right;
154 |                          clear: right;
155 |                          margin-right: -60%;
156 |                          width: 50%;
157 |                          margin-top: 0;
158 |                          margin-bottom: 0;
159 |                          font-size: 1.1rem;
160 |                          line-height: 1.3;
161 |                          vertical-align: baseline;
162 |                          position: relative; }
163 | 
164 | .table-caption { float:right;
165 |                  clear:right;
166 |                  margin-right: -60%;
167 |                  width: 50%;
168 |                  margin-top: 0;
169 |                  margin-bottom: 0;
170 |                  font-size: 1.0rem;
171 |                  line-height: 1.6; }
172 | 
173 | .sidenote-number { counter-increment: sidenote-counter; }
174 | 
175 | .sidenote-number:after, .sidenote:before { content: counter(sidenote-counter) " ";
176 |                                            font-family: et-book-roman-old-style;
177 |                                            position: relative;
178 |                                            vertical-align: baseline; }
179 | 
180 | .sidenote-number:after { content: counter(sidenote-counter);
181 |                          font-size: 1rem;
182 |                          top: -0.5rem;
183 |                          left: 0.1rem; }
184 | 
185 | .sidenote:before { content: counter(sidenote-counter) " ";
186 |                    top: -0.5rem; }
187 | 
188 | p, footer, table, div.table-wrapper-small, div.supertable-wrapper > p, div.booktabs-wrapper { width: 55%; }
189 | 
190 | div.fullwidth, table.fullwidth { width: 100%; }
191 | 
192 | div.table-wrapper { overflow-x: auto;
193 |                     font-family: "Trebuchet MS", "Gill Sans", "Gill Sans MT", sans-serif; }
194 | 
195 | @media screen and (max-width: 760px) { p, footer { width: 90%; }
196 |                                        pre.code { width: 87.5%; }
197 |                                        ul { width: 85%; }
198 |                                        figure { max-width: 90%; }
199 |                                        figcaption, figure.fullwidth figcaption { margin-right: 0%;
200 |                                                                                  max-width: none; }
201 |                                        blockquote p, blockquote footer { width: 90%; }}
202 | 
203 | .sans { font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif;
204 |         letter-spacing: .03em; }
205 | 
206 | .code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
207 |         font-size: 1.125rem;
208 |         line-height: 1.6; }
209 | 
210 | h1 .code, h2 .code, h3 .code { font-size: 0.80em; }
211 | 
212 | .marginnote .code, .sidenote .code { font-size: 1rem; }
213 | 
214 | pre.code { width: 52.5%;
215 |            padding-left: 2.5%;
216 |            overflow-x: auto; }
217 | 
218 | .fullwidth { max-width: 90%;
219 |              clear:both; }
220 | 
221 | span.newthought { font-variant: small-caps;
222 |                   font-size: 1.2em; }
223 | 
224 | input.margin-toggle { display: none; }
225 | 
226 | label.sidenote-number { display: inline; }
227 | 
228 | label.margin-toggle:not(.sidenote-number) { display: none; }
229 | 
230 | @media (max-width: 760px) { label.margin-toggle:not(.sidenote-number) { display: inline; }
231 |                             .sidenote, .marginnote { display: none; }
232 |                             .margin-toggle:checked + .sidenote,
233 |                             .margin-toggle:checked + .marginnote { display: block;
234 |                                                                    float: left;
235 |                                                                    left: 1rem;
236 |                                                                    clear: both;
237 |                                                                    width: 95%;
238 |                                                                    margin: 1rem 2.5%;
239 |                                                                    vertical-align: baseline;
240 |                                                                    position: relative; }
241 |                             label { cursor: pointer; }
242 |                             pre.code { width: 90%;
243 |                                        padding: 0; }
244 |                             .table-caption { display: block;
245 |                                              float: right;
246 |                                              clear: both;
247 |                                              width: 98%;
248 |                                              margin-top: 1rem;
249 |                                              margin-bottom: 0.5rem;
250 |                                              margin-left: 1%;
251 |                                              margin-right: 1%;
252 |                                              vertical-align: baseline;
253 |                                              position: relative; }
254 |                             div.table-wrapper, table, table.booktabs { width: 85%; }
255 |                             div.table-wrapper { border-right: 1px solid #efefef; }
256 |                             img { width: 100%; } }
257 | 


--------------------------------------------------------------------------------
/css/tufte.orginal.css:
--------------------------------------------------------------------------------
  1 | ---
  2 | # this ensures Jekyll reads the file to be transformed into CSS later
  3 | # only Main files contain this front matter, not partials.
  4 | nav_exclude: true
  5 | ---
  6 | /*****************************************************************************
  7 | /*
  8 | /* Tufte Jekyll blog theme
  9 | /* Based on Tufte CSS by Dave Liepmann ( https://github.com/edwardtufte/tufte-
 10 | /*
 11 | /* The README.md will show you how to set up your site along with other goodie
 12 | /*****************************************************************************/
 13 | 
 14 | // Imports to create final
 15 | 
 16 | @import "../_sass/fonts";
 17 | @import "../_sass/settings";
 18 | @import "../_sass/syntax-highlighting";
 19 | 
 20 | /* Tufte CSS styles */
 21 | html { font-size: 15px; }
 22 | 
 23 | body { width: 87.5%;
 24 |        margin-left: auto;
 25 |        margin-right: auto;
 26 |        padding-left: 12.5%;
 27 |        font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif;
 28 |        background-color: #fffff8;
 29 |        color: #111;
 30 |        max-width: 1400px;
 31 |        counter-reset: sidenote-counter; }
 32 | 
 33 | h1 { font-weight: 400;
 34 |      margin-top: 4rem;
 35 |      margin-bottom: 1.5rem;
 36 |      font-size: 3.2rem;
 37 |      line-height: 1; }
 38 | 
 39 | h2 { font-style: italic;
 40 |      font-weight: 400;
 41 |      margin-top: 2.1rem;
 42 |      margin-bottom: 0;
 43 |      font-size: 2.2rem;
 44 |      line-height: 1; }
 45 | 
 46 | h3 { font-style: italic;
 47 |      font-weight: 400;
 48 |      font-size: 1.7rem;
 49 |      margin-top: 2rem;
 50 |      margin-bottom: 0;
 51 |      line-height: 1; }
 52 | 
 53 | p.subtitle { font-style: italic;
 54 |              margin-top: 1rem;
 55 |              margin-bottom: 1rem;
 56 |              font-size: 1.8rem;
 57 |              display: block;
 58 |              line-height: 1; }
 59 | 
 60 | .numeral { font-family: et-book-roman-old-style; }
 61 | 
 62 | .danger { color: red; }
 63 | 
 64 | article { position: relative;
 65 |           padding: 5rem 0rem; }
 66 | 
 67 | section { padding-top: 1rem;
 68 |           padding-bottom: 1rem; }
 69 | 
 70 | p, ol, ul { font-size: 1.4rem; }
 71 | 
 72 | p { line-height: 2rem;
 73 |     margin-top: 1.4rem;
 74 |     margin-bottom: 1.4rem;
 75 |     padding-right: 0;
 76 |     vertical-align: baseline; }
 77 | 
 78 | /* Chapter Epigraphs */
 79 | div.epigraph { margin: 5em 0; }
 80 | 
 81 | div.epigraph > blockquote { margin-top: 3em;
 82 |                             margin-bottom: 3em; }
 83 | 
 84 | div.epigraph > blockquote, div.epigraph > blockquote > p { font-style: italic; }
 85 | 
 86 | div.epigraph > blockquote > footer { font-style: normal; }
 87 | 
 88 | div.epigraph > blockquote > footer > cite { font-style: italic; }
 89 | 
 90 | /* end chapter epigraphs styles */
 91 | 
 92 | blockquote { font-size: 1.4rem; }
 93 | 
 94 | blockquote p { width: 50%; }
 95 | 
 96 | blockquote footer { width: 50%;
 97 |                     font-size: 1.1rem;
 98 |                     text-align: right; }
 99 | 
100 | ol, ul { width: 45%;
101 |          -webkit-padding-start: 5%;
102 |          -webkit-padding-end: 5%; }
103 | 
104 | li { padding: 0.5rem 0; }
105 | 
106 | figure { padding: 0;
107 |          border: 0;
108 |          font-size: 100%;
109 |          font: inherit;
110 |          vertical-align: baseline;
111 |          max-width: 55%;
112 |          -webkit-margin-start: 0;
113 |          -webkit-margin-end: 0;
114 |          margin: 0 0 3em 0; }
115 | 
116 | figcaption { float: right;
117 |              clear: right;
118 |              margin-right: -48%;
119 |              margin-top: 0;
120 |              margin-bottom: 0;
121 |              font-size: 1.1rem;
122 |              line-height: 1.6;
123 |              vertical-align: baseline;
124 |              position: relative;
125 |              max-width: 40%; }
126 | 
127 | figure.fullwidth figcaption { margin-right: 24%; }
128 | 
129 | /* Links: replicate underline that clears descenders */
130 | a:link, a:visited { color: inherit; }
131 | 
132 | a:link { text-decoration: none;
133 |          background: -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#333, #333);
134 |          background: linear-gradient(#fffff8, #fffff8), linear-gradient(#fffff8, #fffff8), linear-gradient(#333, #333);
135 |          -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
136 |          -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
137 |          background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
138 |          background-repeat: no-repeat, no-repeat, repeat-x;
139 |          text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, -0.15em 0 #fffff8;
140 |          background-position: 0% 93%, 100% 93%, 0% 93%; }
141 | 
142 | @media screen and (-webkit-min-device-pixel-ratio: 0) { a:link { background-position-y: 87%, 87%, 87%; } }
143 | 
144 | a:link::selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe;
145 |                     background: #b4d5fe; }
146 | 
147 | a:link::-moz-selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe;
148 |                          background: #b4d5fe; }
149 | 
150 | /* Sidenotes, margin notes, figures, captions */
151 | img { max-width: 100%; }
152 | 
153 | .sidenote, .marginnote { float: right;
154 |                          clear: right;
155 |                          margin-right: -60%;
156 |                          width: 50%;
157 |                          margin-top: 0;
158 |                          margin-bottom: 0;
159 |                          font-size: 1.1rem;
160 |                          line-height: 1.3;
161 |                          vertical-align: baseline;
162 |                          position: relative; }
163 | 
164 | .table-caption { float:right;
165 |                  clear:right;
166 |                  margin-right: -60%;
167 |                  width: 50%;
168 |                  margin-top: 0;
169 |                  margin-bottom: 0;
170 |                  font-size: 1.0rem;
171 |                  line-height: 1.6; }
172 | 
173 | .sidenote-number { counter-increment: sidenote-counter; }
174 | 
175 | .sidenote-number:after, .sidenote:before { content: counter(sidenote-counter) " ";
176 |                                            font-family: et-book-roman-old-style;
177 |                                            position: relative;
178 |                                            vertical-align: baseline; }
179 | 
180 | .sidenote-number:after { content: counter(sidenote-counter);
181 |                          font-size: 1rem;
182 |                          top: -0.5rem;
183 |                          left: 0.1rem; }
184 | 
185 | .sidenote:before { content: counter(sidenote-counter) " ";
186 |                    top: -0.5rem; }
187 | 
188 | p, footer, table, div.table-wrapper-small, div.supertable-wrapper > p, div.booktabs-wrapper { width: 55%; }
189 | 
190 | div.fullwidth, table.fullwidth { width: 100%; }
191 | 
192 | div.table-wrapper { overflow-x: auto;
193 |                     font-family: "Trebuchet MS", "Gill Sans", "Gill Sans MT", sans-serif; }
194 | 
195 | @media screen and (max-width: 760px) { p, footer { width: 90%; }
196 |                                        pre.code { width: 87.5%; }
197 |                                        ul { width: 85%; }
198 |                                        figure { max-width: 90%; }
199 |                                        figcaption, figure.fullwidth figcaption { margin-right: 0%;
200 |                                                                                  max-width: none; }
201 |                                        blockquote p, blockquote footer { width: 90%; }}
202 | 
203 | .sans { font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif;
204 |         letter-spacing: .03em; }
205 | 
206 | .code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
207 |         font-size: 1.125rem;
208 |         line-height: 1.6; }
209 | 
210 | h1 .code, h2 .code, h3 .code { font-size: 0.80em; }
211 | 
212 | .marginnote .code, .sidenote .code { font-size: 1rem; }
213 | 
214 | pre.code { width: 52.5%;
215 |            padding-left: 2.5%;
216 |            overflow-x: auto; }
217 | 
218 | .fullwidth { max-width: 90%;
219 |              clear:both; }
220 | 
221 | span.newthought { font-variant: small-caps;
222 |                   font-size: 1.2em; }
223 | 
224 | input.margin-toggle { display: none; }
225 | 
226 | label.sidenote-number { display: inline; }
227 | 
228 | label.margin-toggle:not(.sidenote-number) { display: none; }
229 | 
230 | @media (max-width: 760px) { label.margin-toggle:not(.sidenote-number) { display: inline; }
231 |                             .sidenote, .marginnote { display: none; }
232 |                             .margin-toggle:checked + .sidenote,
233 |                             .margin-toggle:checked + .marginnote { display: block;
234 |                                                                    float: left;
235 |                                                                    left: 1rem;
236 |                                                                    clear: both;
237 |                                                                    width: 95%;
238 |                                                                    margin: 1rem 2.5%;
239 |                                                                    vertical-align: baseline;
240 |                                                                    position: relative; }
241 |                             label { cursor: pointer; }
242 |                             pre.code { width: 90%;
243 |                                        padding: 0; }
244 |                             .table-caption { display: block;
245 |                                              float: right;
246 |                                              clear: both;
247 |                                              width: 98%;
248 |                                              margin-top: 1rem;
249 |                                              margin-bottom: 0.5rem;
250 |                                              margin-left: 1%;
251 |                                              margin-right: 1%;
252 |                                              vertical-align: baseline;
253 |                                              position: relative; }
254 |                             div.table-wrapper, table, table.booktabs { width: 85%; }
255 |                             div.table-wrapper { border-right: 1px solid #efefef; }
256 |                             img { width: 100%; } }
257 | 


--------------------------------------------------------------------------------
/docs/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2018 Aditya Grover, Stefano Ermon
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | TEMPDIR := $(shell mktemp -d -t tmp.XXX)
 2 | 
 3 | publish:
 4 | 	echo 'hmmm'
 5 | 	cp -r ./_site/* $(TEMPDIR)
 6 | 	cd $(TEMPDIR) && \
 7 | 	ls -a  && \
 8 | 	git init && \
 9 | 	git add . && \
10 | 	git commit -m 'publish site' && \
11 | 	git remote add origin https://github.com/deepgenerativemodels/notes.git && \
12 | 	git push origin master:refs/heads/gh-pages --force
13 | 


--------------------------------------------------------------------------------
/docs/autoregressive/autoregressive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/autoregressive/autoregressive.png


--------------------------------------------------------------------------------
/docs/autoregressive/fvsbn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/autoregressive/fvsbn.png


--------------------------------------------------------------------------------
/docs/autoregressive/index.tex:
--------------------------------------------------------------------------------
  1 | \section{Autoregressive Models}
  2 | 
  3 | We begin our study with the autoregressive generative models. As before, we assume we are given access to a dataset $\mathcal{D}$  of $n$-dimensional datapoints $\mathbf{x}$. For simplicity, we assume the datapoints are binary, i.e.,  $\mathbf{x} \in \{0,1\}^n$.
  4 | 
  5 | \section{Representation}
  6 | 
  7 | By the chain rule of probability, we can factorize the joint distribution over the $n$-dimensions as:
  8 | \[
  9 | \begin{equation}
 10 | p(\mathbf{x}) = \prod\limits_{i=1}^{n}p(x_i \vert x_1, x_2, \ldots, x_{i-1}) = \prod\limits_{i=1}^{n} p(x_i \vert \mathbf{x}_{<i})
 11 | \end{equation}
 12 | \label{eq:chain_rule}
 13 | \]
 14 | where $\mathbf{x}_{<i}=[x_1, x_2, \ldots, x_{i-1}]$ denotes the vector of random variables with index less than $i$. If we allow for every conditional $p(x_i \vert \mathbf{x}_{<i})$ to be specified in a tabular form, then such a representation is fully general and can represent any possible distribution over $n$ random variables. However, the space complexity for such a representation grows exponentially with $n$. 
 15 | 
 16 | To see why, let us consider the conditional for the last dimension, given by $p(x_n \vert \mathbf{x}_{<n})$. In order to fully specify this conditional, we need to specify a probability for $2^{n-1}$ configurations of the variables $x_1, x_2, \ldots, x_{n-1}$.  Since the probabilities should sum to 1, the total number of parameters for specifying this conditional is given by $2^{n-1} -1$. Hence, a tabular representation for the conditionals is impractical for learning the joint distribution in (\ref{eq:chain_rule}) . 
 17 | 
 18 | In an \textit{autoregressive generative model}, the conditionals are specified as parameterized functions with a fixed number of parameters. That is, we assume the conditional distributions $p(x_i \vert \mathbf{x}_{<i})$ to correspond to a Bernoulli random variable and learn a function that maps the preceeding random variables $x_1, x_2, \ldots, x_{i-1}$ to the mean of this distribution. Hence, we have:
 19 | \[
 20 | p_{\theta_i}(x_i \vert \mathbf{x}_{<i}) = \mathrm{Bern}(f_i(x_1, x_2, \ldots, x_{i-1}))
 21 | \]
 22 | where $\theta_i$ denotes the set of parameters used to specify the mean function $f_i: \{0,1\}^{i-1}\rightarrow [0,1]$.  The term \textit{autoregressive} originates from the literature on time-series models where observations from the previous time-steps are used to predict the value at the current time step. Here, we are predicting the distribution for the $i$-th random variable using the values of the preceeding random variables in the sequence $x_1, x_2, \ldots, x_n$.
 23 | 
 24 | The number of parameters of an autoregressive generative model are given by $\sum_{i=1}^n \vert \theta_i \vert$. As we shall see in the examples below, the number of parameters are much fewer than the tabular setting considered previously. Unlike the tabular setting however, an autoregressive generative model cannot represent all possible distributions. Its expressiveness is limited by the fact that we are limiting the conditional distributions to correspond to a Bernoulli random variable with a restricted class of parameterized functions specifying the mean.
 25 | 
 26 | In the simplest case, we can specify the function as a linear combination of the input elements followed by a sigmoid non-linearity (to restrict the output to lie between 0 and 1). This gives us the formulation of a \textit{fully-visible sigmoid belief network} (FVSBN):
 27 | \[
 28 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\alpha^{(i)}_0 + \alpha^{(i)}_1 x_1 + \ldots + \alpha^{(i)}_{i-1} x_{i-1})  
 29 | \]
 30 | where $\sigma$ denotes the sigmoid function and $\theta_i=\{\alpha^{(i)}_0,\alpha^{(i)}_1, \ldots, \alpha^{(i)}_{i-1}\}$ denote the parameters of the mean function. The conditional for variable $i$ requires $i$
 31 |  parameters, and hence the total number of parameters in the model is given by $\sum_{i=1}^ni= O(n^2)$.  Note that the number of parameters are much fewer than the exponential parameters required in the tabular case.
 32 | 
 33 | A natural way to increase the expressiveness of an autoregressive generative model is to use more flexible parameterizations for the mean function e.g., multi-layer perceptrons (MLP). In the case of 1-hidden layer neural networks, the mean function for variable $i$ can be expressed as:
 34 | \[
 35 | \mathbf{h}_i = \sigma(A_i \mathbf{x_{<i}} + \mathbf{c}_i)\\
 36 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 37 | \]
 38 | where $\mathbf{h}_i \in \mathbb{R}^d$ denotes the hidden layer activations for the MLP and$\theta_i = \{A_i \in \mathbb{R}^{d\times (i-1)},  \mathbf{c}_i \in \mathbb{R}^d, \boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d, b_i \in \mathbb{R}\}$ are the set of parameters for the mean function $\mu_i(\cdot)$.  The total number of parameters in this model is dominated by the matrices $A_i$ and given by $O(n^2 d)$. 
 39 | 
 40 | The Neural Autoregressive Density Estimation (NADE) provides an efficient MLP parameterization that shares parameters used for evaluating the hidden layer activations.
 41 | \[
 42 | \mathbf{h}_i = \sigma(W_{., <i} \mathbf{x_{<i}} + \mathbf{c})\\
 43 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 44 | \]
 45 | where $\theta=\{W\in \mathbb{R}^{d\times n}, \mathbf{c} \in \mathbb{R}^d, \{\boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d\}^n_{i=1}, \{b_i \in \mathbb{R}\}^n_{i=1}\}$is the full set of parameters for the mean functions $f_1(\cdot), f_2(\cdot), \ldots, f_n(\cdot)$. The weight matrix $W$ and the bias vector $\mathbf{c}$ are shared across the conditionals. Sharing parameters offers two benefits:
 46 | \begin{enumerate}
 47 | \item The total number of parameters from $O(n^2 d)$ to $O(nd)$ [readers are encouraged to check!].
 48 | \item The hidden unit activations can be evaluated in $O(nd)$ time via the following recursive strategy:
 49 | \[
 50 | \mathbf{h}_i = \sigma(\mathbf{a}_i)\\
 51 | \mathbf{a}_{i+1} = \mathbf{a}_{i} + W[., i]x_i
 52 | \]
 53 | with the base case given by $\mathbf{a}_1=\mathbf{c}$.
 54 | \end{enumerate}
 55 | 
 56 | 
 57 | \section{Learning and inference}
 58 | 
 59 | Recall that learning a generative model involves optimizing the closeness between the data and model distributions. One commonly used notion of closeness in the KL divergence between the data and the model distributions.
 60 | 
 61 | $$
 62 | \begin{align*}
 63 | \min_{\theta\in \mathcal{M}}d_{KL}(p_{\mathrm{data}}, p_{\theta}) &= \mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\mathrm{data}}(\mathbf{x}) - \log p_{\theta}(\mathbf{x})\right].
 64 | \end{align*}
 65 | $$
 66 | Before moving any further, we make two comments about the KL divergence. First, we note that the KL divergence between any two distributions is asymmetric. As we navigate through this chapter, the reader is encouraged to think what could go wrong if we decided to optimize the reverse KL divergence instead. Secondly, the KL divergences heavily penalizes model distribution $p_\theta$ which place little mass on any datapoint that has a non-zero probability under $p_{\mathrm{data}}$. In the extreme case, if the density $p_\theta(\mathbf{x})$ evaluates to zero for a datapoint sampled from $p_{\mathrm{data}}$, the objective evaluates to $+\infty$. 
 67 | 
 68 | Since $p_{\mathrm{data}}$ does not depend on $\theta$, we can equivalently recover the optimal parameters via maximizing likelihood estimation:
 69 | 
 70 | $$
 71 | \begin{align*}
 72 | \max_{\theta\in \mathcal{M}}\mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\theta}(\mathbf{x})\right].
 73 | \end{align*}
 74 | $$
 75 | Here, $\log p_{\theta}(\mathbf{x})$ is referred to as the log-likelihood of the datapoint $\mathbf{x}$ with respect to the model distribution $p_\theta$. 
 76 | 
 77 | To approximate the expectation over the unknown $p_{\mathrm{data}}$, we make an assumption: points in the dataset $\mathcal{D}$ are sampled i.i.d. from $p_{\mathrm{data}}$. This allows us to obtain an unbiased Monte Carlo estimate of the objective:
 78 | 
 79 | $$
 80 | \begin{align}
 81 | \max_{\theta\in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\log p_{\theta}(\mathbf{x}) = \mathcal{L}(\theta \vert \mathcal{D}).
 82 | \end{align}
 83 | \label{eq:mle}
 84 | \tag{2}
 85 | $$
 86 | 
 87 | The maximum likelihood estimation (MLE) objective has an intuitive interpretation: pick the model parameters $\theta \in \mathcal{M}$ that maximize the log-probability of the observed datapoints in $\mathcal{D}$. 
 88 | 
 89 | In practice, we optimize the MLE objective using mini-batch gradient ascent. The algorithm operates in iterations. At every iteration $t$, we sample a mini-batch $\mathcal{B}_t$  of datapoints sampled randomly from the dataset ($\vert \mathcal{B}_t\vert < \vert \mathcal{D} \vert$) and compute gradients of the objective evaluated for the mini-batch. These parameters at iteration $t+1$ are then given via the following update rule:
 90 | \[
 91 | \theta^{(t+1)} = \theta^{(t)} + r_t \nabla_\theta\mathcal{L}(\theta^{(t)} \vert \mathcal{B}_t)
 92 | \]
 93 | where $\theta^{(t+1)}$ and $\theta^{(t)}$ are the parameters at iterations $t+1$ and $t$ respectively, and $r_t$ is the learning rate at iteration $t$.  Typically, we only specify the initial learning rate $r_1$ and update the rate based on a schedule.  [Variants](http://cs231n.github.io/optimization-1/) of stochastic gradient ascent, such as RMS prop and Adam, employ modified update rules that work slightly better in practice. 
 94 | 
 95 | From a practical standpoint, we must think about how to choose hyperaparameters (such as the initial learning rate) and a stopping criteria for the gradient descent. For both these questions, we follow the standard practice in machine learning of monitoring the objective on a validation dataset. Consequently, we choose the hyperparameters with the best performance on the validation dataset and stop updating the parameters when the validation log-likelihoods stop improving.
 96 | 
 97 | Now that we have a well-defined objective and optimization procedure, the only remaining task is to evaluate the objective in the context of an autoregressive generative model. To this end, we substitute the factorization of the joint distribution in Eq.$~\ref{eq:chain_rule}$
 98 | in the MLE objective in Eq.$~\ref{eq:mle}$ to get:
 99 | \[
100 | \max_{\theta \in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\sum_{i=1}^n\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})
101 | \]where $\theta = \{\theta_1, \theta_2, \ldots, \theta_n\}$ now denotes the collective set of parameters for the conditionals.
102 | 
103 | Inference in an autoregressive model is straightforward. For density estimation of an arbitrary point $\mathbf{x}$, we simply evaluate the log-conditionals $\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})$ for each $i$ and add these up to obtain the log-likelihood assigned by the model to $\mathbf{x}$. Since we know conditioning vector $\mathbf{x}$, each of the conditionals can be evaluated in parallel. Hence, density estimation is efficient on modern hardware.
104 | 
105 | Sampling from an autoregressive model is a sequential procedure. Here, we first sample $x_1$, then we sample $x_2$ conditioned on the sampled $x_1$, followed by $x_3$ conditioned on both $x_1$ and $x_2$ and so on until we sample $x_n$ conditioned on the previously sampled $\mathbf{x}_{<n}$. For applications requiring real-time generation of high-dimensional data such as audio synthesis, the sequential sampling can be an expensive process.
106 | 
107 | 
108 | TODO: add NADE samples figure
109 | 
110 | Finally, an autoregressive model does not directly learn unsupervised representations of the data. In the next few set of lectures, we will look at latent variable models (e.g., variational autoencoders) which explicitly learn latent representations of the data.
111 | 
112 | 
113 | TODO: Autoregressive generative models based on Autoencoders, RNNs, and CNNs.
114 | MADE, Char-RNN, Pixel-CNN, Wavenet


--------------------------------------------------------------------------------
/docs/autoregressive/nade.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/autoregressive/nade.png


--------------------------------------------------------------------------------
/docs/css/tufte.orginal.css:
--------------------------------------------------------------------------------
  1 | /*****************************************************************************
  2 | /*
  3 | /* Tufte Jekyll blog theme
  4 | /* Based on Tufte CSS by Dave Liepmann ( https://github.com/edwardtufte/tufte-
  5 | /*
  6 | /* The README.md will show you how to set up your site along with other goodie
  7 | /*****************************************************************************/
  8 | 
  9 | // Imports to create final
 10 | 
 11 | @import "../_sass/fonts";
 12 | @import "../_sass/settings";
 13 | @import "../_sass/syntax-highlighting";
 14 | 
 15 | /* Tufte CSS styles */
 16 | html { font-size: 15px; }
 17 | 
 18 | body { width: 87.5%;
 19 |        margin-left: auto;
 20 |        margin-right: auto;
 21 |        padding-left: 12.5%;
 22 |        font-family: et-book, Palatino, "Palatino Linotype", "Palatino LT STD", "Book Antiqua", Georgia, serif;
 23 |        background-color: #fffff8;
 24 |        color: #111;
 25 |        max-width: 1400px;
 26 |        counter-reset: sidenote-counter; }
 27 | 
 28 | h1 { font-weight: 400;
 29 |      margin-top: 4rem;
 30 |      margin-bottom: 1.5rem;
 31 |      font-size: 3.2rem;
 32 |      line-height: 1; }
 33 | 
 34 | h2 { font-style: italic;
 35 |      font-weight: 400;
 36 |      margin-top: 2.1rem;
 37 |      margin-bottom: 0;
 38 |      font-size: 2.2rem;
 39 |      line-height: 1; }
 40 | 
 41 | h3 { font-style: italic;
 42 |      font-weight: 400;
 43 |      font-size: 1.7rem;
 44 |      margin-top: 2rem;
 45 |      margin-bottom: 0;
 46 |      line-height: 1; }
 47 | 
 48 | p.subtitle { font-style: italic;
 49 |              margin-top: 1rem;
 50 |              margin-bottom: 1rem;
 51 |              font-size: 1.8rem;
 52 |              display: block;
 53 |              line-height: 1; }
 54 | 
 55 | .numeral { font-family: et-book-roman-old-style; }
 56 | 
 57 | .danger { color: red; }
 58 | 
 59 | article { position: relative;
 60 |           padding: 5rem 0rem; }
 61 | 
 62 | section { padding-top: 1rem;
 63 |           padding-bottom: 1rem; }
 64 | 
 65 | p, ol, ul { font-size: 1.4rem; }
 66 | 
 67 | p { line-height: 2rem;
 68 |     margin-top: 1.4rem;
 69 |     margin-bottom: 1.4rem;
 70 |     padding-right: 0;
 71 |     vertical-align: baseline; }
 72 | 
 73 | /* Chapter Epigraphs */
 74 | div.epigraph { margin: 5em 0; }
 75 | 
 76 | div.epigraph > blockquote { margin-top: 3em;
 77 |                             margin-bottom: 3em; }
 78 | 
 79 | div.epigraph > blockquote, div.epigraph > blockquote > p { font-style: italic; }
 80 | 
 81 | div.epigraph > blockquote > footer { font-style: normal; }
 82 | 
 83 | div.epigraph > blockquote > footer > cite { font-style: italic; }
 84 | 
 85 | /* end chapter epigraphs styles */
 86 | 
 87 | blockquote { font-size: 1.4rem; }
 88 | 
 89 | blockquote p { width: 50%; }
 90 | 
 91 | blockquote footer { width: 50%;
 92 |                     font-size: 1.1rem;
 93 |                     text-align: right; }
 94 | 
 95 | ol, ul { width: 45%;
 96 |          -webkit-padding-start: 5%;
 97 |          -webkit-padding-end: 5%; }
 98 | 
 99 | li { padding: 0.5rem 0; }
100 | 
101 | figure { padding: 0;
102 |          border: 0;
103 |          font-size: 100%;
104 |          font: inherit;
105 |          vertical-align: baseline;
106 |          max-width: 55%;
107 |          -webkit-margin-start: 0;
108 |          -webkit-margin-end: 0;
109 |          margin: 0 0 3em 0; }
110 | 
111 | figcaption { float: right;
112 |              clear: right;
113 |              margin-right: -48%;
114 |              margin-top: 0;
115 |              margin-bottom: 0;
116 |              font-size: 1.1rem;
117 |              line-height: 1.6;
118 |              vertical-align: baseline;
119 |              position: relative;
120 |              max-width: 40%; }
121 | 
122 | figure.fullwidth figcaption { margin-right: 24%; }
123 | 
124 | /* Links: replicate underline that clears descenders */
125 | a:link, a:visited { color: inherit; }
126 | 
127 | a:link { text-decoration: none;
128 |          background: -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#333, #333);
129 |          background: linear-gradient(#fffff8, #fffff8), linear-gradient(#fffff8, #fffff8), linear-gradient(#333, #333);
130 |          -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
131 |          -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
132 |          background-size: 0.05em 1px, 0.05em 1px, 1px 1px;
133 |          background-repeat: no-repeat, no-repeat, repeat-x;
134 |          text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, -0.15em 0 #fffff8;
135 |          background-position: 0% 93%, 100% 93%, 0% 93%; }
136 | 
137 | @media screen and (-webkit-min-device-pixel-ratio: 0) { a:link { background-position-y: 87%, 87%, 87%; } }
138 | 
139 | a:link::selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe;
140 |                     background: #b4d5fe; }
141 | 
142 | a:link::-moz-selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe;
143 |                          background: #b4d5fe; }
144 | 
145 | /* Sidenotes, margin notes, figures, captions */
146 | img { max-width: 100%; }
147 | 
148 | .sidenote, .marginnote { float: right;
149 |                          clear: right;
150 |                          margin-right: -60%;
151 |                          width: 50%;
152 |                          margin-top: 0;
153 |                          margin-bottom: 0;
154 |                          font-size: 1.1rem;
155 |                          line-height: 1.3;
156 |                          vertical-align: baseline;
157 |                          position: relative; }
158 | 
159 | .table-caption { float:right;
160 |                  clear:right;
161 |                  margin-right: -60%;
162 |                  width: 50%;
163 |                  margin-top: 0;
164 |                  margin-bottom: 0;
165 |                  font-size: 1.0rem;
166 |                  line-height: 1.6; }
167 | 
168 | .sidenote-number { counter-increment: sidenote-counter; }
169 | 
170 | .sidenote-number:after, .sidenote:before { content: counter(sidenote-counter) " ";
171 |                                            font-family: et-book-roman-old-style;
172 |                                            position: relative;
173 |                                            vertical-align: baseline; }
174 | 
175 | .sidenote-number:after { content: counter(sidenote-counter);
176 |                          font-size: 1rem;
177 |                          top: -0.5rem;
178 |                          left: 0.1rem; }
179 | 
180 | .sidenote:before { content: counter(sidenote-counter) " ";
181 |                    top: -0.5rem; }
182 | 
183 | p, footer, table, div.table-wrapper-small, div.supertable-wrapper > p, div.booktabs-wrapper { width: 55%; }
184 | 
185 | div.fullwidth, table.fullwidth { width: 100%; }
186 | 
187 | div.table-wrapper { overflow-x: auto;
188 |                     font-family: "Trebuchet MS", "Gill Sans", "Gill Sans MT", sans-serif; }
189 | 
190 | @media screen and (max-width: 760px) { p, footer { width: 90%; }
191 |                                        pre.code { width: 87.5%; }
192 |                                        ul { width: 85%; }
193 |                                        figure { max-width: 90%; }
194 |                                        figcaption, figure.fullwidth figcaption { margin-right: 0%;
195 |                                                                                  max-width: none; }
196 |                                        blockquote p, blockquote footer { width: 90%; }}
197 | 
198 | .sans { font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif;
199 |         letter-spacing: .03em; }
200 | 
201 | .code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
202 |         font-size: 1.125rem;
203 |         line-height: 1.6; }
204 | 
205 | h1 .code, h2 .code, h3 .code { font-size: 0.80em; }
206 | 
207 | .marginnote .code, .sidenote .code { font-size: 1rem; }
208 | 
209 | pre.code { width: 52.5%;
210 |            padding-left: 2.5%;
211 |            overflow-x: auto; }
212 | 
213 | .fullwidth { max-width: 90%;
214 |              clear:both; }
215 | 
216 | span.newthought { font-variant: small-caps;
217 |                   font-size: 1.2em; }
218 | 
219 | input.margin-toggle { display: none; }
220 | 
221 | label.sidenote-number { display: inline; }
222 | 
223 | label.margin-toggle:not(.sidenote-number) { display: none; }
224 | 
225 | @media (max-width: 760px) { label.margin-toggle:not(.sidenote-number) { display: inline; }
226 |                             .sidenote, .marginnote { display: none; }
227 |                             .margin-toggle:checked + .sidenote,
228 |                             .margin-toggle:checked + .marginnote { display: block;
229 |                                                                    float: left;
230 |                                                                    left: 1rem;
231 |                                                                    clear: both;
232 |                                                                    width: 95%;
233 |                                                                    margin: 1rem 2.5%;
234 |                                                                    vertical-align: baseline;
235 |                                                                    position: relative; }
236 |                             label { cursor: pointer; }
237 |                             pre.code { width: 90%;
238 |                                        padding: 0; }
239 |                             .table-caption { display: block;
240 |                                              float: right;
241 |                                              clear: both;
242 |                                              width: 98%;
243 |                                              margin-top: 1rem;
244 |                                              margin-bottom: 0.5rem;
245 |                                              margin-left: 1%;
246 |                                              margin-right: 1%;
247 |                                              vertical-align: baseline;
248 |                                              position: relative; }
249 |                             div.table-wrapper, table, table.booktabs { width: 85%; }
250 |                             div.table-wrapper { border-right: 1px solid #efefef; }
251 |                             img { width: 100%; } }
252 | 


--------------------------------------------------------------------------------
/docs/flow/flow-graphical.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/flow/flow-graphical.PNG


--------------------------------------------------------------------------------
/docs/flow/iaf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/flow/iaf.PNG


--------------------------------------------------------------------------------
/docs/flow/maf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/flow/maf.PNG


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.eot


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.ttf


--------------------------------------------------------------------------------
/docs/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.woff


--------------------------------------------------------------------------------
/docs/fonts/icomoon.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/icomoon.eot


--------------------------------------------------------------------------------
/docs/fonts/icomoon.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" standalone="no"?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" >
 3 | <svg xmlns="http://www.w3.org/2000/svg">
 4 | <metadata>Generated by IcoMoon</metadata>
 5 | <defs>
 6 | <font id="icomoon" horiz-adv-x="512">
 7 | <font-face units-per-em="512" ascent="480" descent="-32" />
 8 | <missing-glyph horiz-adv-x="512" />
 9 | <glyph unicode="&#x20;" d="" horiz-adv-x="256" />
10 | <glyph unicode="&#xe600;" d="M192 160l64 32 224 224-32 32-224-224-32-64zM144.65 46.451c-15.816 33.364-32.833 50.381-66.198 66.197l49.548 136.396 64 38.956 192 192h-96l-192-192-96-320 320 96 192 192v96l-192-192-38.956-64z" />
11 | <glyph unicode="&#xe601;" d="M448 416h-48v-16c0-17.645-14.355-32-32-32s-32 14.355-32 32v16h-160v-16c0-17.645-14.355-32-32-32s-32 14.355-32 32v16h-48c-17.6 0-32-14.4-32-32v-352c0-17.6 14.4-32 32-32h384c17.6 0 32 14.4 32 32v352c0 17.6-14.4 32-32 32zM448 32.058c-0.017-0.020-0.038-0.041-0.058-0.058h-383.885c-0.020 0.017-0.041 0.038-0.057 0.058v287.942h384v-287.942zM144 384c8.836 0 16 7.164 16 16v64c0 8.836-7.164 16-16 16s-16-7.164-16-16v-64c0-8.836 7.164-16 16-16zM368 384c8.836 0 16 7.164 16 16v64c0 8.836-7.164 16-16 16s-16-7.164-16-16v-64c0-8.836 7.164-16 16-16zM288 288h-160v-32h128v-64h-128v-32h128v-64h-128v-32h160zM352 64h32v224h-64v-32h32zM436-12h-360c-17.6 0-32 10.4-32 28v-16c0-17.6 14.4-32 32-32h360c17.6 0 32 14.4 32 32v16c0-17.6-14.4-28-32-28z" />
12 | <glyph unicode="&#xe602;" d="M476.698 442.679l-2.014 2.021c-47.074 47.067-124.097 47.067-171.163 0l-109.053-109.068c-47.067-47.066-47.067-124.088 0-171.155l2.013-2.013c3.916-3.924 8.073-7.462 12.368-10.729l39.924 39.925c-4.651 2.747-9.063 6.036-13.058 10.030l-2.021 2.021c-25.557 25.549-25.557 67.136 0 92.695l109.064 109.056c25.558 25.559 67.137 25.559 92.693 0l2.021-2.012c25.55-25.558 25.55-67.146 0-92.695l-49.343-49.343c8.566-21.154 12.624-43.7 12.269-66.193l76.302 76.302c47.067 47.068 47.067 124.089-0.002 171.158zM315.521 285.533c-3.916 3.916-8.073 7.461-12.368 10.72l-39.924-39.916c4.652-2.748 9.063-6.037 13.058-10.031l2.021-2.020c25.558-25.558 25.558-67.136 0-92.694l-109.065-109.067c-25.559-25.551-67.138-25.551-92.694 0l-2.021 2.021c-25.549 25.56-25.549 67.138 0 92.694l49.344 49.343c-8.567 21.153-12.623 43.701-12.269 66.193l-76.301-76.299c-47.068-47.066-47.068-124.089 0-171.162l2.013-2.016c47.076-47.064 124.096-47.064 171.164 0l109.055 109.059c47.067 47.066 47.067 124.097 0 171.163l-2.013 2.012z" />
13 | <glyph unicode="&#xe603;" d="M256 480c-141.385 0-256-114.615-256-256s114.615-256 256-256 256 114.615 256 256-114.615 256-256 256zM224 384h64v-64h-64v64zM320 64h-128v32h32v128h-32v32h96v-160h32v-32z" />
14 | <glyph unicode="&#xe604;" d="M256 480c-141.385 0-256-114.615-256-256s114.615-256 256-256 256 114.615 256 256-114.615 256-256 256zM384 306.745l-82.744-82.745 82.744-82.744v-45.256h-45.256l-82.744 82.744-82.745-82.744h-45.255v45.256l82.745 82.744-82.745 82.745v45.255h45.255l82.745-82.745 82.744 82.745h45.256v-45.255z" />
15 | <glyph unicode="&#xe605;" d="M256 480c-141.385 0-256-114.615-256-256s114.615-256 256-256 256 114.615 256 256-114.615 256-256 256zM208 64l-106 138 47 49 59-75 185 151 23-23-208-240z" />
16 | <glyph unicode="&#xe606;" d="M512 112l-144 368h-224l-144-144v-224l144-144h224l144 144v224l-144 144zM288 64h-64v64h64v-64zM288 192h-64v192h64v-192z" />
17 | <glyph unicode="&#xe607;" d="M426.671 480h-341.328c-46.937 0-85.343-38.405-85.343-85.345v-341.311c0-46.969 38.406-85.344 85.343-85.344h341.328c46.938 0 85.329 38.375 85.329 85.345v341.31c0 46.94-38.391 85.345-85.329 85.345zM91.314 36.687l-38.628 38.627 128 128 14.628-14.627-104-152zM76.686 380.686l14.628 14.628 164.686-132.687 164.687 132.687 14.627-14.628-179.314-211.313-179.314 211.313zM420.687 36.687l-104 152 14.627 14.627 128-128-38.627-38.627z" />
18 | <glyph unicode="&#xe608;" d="M0.403 45.168c-0.122 1.266-0.226 2.535-0.292 3.815 0.065-1.28 0.17-2.549 0.292-3.815zM117.954 197.426c46.005-1.369 76.867 46.349 68.931 106.599-7.947 60.24-51.698 108.584-97.704 109.961-46.013 1.365-76.87-44.741-68.926-105 7.941-60.234 51.676-110.187 97.699-111.56zM512 352v42.655c0 46.94-38.391 85.345-85.329 85.345h-341.328c-46.138 0-84.006-37.116-85.282-82.963 29.181 25.693 69.662 47.158 111.437 47.158 44.652 0 178.622 0 178.622 0l-39.974-33.809h-56.634c37.565-14.402 57.578-58.062 57.578-102.861 0-37.624-20.905-69.977-50.444-92.984-28.822-22.451-34.286-31.854-34.286-50.939 0-16.289 30.873-44 47.016-55.394 47.191-33.269 62.458-64.156 62.458-115.728 0-8.214-1.021-16.415-3.033-24.48h153.871c46.937 0 85.328 38.375 85.328 85.345v266.654h-96v-95.999h-32v96h-95.999v32h95.999v96h32v-96h96zM92.943 97.032c10.807 0 20.711 0.295 30.968 0.295-13.573 13.167-24.313 29.3-24.313 49.19 0 11.804 3.782 23.168 9.067 33.26-5.391-0.385-10.895-0.497-16.563-0.497-37.178 0-68.753 12.038-92.102 31.927v-33.621l0.003-100.865c26.72 12.687 58.444 20.311 92.94 20.311zM1.71 36.371c-0.556 2.729-0.983 5.503-1.271 8.317 0.287-2.814 0.715-5.588 1.271-8.317zM227.725 3.577c-7.529 29.403-34.227 43.982-71.444 69.784-13.536 4.366-28.447 6.937-44.447 7.104-44.809 0.482-86.554-17.471-110.108-44.186 7.96-38.853 42.517-68.279 83.617-68.279h143.222c0.908 5.564 1.348 11.316 1.348 17.216 0 6.267-0.767 12.396-2.188 18.361z" />
19 | <glyph unicode="&#xe609;" d="M426.672 480h-341.33c-46.936 0-85.342-38.407-85.342-85.344v-341.313c0-46.969 38.406-85.343 85.342-85.343l341.33 0.001c46.938 0 85.328 38.373 85.328 85.344v341.311c0 46.937-38.391 85.344-85.328 85.344zM435.296 224h-83.296v-224h-96v224h-46.263v73.282h46.263v47.593c0 64.671 27.896 103.125 103.935 103.125h87.622v-79.285h-71.565c-21.241 0.035-23.876-11.076-23.876-31.756l-0.116-39.677h96l-12.704-73.282z" />
20 | <glyph unicode="&#xe60a;" d="M426.671 480h-341.328c-46.937 0-85.343-38.405-85.343-85.345v-341.311c0-46.969 38.406-85.344 85.343-85.344h341.328c46.938 0 85.329 38.375 85.329 85.345v341.31c0 46.94-38.391 85.345-85.329 85.345zM419.026 309.083c0.164-3.671 0.245-7.364 0.245-11.074 0-113.107-84.608-243.534-239.329-243.534-47.502 0-91.717 14.174-128.943 38.459 6.58-0.794 13.276-1.197 20.065-1.197 39.411 0 75.679 13.685 104.467 36.641-36.808 0.69-67.872 25.438-78.577 59.441 5.137-1 10.406-1.537 15.826-1.537 7.672 0 15.103 1.048 22.16 3.004-38.48 7.866-67.475 42.458-67.475 83.928 0 0.361 0 0.719 0.008 1.076 11.34-6.41 24.312-10.26 38.1-10.705-22.571 15.349-37.421 41.546-37.421 71.244 0 15.685 4.147 30.389 11.389 43.029 41.487-51.785 103.468-85.86 173.377-89.431-1.435 6.266-2.179 12.798-2.179 19.507 0 47.269 37.663 85.59 84.115 85.59 24.195 0 46.059-10.393 61.401-27.029 19.16 3.838 37.162 10.96 53.416 20.771-6.281-19.988-19.617-36.761-36.983-47.355 17.013 2.069 33.226 6.67 48.31 13.477-11.273-17.162-25.535-32.238-41.972-44.305z" />
21 | <glyph unicode="&#xe60b;" d="M426.67 480h-341.327c-46.938 0-85.343-38.406-85.343-85.344v-341.314c0-46.967 38.406-85.342 85.344-85.342h341.326c46.938 0 85.33 38.374 85.33 85.342v341.314c0 46.938-38.392 85.344-85.33 85.344zM139.472 64.376c-23.985 0-43.472 19.346-43.472 43.314 0 23.842 19.486 43.406 43.472 43.406 24.079 0 43.53-19.564 43.53-43.406-0.001-23.968-19.452-43.314-43.53-43.314zM248.734 64.002c0 40.905-15.904 79.409-44.73 108.222-28.857 28.875-67.188 44.813-107.952 44.813v62.593c118.826 0 215.563-96.721 215.563-215.627l-62.881-0.001zM359.814 64.002c0 145.531-118.329 263.97-263.688 263.97v62.624c180.001 0 326.473-146.562 326.473-326.596l-62.785 0.002z" />
22 | <glyph unicode="&#xe60c;" d="M426.688 480h-341.344c-46.938 0-85.344-38.406-85.344-85.341v-341.313c0-46.971 38.406-85.346 85.344-85.346h341.341c46.937 0 85.315 38.377 85.315 85.348v341.311c0 46.935-38.378 85.341-85.312 85.341zM144 144c-44.183 0-80 35.817-80 80s35.817 80 80 80 80-35.817 80-80c0-44.183-35.817-80-80-80zM368 144c-44.184 0-80 35.817-80 80s35.816 80 80 80 80-35.817 80-80c0-44.183-35.816-80-80-80z" />
23 | <glyph unicode="&#xe60d;" d="M131.736 100.316c0-13.526 11.447-20.291 34.334-20.291 20.032 0 30.044 7.021 30.044 21.067 0 13.269-10.923 19.898-32.772 19.898-21.073 0.002-31.606-6.888-31.606-20.674zM426.674 480h-341.33c-46.938 0-85.344-38.405-85.344-85.344v-341.313c0-46.968 38.406-85.343 85.344-85.343h341.33c46.934 0 85.326 38.375 85.326 85.344v341.312c0 46.939-38.391 85.344-85.326 85.344zM237.864 275.506c-3.642-1.302-8.844-2.729-15.606-4.293 2.080-5.983 3.123-11.576 3.123-16.777 0-16.646-5.009-31.149-15.022-43.506-10.015-12.354-22.956-19.703-38.823-22.043-10.402-1.563-15.604-7.156-15.604-16.779 0-3.381 1.689-6.765 5.072-10.145 4.422-4.943 10.924-8.066 19.509-9.367 37.195-5.721 55.793-21.197 55.793-46.427 0-40.321-24.062-60.481-72.181-60.481-19.771 0-36.028 3.513-48.772 10.535-16.129 8.844-24.188 22.76-24.188 41.75 0 21.852 12.096 36.808 36.282 44.869v0.78c-8.842 5.462-13.263 13.785-13.263 24.972 0 14.566 4.162 23.672 12.484 27.311v0.781c-8.323 2.862-15.737 9.366-22.24 19.512-7.282 10.926-10.925 22.63-10.925 35.115 0 18.729 6.635 34.335 19.896 46.82 12.748 11.707 27.964 17.558 45.653 17.558 12.747 0 24.579-3.119 35.505-9.361 12.485 0 26.921 3.12 43.313 9.361l-0.006-40.185zM301.089 140.503h-44.093c0.521 5.201 0.779 14.042 0.779 26.532v121.346c0 12.227-0.261 20.682-0.779 25.359h44.093c-0.521-4.942-0.781-13.135-0.781-24.582v-119.782c0-13.267 0.261-22.892 0.781-28.873zM298.159 352.759c-5.332-5.722-11.641-8.581-18.923-8.581-7.543 0-13.979 2.859-19.313 8.581-5.333 5.726-8.001 12.486-8.001 20.289 0 8.067 2.668 14.956 8.001 20.683 5.333 5.721 11.77 8.584 19.313 8.584 7.282 0 13.592-2.863 18.923-8.584 5.333-5.728 7.999-12.615 7.999-20.683 0-7.803-2.666-14.564-7.999-20.289zM420.845 145.576c-9.627-5.205-21.203-7.803-34.729-7.803-18.985 0-32.124 6.76-39.405 20.287-5.465 10.145-8.193 26.14-8.193 47.99v69.842h0.389v0.781l-5.853 0.391c-3.382 0-7.806-0.391-13.267-1.171v37.847h19.117v15.216c0 7.285-0.389 13.14-1.17 17.56h45.259c-0.775-4.94-1.168-10.533-1.168-16.776v-15.999h33.943v-37.846c-1.299 0-3.705 0.129-7.218 0.389-3.512 0.261-6.828 0.393-9.949 0.393h-16.777v-72.572c0-17.432 5.725-26.145 17.167-26.145 8.063 0 15.348 2.209 21.854 6.632v-39.016zM160.608 279.797c-16.388 0-24.582-9.627-24.582-28.873 0-17.951 8.194-26.924 24.582-26.924 15.868 0 23.801 9.104 23.801 27.313 0 7.545-1.82 14.047-5.461 19.511-4.423 5.981-10.537 8.973-18.34 8.973z" />
24 | <glyph unicode="&#xe60e;" d="M416 448h-320l-96-96v-336c0-8.837 7.163-16 16-16h480c8.836 0 16 7.163 16 16v336l-96 96zM256 64l-160 128h96v96h128v-96h96l-160-128zM77.255 384l32 32h293.489l32-32h-357.489z" />
25 | <glyph unicode="&#xe60f;" d="M0 416v-384h512v384h-512zM96 64h-64v64h64v-64zM96 192h-64v64h64v-64zM96 320h-64v64h64v-64zM384 64h-256v320h256v-320zM480 64h-64v64h64v-64zM480 192h-64v64h64v-64zM480 320h-64v64h64v-64zM192 320v-192l128 96z" />
26 | </font></defs></svg>


--------------------------------------------------------------------------------
/docs/fonts/icomoon.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/icomoon.ttf


--------------------------------------------------------------------------------
/docs/fonts/icomoon.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/fonts/icomoon.woff


--------------------------------------------------------------------------------
/docs/gan/cyclegan_gendisc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/gan/cyclegan_gendisc.png


--------------------------------------------------------------------------------
/docs/gan/gan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/gan/gan.png


--------------------------------------------------------------------------------
/docs/gan/index.tex:
--------------------------------------------------------------------------------
  1 | \section{Generative Adversarial Networks}
  2 | 
  3 | % We begin our study with the autoregressive generative models. As before, we assume we are given access to a dataset $\mathcal{D}$  of $n$-dimensional datapoints $\mathbf{x}$. For simplicity, we assume the datapoints are binary, i.e.,  $\mathbf{x} \in \{0,1\}^n$.
  4 | 
  5 | \section{Representation}
  6 | 
  7 | By the chain rule of probability, we can factorize the joint distribution over the $n$-dimensions as:
  8 | \[
  9 | \begin{equation}
 10 | p(\mathbf{x}) = \prod\limits_{i=1}^{n}p(x_i \vert x_1, x_2, \ldots, x_{i-1}) = \prod\limits_{i=1}^{n} p(x_i \vert \mathbf{x}_{<i})
 11 | \end{equation}
 12 | \label{eq:chain_rule}
 13 | \]
 14 | where $\mathbf{x}_{<i}=[x_1, x_2, \ldots, x_{i-1}]$ denotes the vector of random variables with index less than $i$. If we allow for every conditional $p(x_i \vert \mathbf{x}_{<i})$ to be specified in a tabular form, then such a representation is fully general and can represent any possible distribution over $n$ random variables. However, the space complexity for such a representation grows exponentially with $n$. 
 15 | 
 16 | To see why, let us consider the conditional for the last dimension, given by $p(x_n \vert \mathbf{x}_{<n})$. In order to fully specify this conditional, we need to specify a probability for $2^{n-1}$ configurations of the variables $x_1, x_2, \ldots, x_{n-1}$.  Since the probabilities should sum to 1, the total number of parameters for specifying this conditional is given by $2^{n-1} -1$. Hence, a tabular representation for the conditionals is impractical for learning the joint distribution in (\ref{eq:chain_rule}) . 
 17 | 
 18 | In an \textit{autoregressive generative model}, the conditionals are specified as parameterized functions with a fixed number of parameters. That is, we assume the conditional distributions $p(x_i \vert \mathbf{x}_{<i})$ to correspond to a Bernoulli random variable and learn a function that maps the preceeding random variables $x_1, x_2, \ldots, x_{i-1}$ to the mean of this distribution. Hence, we have:
 19 | \[
 20 | p_{\theta_i}(x_i \vert \mathbf{x}_{<i}) = \mathrm{Bern}(f_i(x_1, x_2, \ldots, x_{i-1}))
 21 | \]
 22 | where $\theta_i$ denotes the set of parameters used to specify the mean function $f_i: \{0,1\}^{i-1}\rightarrow [0,1]$.  The term \textit{autoregressive} originates from the literature on time-series models where observations from the previous time-steps are used to predict the value at the current time step. Here, we are predicting the distribution for the $i$-th random variable using the values of the preceeding random variables in the sequence $x_1, x_2, \ldots, x_n$.
 23 | 
 24 | The number of parameters of an autoregressive generative model are given by $\sum_{i=1}^n \vert \theta_i \vert$. As we shall see in the examples below, the number of parameters are much fewer than the tabular setting considered previously. Unlike the tabular setting however, an autoregressive generative model cannot represent all possible distributions. Its expressiveness is limited by the fact that we are limiting the conditional distributions to correspond to a Bernoulli random variable with a restricted class of parameterized functions specifying the mean.
 25 | 
 26 | In the simplest case, we can specify the function as a linear combination of the input elements followed by a sigmoid non-linearity (to restrict the output to lie between 0 and 1). This gives us the formulation of a \textit{fully-visible sigmoid belief network} (FVSBN):
 27 | \[
 28 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\alpha^{(i)}_0 + \alpha^{(i)}_1 x_1 + \ldots + \alpha^{(i)}_{i-1} x_{i-1})  
 29 | \]
 30 | where $\sigma$ denotes the sigmoid function and $\theta_i=\{\alpha^{(i)}_0,\alpha^{(i)}_1, \ldots, \alpha^{(i)}_{i-1}\}$ denote the parameters of the mean function. The conditional for variable $i$ requires $i$
 31 |  parameters, and hence the total number of parameters in the model is given by $\sum_{i=1}^ni= O(n^2)$.  Note that the number of parameters are much fewer than the exponential parameters required in the tabular case.
 32 | 
 33 | A natural way to increase the expressiveness of an autoregressive generative model is to use more flexible parameterizations for the mean function e.g., multi-layer perceptrons (MLP). In the case of 1-hidden layer neural networks, the mean function for variable $i$ can be expressed as:
 34 | \[
 35 | \mathbf{h}_i = \sigma(A_i \mathbf{x_{<i}} + \mathbf{c}_i)\\
 36 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 37 | \]
 38 | where $\mathbf{h}_i \in \mathbb{R}^d$ denotes the hidden layer activations for the MLP and$\theta_i = \{A_i \in \mathbb{R}^{d\times (i-1)},  \mathbf{c}_i \in \mathbb{R}^d, \boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d, b_i \in \mathbb{R}\}$ are the set of parameters for the mean function $\mu_i(\cdot)$.  The total number of parameters in this model is dominated by the matrices $A_i$ and given by $O(n^2 d)$. 
 39 | 
 40 | The Neural Autoregressive Density Estimation (NADE) provides an efficient MLP parameterization that shares parameters used for evaluating the hidden layer activations.
 41 | \[
 42 | \mathbf{h}_i = \sigma(W_{., <i} \mathbf{x_{<i}} + \mathbf{c})\\
 43 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 44 | \]
 45 | where $\theta=\{W\in \mathbb{R}^{d\times n}, \mathbf{c} \in \mathbb{R}^d, \{\boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d\}^n_{i=1}, \{b_i \in \mathbb{R}\}^n_{i=1}\}$is the full set of parameters for the mean functions $f_1(\cdot), f_2(\cdot), \ldots, f_n(\cdot)$. The weight matrix $W$ and the bias vector $\mathbf{c}$ are shared across the conditionals. Sharing parameters offers two benefits:
 46 | \begin{enumerate}
 47 | \item The total number of parameters from $O(n^2 d)$ to $O(nd)$ [readers are encouraged to check!].
 48 | \item The hidden unit activations can be evaluated in $O(nd)$ time via the following recursive strategy:
 49 | \[
 50 | \mathbf{h}_i = \sigma(\mathbf{a}_i)\\
 51 | \mathbf{a}_{i+1} = \mathbf{a}_{i} + W[., i]x_i
 52 | \]
 53 | with the base case given by $\mathbf{a}_1=\mathbf{c}$.
 54 | \end{enumerate}
 55 | 
 56 | 
 57 | \section{Learning and inference}
 58 | 
 59 | Recall that learning a generative model involves optimizing the closeness between the data and model distributions. One commonly used notion of closeness in the KL divergence between the data and the model distributions.
 60 | 
 61 | $$
 62 | \begin{align*}
 63 | \min_{\theta\in \mathcal{M}}d_{KL}(p_{\mathrm{data}}, p_{\theta}) &= \mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\mathrm{data}}(\mathbf{x}) - \log p_{\theta}(\mathbf{x})\right].
 64 | \end{align*}
 65 | $$
 66 | Before moving any further, we make two comments about the KL divergence. First, we note that the KL divergence between any two distributions is asymmetric. As we navigate through this chapter, the reader is encouraged to think what could go wrong if we decided to optimize the reverse KL divergence instead. Secondly, the KL divergences heavily penalizes model distribution $p_\theta$ which place little mass on any datapoint that has a non-zero probability under $p_{\mathrm{data}}$. In the extreme case, if the density $p_\theta(\mathbf{x})$ evaluates to zero for a datapoint sampled from $p_{\mathrm{data}}$, the objective evaluates to $+\infty$. 
 67 | 
 68 | Since $p_{\mathrm{data}}$ does not depend on $\theta$, we can equivalently recover the optimal parameters via maximizing likelihood estimation:
 69 | 
 70 | $$
 71 | \begin{align*}
 72 | \max_{\theta\in \mathcal{M}}\mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\theta}(\mathbf{x})\right].
 73 | \end{align*}
 74 | $$
 75 | Here, $\log p_{\theta}(\mathbf{x})$ is referred to as the log-likelihood of the datapoint $\mathbf{x}$ with respect to the model distribution $p_\theta$. 
 76 | 
 77 | To approximate the expectation over the unknown $p_{\mathrm{data}}$, we make an assumption: points in the dataset $\mathcal{D}$ are sampled i.i.d. from $p_{\mathrm{data}}$. This allows us to obtain an unbiased Monte Carlo estimate of the objective:
 78 | 
 79 | $$
 80 | \begin{align}
 81 | \max_{\theta\in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\log p_{\theta}(\mathbf{x}) = \mathcal{L}(\theta \vert \mathcal{D}).
 82 | \end{align}
 83 | \label{eq:mle}
 84 | \tag{2}
 85 | $$
 86 | 
 87 | The maximum likelihood estimation (MLE) objective has an intuitive interpretation: pick the model parameters $\theta \in \mathcal{M}$ that maximize the log-probability of the observed datapoints in $\mathcal{D}$. 
 88 | 
 89 | In practice, we optimize the MLE objective using mini-batch gradient ascent. The algorithm operates in iterations. At every iteration $t$, we sample a mini-batch $\mathcal{B}_t$  of datapoints sampled randomly from the dataset ($\vert \mathcal{B}_t\vert < \vert \mathcal{D} \vert$) and compute gradients of the objective evaluated for the mini-batch. These parameters at iteration $t+1$ are then given via the following update rule:
 90 | \[
 91 | \theta^{(t+1)} = \theta^{(t)} + r_t \nabla_\theta\mathcal{L}(\theta^{(t)} \vert \mathcal{B}_t)
 92 | \]
 93 | where $\theta^{(t+1)}$ and $\theta^{(t)}$ are the parameters at iterations $t+1$ and $t$ respectively, and $r_t$ is the learning rate at iteration $t$.  Typically, we only specify the initial learning rate $r_1$ and update the rate based on a schedule.  [Variants](http://cs231n.github.io/optimization-1/) of stochastic gradient ascent, such as RMS prop and Adam, employ modified update rules that work slightly better in practice. 
 94 | 
 95 | From a practical standpoint, we must think about how to choose hyperaparameters (such as the initial learning rate) and a stopping criteria for the gradient descent. For both these questions, we follow the standard practice in machine learning of monitoring the objective on a validation dataset. Consequently, we choose the hyperparameters with the best performance on the validation dataset and stop updating the parameters when the validation log-likelihoods stop improving.
 96 | 
 97 | Now that we have a well-defined objective and optimization procedure, the only remaining task is to evaluate the objective in the context of an autoregressive generative model. To this end, we substitute the factorization of the joint distribution in Eq.$~\ref{eq:chain_rule}$
 98 | in the MLE objective in Eq.$~\ref{eq:mle}$ to get:
 99 | \[
100 | \max_{\theta \in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\sum_{i=1}^n\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})
101 | \]where $\theta = \{\theta_1, \theta_2, \ldots, \theta_n\}$ now denotes the collective set of parameters for the conditionals.
102 | 
103 | Inference in an autoregressive model is straightforward. For density estimation of an arbitrary point $\mathbf{x}$, we simply evaluate the log-conditionals $\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})$ for each $i$ and add these up to obtain the log-likelihood assigned by the model to $\mathbf{x}$. Since we know conditioning vector $\mathbf{x}$, each of the conditionals can be evaluated in parallel. Hence, density estimation is efficient on modern hardware.
104 | 
105 | Sampling from an autoregressive model is a sequential procedure. Here, we first sample $x_1$, then we sample $x_2$ conditioned on the sampled $x_1$, followed by $x_3$ conditioned on both $x_1$ and $x_2$ and so on until we sample $x_n$ conditioned on the previously sampled $\mathbf{x}_{<n}$. For applications requiring real-time generation of high-dimensional data such as audio synthesis, the sequential sampling can be an expensive process.
106 | 
107 | 
108 | TODO: add NADE samples figure
109 | 
110 | Finally, an autoregressive model does not directly learn unsupervised representations of the data. In the next few set of lectures, we will look at latent variable models (e.g., variational autoencoders) which explicitly learn latent representations of the data.
111 | 
112 | 
113 | TODO: Autoregressive generative models based on Autoencoders, RNNs, and CNNs.
114 | MADE, Char-RNN, Pixel-CNN, Wavenet
115 | 


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |   <meta charset="utf-8">
  5 |   <meta http-equiv="X-UA-Compatible" content="IE=edge">
  6 |   <meta name="viewport" content="width=device-width, initial-scale=1">
  7 | 
  8 |   <title>Contents</title>
  9 |   <meta name="description" content="Lecture notes for Deep Generative Models.">
 10 | 
 11 | 
 12 |   <link rel="stylesheet" href="/notes/css/tufte.css">	
 13 |   
 14 | 
 15 |   <!-- Google Fonts loaded here depending on setting in _data/options.yml true loads font, blank does not-->
 16 |   
 17 |     <link href='//fonts.googleapis.com/css?family=Lato:400,400italic' rel='stylesheet' type='text/css'>
 18 |   
 19 |   <!-- Load up MathJax script if needed ... specify in /_data/options.yml file-->
 20 |   
 21 |     <script type="text/javascript" src="//cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
 22 |   
 23 | 
 24 |   <script>
 25 |   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
 26 |   (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
 27 |   m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
 28 |   })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
 29 | 
 30 |   ga('create', 'UA-129020129-1', 'auto');
 31 |   ga('send', 'pageview');
 32 | 
 33 |   </script>
 34 | 
 35 |   <link rel="canonical" href="http://localhost:4000/notes/">
 36 |   <link rel="alternate" type="application/rss+xml" title="Deep Generative Models" href="http://localhost:4000/notes/feed.xml" />
 37 | </head>
 38 | 
 39 |   <body>
 40 |     <!--- Header and nav template site-wide -->
 41 | <header>
 42 |     <nav class="group">
 43 |         <a href="/notes/">Contents</a>
 44 | 	<a href="http://deepgenerativemodels.github.io">Class</a>
 45 | 	<a href="http://github.com/deepgenerativemodels/notes">Github</a>
 46 | 	</nav>
 47 | </header>
 48 | 
 49 |     <article class="group">
 50 |       <h1>Contents</h1>
 51 | <p class="subtitle"></p>
 52 | 
 53 | 
 54 | <script type="text/x-mathjax-config">
 55 |   MathJax.Hub.Config({
 56 |     TeX: {
 57 |       Macros: {
 58 |         e: "\\epsilon",
 59 |         xti: "x^{(i)}",
 60 |         yti: "y^{(i)}",
 61 |         bfy: "{\\bf y}",
 62 |         bfx: "{\\bf x}",
 63 |         bfg: "{\\bf g}",
 64 |         bfbeta: "{\\bf \\beta}",
 65 |         tp: "\\tilde p",
 66 |         pt: "p_\\theta",
 67 |         Exp: "{\\mathbb{E}}",
 68 |         Ind: "{\\mathbb{I}}",
 69 |         KL: "{\\mathbb{KL}}",
 70 |         Dc: "{\\mathcal{D}}",
 71 |         Tc: "{\\mathcal{T}}",
 72 |         Xc: "{\\mathcal{X}}",
 73 |         note: ["\\textcolor{blue}{[NOTE: #1]}",1]
 74 |       }
 75 |     }
 76 |   });
 77 | </script>
 78 | 
 79 | 
 80 | <p>These notes form a concise introductory course on deep generative models.
 81 | They are based on Stanford <a href="https://deepgenerativemodels.github.io/">CS236</a>, taught by <a href="http://cs.stanford.edu/~ermon/">Stefano Ermon</a> and <a href="http://aditya-grover.github.io/">Aditya Grover</a>, and have been written by <a href="http://aditya-grover.github.io/">Aditya Grover</a>, with the <a href="https://github.com/deepgenerativemodels/notes/commits/master">help</a> of many students and course staff.
 82 | <label for="mn-id-whatever" class="margin-toggle"> ⊕</label><input type="checkbox" id="mn-id-whatever" class="margin-toggle" /><span class="marginnote">The notes are still <strong>under construction</strong>!
 83 | Since these notes are brand new, you will find several typos. If you do, please let us know, or submit a pull request with your fixes to our <a href="https://github.com/deepgenerativemodels/notes">Github repository</a>. </span>
 84 | You too may help make these notes better by submitting your improvements to us via <a href="https://github.com/deepgenerativemodels/notes">Github</a>.</p>
 85 | 
 86 | <ol>
 87 |   <li>
 88 |     <p><a href="introduction/">Introduction</a></p>
 89 |   </li>
 90 |   <li>
 91 |     <p><a href="autoregressive/">Autoregressive Models</a></p>
 92 |   </li>
 93 |   <li>
 94 |     <p><a href="vae/">Variational Autoencoders</a></p>
 95 |   </li>
 96 |   <li>
 97 |     <p><a href="flow/">Normalizing Flow Models</a></p>
 98 |   </li>
 99 |   <li>
100 |     <p><a href="gan/">Generative Adversarial Networks</a></p>
101 |   </li>
102 | </ol>
103 | 
104 | 
105 | 
106 | 
107 |     </article>
108 |     <span class="print-footer">Contents - Aditya Grover</span>
109 |     <footer>
110 |   <hr class="slender">
111 |   <!-- <ul class="footer&#45;links"> -->
112 |   <!--   <li><a href="mailto:hate@spam.net"><span class="icon&#45;mail"></span></a></li>     -->
113 |   <!--    -->
114 |   <!--     <li> -->
115 |   <!--       <a href="//www.twitter.com/twitter_handle"><span class="icon-twitter"></span></a> -->
116 |   <!--     </li> -->
117 |   <!--    -->
118 |   <!--     <li> -->
119 |   <!--       <a href="//plus.google.com/+googlePlusName"><span class="icon-googleplus"></span></a> -->
120 |   <!--     </li> -->
121 |   <!--    -->
122 |   <!--     <li> -->
123 |   <!--       <a href="//github.com/GithubHandle"><span class="icon-github"></span></a> -->
124 |   <!--     </li> -->
125 |   <!--    -->
126 |   <!--     <li> -->
127 |   <!--       <a href="//www.flickr.com/photos/FlickrUserID"><span class="icon-flickr"></span></a> -->
128 |   <!--     </li> -->
129 |   <!--    -->
130 |   <!--     <li> -->
131 |   <!--       <a href="/feed"><span class="icon-feed"></span></a> -->
132 |   <!--     </li> -->
133 |   <!--      -->
134 |   <!-- </ul> -->
135 | <div class="credits">
136 | <!-- <span>&#38;copy; 2018 <!&#45;&#45; &#38;#38;nbsp;&#38;#38;nbsp;ADITYA GROVER &#45;&#45;></span></br> <br> -->
137 | <span>Site created with <a href="//jekyllrb.com">Jekyll</a> using the <a href="//github.com/clayh53/tufte-jekyll">Tufte theme</a>. &copy; 2018</span> 
138 | </div>  
139 | </footer>
140 | 
141 |   </body>
142 | </html>
143 | 


--------------------------------------------------------------------------------
/docs/introduction/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 |   <head>
  4 |   <meta charset="utf-8">
  5 |   <meta http-equiv="X-UA-Compatible" content="IE=edge">
  6 |   <meta name="viewport" content="width=device-width, initial-scale=1">
  7 | 
  8 |   <title>Introduction</title>
  9 |   <meta name="description" content="Lecture notes for Deep Generative Models.">
 10 | 
 11 | 
 12 |   <link rel="stylesheet" href="/notes/css/tufte.css">	
 13 |   
 14 | 
 15 |   <!-- Google Fonts loaded here depending on setting in _data/options.yml true loads font, blank does not-->
 16 |   
 17 |     <link href='//fonts.googleapis.com/css?family=Lato:400,400italic' rel='stylesheet' type='text/css'>
 18 |   
 19 |   <!-- Load up MathJax script if needed ... specify in /_data/options.yml file-->
 20 |   
 21 |     <script type="text/javascript" src="//cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
 22 |   
 23 | 
 24 |   <script>
 25 |   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
 26 |   (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
 27 |   m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
 28 |   })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
 29 | 
 30 |   ga('create', 'UA-129020129-1', 'auto');
 31 |   ga('send', 'pageview');
 32 | 
 33 |   </script>
 34 | 
 35 |   <link rel="canonical" href="http://localhost:4000/notes/introduction/">
 36 |   <link rel="alternate" type="application/rss+xml" title="Deep Generative Models" href="http://localhost:4000/notes/feed.xml" />
 37 | </head>
 38 | 
 39 |   <body>
 40 |     <!--- Header and nav template site-wide -->
 41 | <header>
 42 |     <nav class="group">
 43 |         <a href="/notes/">Contents</a>
 44 | 	<a href="http://deepgenerativemodels.github.io">Class</a>
 45 | 	<a href="http://github.com/deepgenerativemodels/notes">Github</a>
 46 | 	</nav>
 47 | </header>
 48 | 
 49 |     <article class="group">
 50 |       <h1>Introduction</h1>
 51 | <p class="subtitle"></p>
 52 | 
 53 | 
 54 | <script type="text/x-mathjax-config">
 55 |   MathJax.Hub.Config({
 56 |     TeX: {
 57 |       Macros: {
 58 |         e: "\\epsilon",
 59 |         xti: "x^{(i)}",
 60 |         yti: "y^{(i)}",
 61 |         bfy: "{\\bf y}",
 62 |         bfx: "{\\bf x}",
 63 |         bfg: "{\\bf g}",
 64 |         bfbeta: "{\\bf \\beta}",
 65 |         tp: "\\tilde p",
 66 |         pt: "p_\\theta",
 67 |         Exp: "{\\mathbb{E}}",
 68 |         Ind: "{\\mathbb{I}}",
 69 |         KL: "{\\mathbb{KL}}",
 70 |         Dc: "{\\mathcal{D}}",
 71 |         Tc: "{\\mathcal{T}}",
 72 |         Xc: "{\\mathcal{X}}",
 73 |         note: ["\\textcolor{blue}{[NOTE: #1]}",1]
 74 |       }
 75 |     }
 76 |   });
 77 | </script>
 78 | 
 79 | 
 80 | <p>Intelligent agents are constantly generating, acquiring, and processing
 81 | data. This data could be in the form of <em>images</em> that we capture on our
 82 | phones, <em>text</em> messages we share with our friends, <em>graphs</em> that model
 83 | interactions on social media, <em>videos</em> that record important events,
 84 | etc. Natural agents excel at discovering patterns, extracting
 85 | knowledge, and performing complex reasoning based on the data they observe. How
 86 | can we build artificial learning systems to do the same?</p>
 87 | 
 88 | <p>In this course, we will study generative models that view the world under the lens of probability.
 89 | In such a worldview, we can think of any kind of
 90 | observed data, say <script type="math/tex">\mathcal{D}</script>, as a finite set of samples from an
 91 | underlying distribution, say <script type="math/tex">p_{\mathrm{data}}</script>. At its very core, the
 92 | goal of any generative model is then to approximate this data
 93 | distribution given access to the dataset <script type="math/tex">\mathcal{D}</script>. The hope is that
 94 | if we are able to <em>learn</em> a good generative model, we can use the
 95 | learned model for downstream <em>inference</em>.</p>
 96 | 
 97 | <h2 id="learning">Learning</h2>
 98 | 
 99 | <p>We will be primarily interested in parametric approximations to the data
100 | distribution, which summarize all the information about the dataset <script type="math/tex">\mathcal{D}</script> in
101 | a finite set of parameters. In contrast with non-parametric models,
102 | parametric models scale more efficiently with large datasets but are
103 | limited in the family of distributions they can represent.</p>
104 | 
105 | <p>In the parametric setting, we can think of the task of learning a
106 | generative model as picking the parameters within a family of model
107 | distributions that minimizes some notion of distance<sup id="fnref:1"><a href="#fn:1" class="footnote">1</a></sup> between the
108 | model distribution and the data distribution.</p>
109 | 
110 | <p><img src="learning_1.png" alt="drawing" width="200" class="center" /></p>
111 | 
112 | <p><img src="learning_2.png" alt="drawing" width="300" class="center" />
113 | <!-- ![given](learning_1.png =100x20)
114 | ![goal](learning_2.png =100x20) --></p>
115 | 
116 | <p>For instance, we might be given access to a dataset of dog images <script type="math/tex">\mathcal{D}</script> and
117 | our goal is to learn the paraemeters of  a generative model <script type="math/tex">\theta</script> within a model family <script type="math/tex">\mathcal{M}</script> such that
118 | the model distribution <script type="math/tex">p_\theta</script> is close to the data distribution over dogs
119 | <script type="math/tex">p_{\mathrm{data}}</script>. Mathematically, we can specify our goal as the
120 | following optimization problem: <script type="math/tex"></script>\begin{equation}
121 | \min_{\theta\in \mathcal{M}}d(p_{\mathrm{data}}, p_{\theta})
122 | \label{eq:learning_gm}
123 | \tag{1}
124 | \end{equation}<script type="math/tex"></script>where <script type="math/tex">p_{\mathrm{data}}</script> is accessed via the dataset
125 | <script type="math/tex">\mathcal{D}</script> and <script type="math/tex">d(\cdot)</script> is a notion of distance between probability distributions.</p>
126 | 
127 | <p>As we navigate through this course, it is interesting to take note of
128 | the difficulty of the problem at hand. A typical image from a modern
129 | phone camera has a resolution of approximately <script type="math/tex">700 \times 1400</script> pixels.
130 | Each pixel has three channels: R(ed), G(reen) and B(lue) and each
131 | channel can take a value between 0 to 255. Hence, the number of possible
132 | images is given by <script type="math/tex">256^{700 \times 1400 \times 3}\approx 10 ^{800000}</script>.
133 | In contrast, Imagenet, one of the largest publicly available datasets,
134 | consists of only about 15 million images. Hence, learning a generative
135 | model with such a limited dataset is a highly underdetermined problem.</p>
136 | 
137 | <p>Fortunately, the real world is highly structured and automatically
138 | discovering the underlying structure is key to learning generative
139 | models. For example, we can hope to learn some basic artifacts about
140 | dogs even with just a few images: two eyes, two ears, fur etc. Instead
141 | of incorporating this prior knowledge explicitly, we will hope the model
142 | learns the underlying structure directly from data. There is no free
143 | lunch however, and indeed successful learning of generative models will
144 | involve instantiating the optimization problem in
145 | <script type="math/tex">(\ref{eq:learning_gm})</script> in a suitable way. In this course, we will be
146 | primarily interested in the following questions:</p>
147 | 
148 | <ul>
149 |   <li>What is the representation for the model family <script type="math/tex">\mathcal{M}</script>?</li>
150 |   <li>What is the objective function <script type="math/tex">d(\cdot)</script>?</li>
151 |   <li>What is the optimization procedure for minimizing <script type="math/tex">d(\cdot)</script>?</li>
152 | </ul>
153 | 
154 | <p>In the next few set of lectures, we will take a deeper dive into certain
155 | families of generative models. For each model family, we will note how
156 | the representation is closely tied with the choice of learning objective
157 | and the optimization procedure.</p>
158 | 
159 | <h2 id="inference">Inference</h2>
160 | 
161 | <p>For a discriminative model such as logistic regression, the fundamental
162 | inference task is to predict a label for any given datapoint. Generative
163 | models, on the other hand, learn a joint distribution over the entire
164 | data.<sup id="fnref:2"><a href="#fn:2" class="footnote">2</a></sup></p>
165 | 
166 | <p>While the range of applications to which generative models have been
167 | used continue to grow, we can identify three fundamental inference
168 | queries for evaluating a generative model.:</p>
169 | 
170 | <ol>
171 |   <li>
172 |     <p><em>Density estimation:</em> Given a datapoint <script type="math/tex">\mathbf{x}</script>, what is the
173 | probability assigned by the model, i.e., <script type="math/tex">p_\theta(\mathbf{x})</script>?</p>
174 |   </li>
175 |   <li>
176 |     <p><em>Sampling:</em> How can we <em>generate</em> novel data from the model
177 | distribution, i.e.,
178 | <script type="math/tex">\mathbf{x}_{\mathrm{new}} \sim p_\theta(\mathbf{x})</script>?</p>
179 |   </li>
180 |   <li>
181 |     <p><em>Unsupervised representation learning:</em> How can we learn meaningful
182 | feature representations for a datapoint <script type="math/tex">\mathbf{x}</script>?</p>
183 |   </li>
184 | </ol>
185 | 
186 | <p>Going back to our example of learning a generative model over dog
187 | images, we can intuitively expect a good generative model to work as
188 | follows. For density estimation, we expect <script type="math/tex">p_\theta(\mathbf{x})</script> to be
189 | high for dog images and low otherwise. Alluding to the name <em>generative
190 | model</em>, sampling involves generating novel images of dogs beyond the
191 | ones we observe in our dataset. Finally, representation learning can
192 | help discover high-level structure in the data such as the breed of
193 | dogs.</p>
194 | 
195 | <p>In light of the above inference tasks, we note two caveats. First,
196 | quantitative evaluation of generative models on these tasks is itself
197 | non-trivial (in particular, sampling and representation learning) and an
198 | area of active research. Some quantitative metrics exist, but these
199 | metrics often fail to reflect desirable qualitative attributes in the
200 | generated samples and the learned representations. Secondly, not all
201 | model families permit efficient and accurate inference on all these
202 | tasks. Indeed, the trade-offs in the inference capabilities of the
203 | current generative models have led to the development of very diverse approaches as
204 | we shall see in this course.</p>
205 | 
206 | <h2 id="footnotes">Footnotes</h2>
207 | 
208 | <div class="footnotes">
209 |   <ol>
210 |     <li id="fn:1">
211 |       <p>As we shall see later, functions that do not satisfy all
212 | properties of a distance metric are also used in practice, e.g., KL
213 | divergence. <a href="#fnref:1" class="reversefootnote">&#8617;</a></p>
214 |     </li>
215 |     <li id="fn:2">
216 |       <p>Technically, a probabilistic discriminative model is also a
217 | generative model of the labels conditioned on the data. However, the
218 | usage of the term generative models is typically reserved for high
219 | dimensional data. <a href="#fnref:2" class="reversefootnote">&#8617;</a></p>
220 |     </li>
221 |   </ol>
222 | </div>
223 | 
224 | 
225 | 
226 |     </article>
227 |     <span class="print-footer">Introduction - Aditya Grover</span>
228 |     <footer>
229 |   <hr class="slender">
230 |   <!-- <ul class="footer&#45;links"> -->
231 |   <!--   <li><a href="mailto:hate@spam.net"><span class="icon&#45;mail"></span></a></li>     -->
232 |   <!--    -->
233 |   <!--     <li> -->
234 |   <!--       <a href="//www.twitter.com/twitter_handle"><span class="icon-twitter"></span></a> -->
235 |   <!--     </li> -->
236 |   <!--    -->
237 |   <!--     <li> -->
238 |   <!--       <a href="//plus.google.com/+googlePlusName"><span class="icon-googleplus"></span></a> -->
239 |   <!--     </li> -->
240 |   <!--    -->
241 |   <!--     <li> -->
242 |   <!--       <a href="//github.com/GithubHandle"><span class="icon-github"></span></a> -->
243 |   <!--     </li> -->
244 |   <!--    -->
245 |   <!--     <li> -->
246 |   <!--       <a href="//www.flickr.com/photos/FlickrUserID"><span class="icon-flickr"></span></a> -->
247 |   <!--     </li> -->
248 |   <!--    -->
249 |   <!--     <li> -->
250 |   <!--       <a href="/feed"><span class="icon-feed"></span></a> -->
251 |   <!--     </li> -->
252 |   <!--      -->
253 |   <!-- </ul> -->
254 | <div class="credits">
255 | <!-- <span>&#38;copy; 2018 <!&#45;&#45; &#38;#38;nbsp;&#38;#38;nbsp;ADITYA GROVER &#45;&#45;></span></br> <br> -->
256 | <span>Site created with <a href="//jekyllrb.com">Jekyll</a> using the <a href="//github.com/clayh53/tufte-jekyll">Tufte theme</a>. &copy; 2018</span> 
257 | </div>  
258 | </footer>
259 | 
260 |   </body>
261 | </html>
262 | 


--------------------------------------------------------------------------------
/docs/introduction/learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/introduction/learning.png


--------------------------------------------------------------------------------
/docs/introduction/learning_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/introduction/learning_1.png


--------------------------------------------------------------------------------
/docs/introduction/learning_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/introduction/learning_2.png


--------------------------------------------------------------------------------
/docs/vae/klgap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/vae/klgap.png


--------------------------------------------------------------------------------
/docs/vae/vae.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/docs/vae/vae.png


--------------------------------------------------------------------------------
/flow/flow-graphical.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/flow/flow-graphical.PNG


--------------------------------------------------------------------------------
/flow/flow-graphical.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/flow/flow-graphical.png


--------------------------------------------------------------------------------
/flow/iaf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/flow/iaf.PNG


--------------------------------------------------------------------------------
/flow/iaf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/flow/iaf.png


--------------------------------------------------------------------------------
/flow/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: post
  3 | title: Normalizing Flow Models
  4 | ---
  5 | 
  6 | We continue our study over another type of likelihood based generative models. As before, we assume we are given access to a dataset $$\mathcal{D}$$ of $$n$$-dimensional datapoints $$\mathbf{x}$$. So far we have learned two types of likelihood based generative models: 
  7 | 
  8 | 1. Autoregressive Models: $$p_\theta(\mathbf{x}) = \prod_{i=1}^{N} p_\theta(x_i \vert \mathbf{x}_{<i})$$
  9 | 
 10 | 2. Variational autoencoders: $$p_\theta(\mathbf{x}) = \int p_\theta(\mathbf{x}, \mathbf{z}) \text{d}\mathbf{z}$$
 11 | 
 12 | The two methods have relative strengths and weaknesses. Autoregressive models provide tractable likelihoods but no direct mechanism for learning features, whereas variational autoencoders can learn feature representations but have intractable marginal likelihoods.
 13 | 
 14 | In this section, we introduce normalizing flows: a type of method that combines the best of both worlds, allowing both feature learning and tractable marginal likelihood estimation.
 15 | 
 16 | # Change of Variables Formula
 17 | 
 18 | In normalizing flows, we wish to map simple distributions (easy to sample and evaluate densities) to complex ones (learned via data). The change of variables formula describe how to evaluate densities of a random variable that is a deterministic transformation from another variable.
 19 | 
 20 | **Change of Variables**: $$Z$$ and $$X$$ be random variables which are related by a mapping $$f: \mathbb{R}^n \to \mathbb{R}^n$$ such that $$X = f(Z)$$ and $$Z = f^{-1}(X)$$. Then 
 21 | 
 22 | {% math %}p_X(\mathbb{x}) = p_Z(f^{-1}(\mathbb{x})) \left\vert \text{det}\left(\frac{\partial f^{-1}(\mathbb{x})}{\partial \mathbb{x}}\right) \right\vert
 23 | 
 24 | {% endmath %}
 25 | 
 26 | There are several things to note here.
 27 | 
 28 | 1. $$\mathbb{x}$$ and $$\mathbb{z}$$ need to be continuous and have the same dimension.
 29 | 
 30 | 2. $$\frac{\partial f^{-1}(\mathbb{x})}{\partial \mathbb{x}}$$ is a matrix of dimension $$n \times n$$, where each entry at location $$(i, j)$$ is defined as $$\frac{\partial f^{-1}(\mathbb{x})_i}{\partial x_j}$$. This matrix is also known as the Jacobian matrix.
 31 | 
 32 | 3. $$\text{det}(A)$$ denotes the determinant of a square matrix $$A$$.
 33 | 
 34 | 4. For any invertible matrix $$A$$, $$\text{det}(A^{-1}) = \text{det}(A)^{-1}$$, so for $$\mathbb{z} = f^{-1}(\mathbb{x})$$ we have
 35 | 
 36 |    {% math %}
 37 | 
 38 |    p_X(\mathbb{x}) = p_Z(\mathbb{z}) \left\vert \text{det}\left(\frac{\partial f(\mathbb{z})}{\partial \mathbb{z}}\right) \right\vert^{-1}
 39 | 
 40 |    {% endmath %}
 41 | 
 42 | 5. If $$\left \vert \text{det}\left(\frac{\partial f(\mathbb{z})}{\partial \mathbb{z}}\right) \right\vert = 1$$, then the mappings is volume preserving, which means that the transformed distribution $$p_X$$ will have the same “volume” compared to the original one $$p_Z$$. 
 43 | 
 44 | 
 45 | 
 46 | # Normalizing Flow Models
 47 | 
 48 | We are ready to introduce normalizing flow models. Let us consider a directed, latent-variable model over observed variables $$X$$ and latent variables $$Z$$. In a **normalizing flow model**, the mapping between $$Z$$ and $$X$$, given by $$f_\theta: \mathbb{R}^n \to \mathbb{R}^n$$, is deterministic and invertible such that $$X = f_\theta(Z)$$ and $$Z  = f_\theta^{-1}(X)$$[^nf]. 
 49 | 
 50 | ![](flow-graphical.png)
 51 | 
 52 | Using change of variables, the marginal likelihood $$p(x)$$ is given by
 53 | 
 54 | {% math %}
 55 | 
 56 | p_X(\mathbb{x}; \theta) = p_Z(f_\theta^{-1}(\mathbb{x})) \left\vert \text{det}\left(\frac{\partial f^{-1}_\theta(\mathbb{x})}{\partial \mathbb{x}}\right) \right\vert
 57 | 
 58 | {% endmath %}
 59 | 
 60 | The name “normalizing flow” can be interpreted as the following:
 61 | 
 62 | 1. “Normalizing” means that the change of variables gives a normalized density after applying an invertible transformation.
 63 | 2. “Flow” means that the invertible transformations can be composed with each other to create more complex invertible transformations.
 64 | 
 65 | Different from autoregressive model and variational autoencoders, deep normalizing flow models require specific architectural structures. 
 66 | 
 67 | 1. The input and output dimensions must be the same.
 68 | 2. The transformation must be invertible.
 69 | 3. Computing the determinant of the Jacobian needs to be efficient (and differentiable).
 70 | 
 71 | Next, we introduce several popular forms of flow models that satisfy these properties.
 72 | 
 73 | The Planar Flow introduces the following invertible transformation
 74 | 
 75 | {% math %}
 76 | \mathbf{x} = f_\theta(z) = \mathbf{z} + \mathbf{u} h(\mathbf{w}^\top \mathbf{z} + b)
 77 | {% endmath %}
 78 | 
 79 | where $$\mathbf{u}, \mathbf{w}, b$$ are parameters.
 80 | 
 81 | The absolute value of the determinant of the Jacobian is given by
 82 | 
 83 | {% math %}
 84 | \left\vert \text{det}\left(\frac{\partial f(\mathbb{z})}{\partial \mathbb{z}}\right) \right\vert = \left\vert 1 + h'(\mathbf{w}^\top \mathbf{z} + b) \mathbf{u}^\top \mathbf{w} \right\vert
 85 | {% endmath %}
 86 | 
 87 | However,  $$\mathbf{u}, \mathbf{w}, b, h(\cdot)$$ need to be restricted in order to be invertible. For example, $$h = \tanh$$ and $$h'(\mathbf{w}^\top \mathbf{z} + b) \mathbf{u}^\top \mathbf{w} \geq -1$$. Note that while $$f_\theta(\mathbf{z})$$ is invertible, computing $$f_\theta^{-1}(\mathbf{z})$$ could be difficult analytically.  The following models address this problem, where both $$f_\theta$$ and $$f_\theta^{-1}$$ have simple analytical forms. 
 88 | 
 89 | The Nonlinear Independent Components Estimation (NICE) model and Real Non-Volume Preserving (RealNVP) model composes two kinds of invertible transformations: additive coupling layers and rescaling layers. The coupling layer in NICE partitions a variable $$\mathbf{z}$$ into two disjoint subsets, say $$\mathbf{z}_1$$ and $$\mathbf{z}_2$$. Then it applies the following transformation:
 90 | 
 91 | Forward mapping $$\mathbf{z} \to \mathbf{x}$$
 92 | 1. $$\mathbf{x}_1 = \mathbf{z}_1$$, which is an identity mapping.
 93 | 
 94 | 2. $$\mathbf{x}_2 = \mathbf{z}_2 + m_\theta(\mathbf{z_1})$$, where $$m_\theta$$ is a neural network.
 95 | 
 96 | Inverse mapping $$\mathbf{x} \to \mathbf{z}$$:
 97 | 1. $$\mathbf{z}_1 = \mathbf{x}_1$$, which is an identity mapping.
 98 | 
 99 | 2. $$\mathbf{z}_2 = \mathbf{x}_2 - m_\theta(\mathbf{x_1})$$, which is the inverse of the forward transformation.
100 | 
101 | Therefore, the Jacobian of the forward mapping is lower triangular, whose determinant is simply the product of the elements on the diagonal, which is 1. Therefore, this defines a volume preserving transformation. RealNVP adds scaling factors to the transformation:
102 | 
103 | {% math %}
104 | 
105 | \mathbf{x}_2 = \exp(s_\theta(\mathbb{z}_1)) \odot \mathbf{z}_2 + m_\theta(\mathbf{z_1})
106 | 
107 | {% endmath %}
108 | 
109 | where $$\odot$$ denotes elementwise product. This results in a non-volume preserving transformation.
110 | 
111 | Some autoregressive models can also be interpreted as flow models. For a Gaussian autoregressive model, one receive some Gaussian noise for each dimension of $$\mathbb{x}$$, which can be treated as the latent variables $$\mathbf{z}$$. Such transformations are also invertible, meaning that given $$\mathbf{x}$$ and the model parameters, we can obtain $$\mathbf{z}$$ exactly.
112 | 
113 | Masked Autoregressive Flow (MAF) uses this interpretation, where the forward mapping is an autoregressive model. However, sampling is sequential and slow, in $$O(n)$$ time where $$n$$ is the dimension of the samples.
114 | 
115 | ![](maf.png)
116 | 
117 | To address the sampling problem, the Inverse Autoregressive Flow (IAF) simply inverts the generating process. In this case, generating $$\mathbf{x}$$ from the noise can be parallelized, but computing the likelihood of new data points is slow. However, for generated points the likelihood can be computed efficiently (since the noise are already obtained).
118 | 
119 | ![](iaf.png)
120 | 
121 | Parallel WaveNet combines the best of both worlds for IAF and MAF where it uses an IAF student model to retrieve sample and a MAF teacher model to compute likelihood. The teacher model can be efficiently trained via maximum likelihood, and the student model is trained by minimizing the KL divergence between itself and the teacher model. Since computing the IAF likelihood for an IAF sample is efficient, this process is efficient.  
122 | 
123 | # Footnotes
124 | 
125 | [^nf]: Recall the conditions for change of variable formula.
126 | 
127 | 


--------------------------------------------------------------------------------
/flow/maf.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/flow/maf.PNG


--------------------------------------------------------------------------------
/flow/maf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/flow/maf.png


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.eot


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-bold-line-figures/et-bembo-bold-line-figures.woff


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.eot


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-display-italic-old-style-figures/et-bembo-display-italic-old-style-figures.woff


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.eot


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-roman-line-figures/et-bembo-roman-line-figures.woff


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.eot


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-roman-old-style-figures/et-bembo-roman-old-style-figures.woff


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.eot


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-bembo/et-bembo-semi-bold-old-style-figures/et-bembo-semi-bold-old-style-figures.woff


--------------------------------------------------------------------------------
/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.eot


--------------------------------------------------------------------------------
/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-bold-line-figures/et-book-bold-line-figures.woff


--------------------------------------------------------------------------------
/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.eot


--------------------------------------------------------------------------------
/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-display-italic-old-style-figures/et-book-display-italic-old-style-figures.woff


--------------------------------------------------------------------------------
/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.eot


--------------------------------------------------------------------------------
/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-roman-line-figures/et-book-roman-line-figures.woff


--------------------------------------------------------------------------------
/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.eot


--------------------------------------------------------------------------------
/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-roman-old-style-figures/et-book-roman-old-style-figures.woff


--------------------------------------------------------------------------------
/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.eot


--------------------------------------------------------------------------------
/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.ttf


--------------------------------------------------------------------------------
/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/et-book/et-book-semi-bold-old-style-figures/et-book-semi-bold-old-style-figures.woff


--------------------------------------------------------------------------------
/fonts/icomoon.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/icomoon.eot


--------------------------------------------------------------------------------
/fonts/icomoon.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" standalone="no"?>
 2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" >
 3 | <svg xmlns="http://www.w3.org/2000/svg">
 4 | <metadata>Generated by IcoMoon</metadata>
 5 | <defs>
 6 | <font id="icomoon" horiz-adv-x="512">
 7 | <font-face units-per-em="512" ascent="480" descent="-32" />
 8 | <missing-glyph horiz-adv-x="512" />
 9 | <glyph unicode="&#x20;" d="" horiz-adv-x="256" />
10 | <glyph unicode="&#xe600;" d="M192 160l64 32 224 224-32 32-224-224-32-64zM144.65 46.451c-15.816 33.364-32.833 50.381-66.198 66.197l49.548 136.396 64 38.956 192 192h-96l-192-192-96-320 320 96 192 192v96l-192-192-38.956-64z" />
11 | <glyph unicode="&#xe601;" d="M448 416h-48v-16c0-17.645-14.355-32-32-32s-32 14.355-32 32v16h-160v-16c0-17.645-14.355-32-32-32s-32 14.355-32 32v16h-48c-17.6 0-32-14.4-32-32v-352c0-17.6 14.4-32 32-32h384c17.6 0 32 14.4 32 32v352c0 17.6-14.4 32-32 32zM448 32.058c-0.017-0.020-0.038-0.041-0.058-0.058h-383.885c-0.020 0.017-0.041 0.038-0.057 0.058v287.942h384v-287.942zM144 384c8.836 0 16 7.164 16 16v64c0 8.836-7.164 16-16 16s-16-7.164-16-16v-64c0-8.836 7.164-16 16-16zM368 384c8.836 0 16 7.164 16 16v64c0 8.836-7.164 16-16 16s-16-7.164-16-16v-64c0-8.836 7.164-16 16-16zM288 288h-160v-32h128v-64h-128v-32h128v-64h-128v-32h160zM352 64h32v224h-64v-32h32zM436-12h-360c-17.6 0-32 10.4-32 28v-16c0-17.6 14.4-32 32-32h360c17.6 0 32 14.4 32 32v16c0-17.6-14.4-28-32-28z" />
12 | <glyph unicode="&#xe602;" d="M476.698 442.679l-2.014 2.021c-47.074 47.067-124.097 47.067-171.163 0l-109.053-109.068c-47.067-47.066-47.067-124.088 0-171.155l2.013-2.013c3.916-3.924 8.073-7.462 12.368-10.729l39.924 39.925c-4.651 2.747-9.063 6.036-13.058 10.030l-2.021 2.021c-25.557 25.549-25.557 67.136 0 92.695l109.064 109.056c25.558 25.559 67.137 25.559 92.693 0l2.021-2.012c25.55-25.558 25.55-67.146 0-92.695l-49.343-49.343c8.566-21.154 12.624-43.7 12.269-66.193l76.302 76.302c47.067 47.068 47.067 124.089-0.002 171.158zM315.521 285.533c-3.916 3.916-8.073 7.461-12.368 10.72l-39.924-39.916c4.652-2.748 9.063-6.037 13.058-10.031l2.021-2.020c25.558-25.558 25.558-67.136 0-92.694l-109.065-109.067c-25.559-25.551-67.138-25.551-92.694 0l-2.021 2.021c-25.549 25.56-25.549 67.138 0 92.694l49.344 49.343c-8.567 21.153-12.623 43.701-12.269 66.193l-76.301-76.299c-47.068-47.066-47.068-124.089 0-171.162l2.013-2.016c47.076-47.064 124.096-47.064 171.164 0l109.055 109.059c47.067 47.066 47.067 124.097 0 171.163l-2.013 2.012z" />
13 | <glyph unicode="&#xe603;" d="M256 480c-141.385 0-256-114.615-256-256s114.615-256 256-256 256 114.615 256 256-114.615 256-256 256zM224 384h64v-64h-64v64zM320 64h-128v32h32v128h-32v32h96v-160h32v-32z" />
14 | <glyph unicode="&#xe604;" d="M256 480c-141.385 0-256-114.615-256-256s114.615-256 256-256 256 114.615 256 256-114.615 256-256 256zM384 306.745l-82.744-82.745 82.744-82.744v-45.256h-45.256l-82.744 82.744-82.745-82.744h-45.255v45.256l82.745 82.744-82.745 82.745v45.255h45.255l82.745-82.745 82.744 82.745h45.256v-45.255z" />
15 | <glyph unicode="&#xe605;" d="M256 480c-141.385 0-256-114.615-256-256s114.615-256 256-256 256 114.615 256 256-114.615 256-256 256zM208 64l-106 138 47 49 59-75 185 151 23-23-208-240z" />
16 | <glyph unicode="&#xe606;" d="M512 112l-144 368h-224l-144-144v-224l144-144h224l144 144v224l-144 144zM288 64h-64v64h64v-64zM288 192h-64v192h64v-192z" />
17 | <glyph unicode="&#xe607;" d="M426.671 480h-341.328c-46.937 0-85.343-38.405-85.343-85.345v-341.311c0-46.969 38.406-85.344 85.343-85.344h341.328c46.938 0 85.329 38.375 85.329 85.345v341.31c0 46.94-38.391 85.345-85.329 85.345zM91.314 36.687l-38.628 38.627 128 128 14.628-14.627-104-152zM76.686 380.686l14.628 14.628 164.686-132.687 164.687 132.687 14.627-14.628-179.314-211.313-179.314 211.313zM420.687 36.687l-104 152 14.627 14.627 128-128-38.627-38.627z" />
18 | <glyph unicode="&#xe608;" d="M0.403 45.168c-0.122 1.266-0.226 2.535-0.292 3.815 0.065-1.28 0.17-2.549 0.292-3.815zM117.954 197.426c46.005-1.369 76.867 46.349 68.931 106.599-7.947 60.24-51.698 108.584-97.704 109.961-46.013 1.365-76.87-44.741-68.926-105 7.941-60.234 51.676-110.187 97.699-111.56zM512 352v42.655c0 46.94-38.391 85.345-85.329 85.345h-341.328c-46.138 0-84.006-37.116-85.282-82.963 29.181 25.693 69.662 47.158 111.437 47.158 44.652 0 178.622 0 178.622 0l-39.974-33.809h-56.634c37.565-14.402 57.578-58.062 57.578-102.861 0-37.624-20.905-69.977-50.444-92.984-28.822-22.451-34.286-31.854-34.286-50.939 0-16.289 30.873-44 47.016-55.394 47.191-33.269 62.458-64.156 62.458-115.728 0-8.214-1.021-16.415-3.033-24.48h153.871c46.937 0 85.328 38.375 85.328 85.345v266.654h-96v-95.999h-32v96h-95.999v32h95.999v96h32v-96h96zM92.943 97.032c10.807 0 20.711 0.295 30.968 0.295-13.573 13.167-24.313 29.3-24.313 49.19 0 11.804 3.782 23.168 9.067 33.26-5.391-0.385-10.895-0.497-16.563-0.497-37.178 0-68.753 12.038-92.102 31.927v-33.621l0.003-100.865c26.72 12.687 58.444 20.311 92.94 20.311zM1.71 36.371c-0.556 2.729-0.983 5.503-1.271 8.317 0.287-2.814 0.715-5.588 1.271-8.317zM227.725 3.577c-7.529 29.403-34.227 43.982-71.444 69.784-13.536 4.366-28.447 6.937-44.447 7.104-44.809 0.482-86.554-17.471-110.108-44.186 7.96-38.853 42.517-68.279 83.617-68.279h143.222c0.908 5.564 1.348 11.316 1.348 17.216 0 6.267-0.767 12.396-2.188 18.361z" />
19 | <glyph unicode="&#xe609;" d="M426.672 480h-341.33c-46.936 0-85.342-38.407-85.342-85.344v-341.313c0-46.969 38.406-85.343 85.342-85.343l341.33 0.001c46.938 0 85.328 38.373 85.328 85.344v341.311c0 46.937-38.391 85.344-85.328 85.344zM435.296 224h-83.296v-224h-96v224h-46.263v73.282h46.263v47.593c0 64.671 27.896 103.125 103.935 103.125h87.622v-79.285h-71.565c-21.241 0.035-23.876-11.076-23.876-31.756l-0.116-39.677h96l-12.704-73.282z" />
20 | <glyph unicode="&#xe60a;" d="M426.671 480h-341.328c-46.937 0-85.343-38.405-85.343-85.345v-341.311c0-46.969 38.406-85.344 85.343-85.344h341.328c46.938 0 85.329 38.375 85.329 85.345v341.31c0 46.94-38.391 85.345-85.329 85.345zM419.026 309.083c0.164-3.671 0.245-7.364 0.245-11.074 0-113.107-84.608-243.534-239.329-243.534-47.502 0-91.717 14.174-128.943 38.459 6.58-0.794 13.276-1.197 20.065-1.197 39.411 0 75.679 13.685 104.467 36.641-36.808 0.69-67.872 25.438-78.577 59.441 5.137-1 10.406-1.537 15.826-1.537 7.672 0 15.103 1.048 22.16 3.004-38.48 7.866-67.475 42.458-67.475 83.928 0 0.361 0 0.719 0.008 1.076 11.34-6.41 24.312-10.26 38.1-10.705-22.571 15.349-37.421 41.546-37.421 71.244 0 15.685 4.147 30.389 11.389 43.029 41.487-51.785 103.468-85.86 173.377-89.431-1.435 6.266-2.179 12.798-2.179 19.507 0 47.269 37.663 85.59 84.115 85.59 24.195 0 46.059-10.393 61.401-27.029 19.16 3.838 37.162 10.96 53.416 20.771-6.281-19.988-19.617-36.761-36.983-47.355 17.013 2.069 33.226 6.67 48.31 13.477-11.273-17.162-25.535-32.238-41.972-44.305z" />
21 | <glyph unicode="&#xe60b;" d="M426.67 480h-341.327c-46.938 0-85.343-38.406-85.343-85.344v-341.314c0-46.967 38.406-85.342 85.344-85.342h341.326c46.938 0 85.33 38.374 85.33 85.342v341.314c0 46.938-38.392 85.344-85.33 85.344zM139.472 64.376c-23.985 0-43.472 19.346-43.472 43.314 0 23.842 19.486 43.406 43.472 43.406 24.079 0 43.53-19.564 43.53-43.406-0.001-23.968-19.452-43.314-43.53-43.314zM248.734 64.002c0 40.905-15.904 79.409-44.73 108.222-28.857 28.875-67.188 44.813-107.952 44.813v62.593c118.826 0 215.563-96.721 215.563-215.627l-62.881-0.001zM359.814 64.002c0 145.531-118.329 263.97-263.688 263.97v62.624c180.001 0 326.473-146.562 326.473-326.596l-62.785 0.002z" />
22 | <glyph unicode="&#xe60c;" d="M426.688 480h-341.344c-46.938 0-85.344-38.406-85.344-85.341v-341.313c0-46.971 38.406-85.346 85.344-85.346h341.341c46.937 0 85.315 38.377 85.315 85.348v341.311c0 46.935-38.378 85.341-85.312 85.341zM144 144c-44.183 0-80 35.817-80 80s35.817 80 80 80 80-35.817 80-80c0-44.183-35.817-80-80-80zM368 144c-44.184 0-80 35.817-80 80s35.816 80 80 80 80-35.817 80-80c0-44.183-35.816-80-80-80z" />
23 | <glyph unicode="&#xe60d;" d="M131.736 100.316c0-13.526 11.447-20.291 34.334-20.291 20.032 0 30.044 7.021 30.044 21.067 0 13.269-10.923 19.898-32.772 19.898-21.073 0.002-31.606-6.888-31.606-20.674zM426.674 480h-341.33c-46.938 0-85.344-38.405-85.344-85.344v-341.313c0-46.968 38.406-85.343 85.344-85.343h341.33c46.934 0 85.326 38.375 85.326 85.344v341.312c0 46.939-38.391 85.344-85.326 85.344zM237.864 275.506c-3.642-1.302-8.844-2.729-15.606-4.293 2.080-5.983 3.123-11.576 3.123-16.777 0-16.646-5.009-31.149-15.022-43.506-10.015-12.354-22.956-19.703-38.823-22.043-10.402-1.563-15.604-7.156-15.604-16.779 0-3.381 1.689-6.765 5.072-10.145 4.422-4.943 10.924-8.066 19.509-9.367 37.195-5.721 55.793-21.197 55.793-46.427 0-40.321-24.062-60.481-72.181-60.481-19.771 0-36.028 3.513-48.772 10.535-16.129 8.844-24.188 22.76-24.188 41.75 0 21.852 12.096 36.808 36.282 44.869v0.78c-8.842 5.462-13.263 13.785-13.263 24.972 0 14.566 4.162 23.672 12.484 27.311v0.781c-8.323 2.862-15.737 9.366-22.24 19.512-7.282 10.926-10.925 22.63-10.925 35.115 0 18.729 6.635 34.335 19.896 46.82 12.748 11.707 27.964 17.558 45.653 17.558 12.747 0 24.579-3.119 35.505-9.361 12.485 0 26.921 3.12 43.313 9.361l-0.006-40.185zM301.089 140.503h-44.093c0.521 5.201 0.779 14.042 0.779 26.532v121.346c0 12.227-0.261 20.682-0.779 25.359h44.093c-0.521-4.942-0.781-13.135-0.781-24.582v-119.782c0-13.267 0.261-22.892 0.781-28.873zM298.159 352.759c-5.332-5.722-11.641-8.581-18.923-8.581-7.543 0-13.979 2.859-19.313 8.581-5.333 5.726-8.001 12.486-8.001 20.289 0 8.067 2.668 14.956 8.001 20.683 5.333 5.721 11.77 8.584 19.313 8.584 7.282 0 13.592-2.863 18.923-8.584 5.333-5.728 7.999-12.615 7.999-20.683 0-7.803-2.666-14.564-7.999-20.289zM420.845 145.576c-9.627-5.205-21.203-7.803-34.729-7.803-18.985 0-32.124 6.76-39.405 20.287-5.465 10.145-8.193 26.14-8.193 47.99v69.842h0.389v0.781l-5.853 0.391c-3.382 0-7.806-0.391-13.267-1.171v37.847h19.117v15.216c0 7.285-0.389 13.14-1.17 17.56h45.259c-0.775-4.94-1.168-10.533-1.168-16.776v-15.999h33.943v-37.846c-1.299 0-3.705 0.129-7.218 0.389-3.512 0.261-6.828 0.393-9.949 0.393h-16.777v-72.572c0-17.432 5.725-26.145 17.167-26.145 8.063 0 15.348 2.209 21.854 6.632v-39.016zM160.608 279.797c-16.388 0-24.582-9.627-24.582-28.873 0-17.951 8.194-26.924 24.582-26.924 15.868 0 23.801 9.104 23.801 27.313 0 7.545-1.82 14.047-5.461 19.511-4.423 5.981-10.537 8.973-18.34 8.973z" />
24 | <glyph unicode="&#xe60e;" d="M416 448h-320l-96-96v-336c0-8.837 7.163-16 16-16h480c8.836 0 16 7.163 16 16v336l-96 96zM256 64l-160 128h96v96h128v-96h96l-160-128zM77.255 384l32 32h293.489l32-32h-357.489z" />
25 | <glyph unicode="&#xe60f;" d="M0 416v-384h512v384h-512zM96 64h-64v64h64v-64zM96 192h-64v64h64v-64zM96 320h-64v64h64v-64zM384 64h-256v320h256v-320zM480 64h-64v64h64v-64zM480 192h-64v64h64v-64zM480 320h-64v64h64v-64zM192 320v-192l128 96z" />
26 | </font></defs></svg>


--------------------------------------------------------------------------------
/fonts/icomoon.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/icomoon.ttf


--------------------------------------------------------------------------------
/fonts/icomoon.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/fonts/icomoon.woff


--------------------------------------------------------------------------------
/gan/cyclegan_gendisc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/gan/cyclegan_gendisc.png


--------------------------------------------------------------------------------
/gan/gan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/gan/gan.png


--------------------------------------------------------------------------------
/gan/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: post
  3 | title: Generative Adversarial Networks
  4 | ---
  5 | 
  6 | We now move onto another family of generative models called generative adversarial networks (GANs). GANs are unique from all the other model families that we have seen so far, such as autoregressive models, VAEs, and normalizing flow models, because we do not train them using maximum likelihood. 
  7 | 
  8 | 
  9 | Likelihood-free learning
 10 | ==============
 11 | 
 12 | Why not? In fact, it is not so clear that better likelihood numbers necessarily correspond to higher sample quality. We know that the *optimal generative model* will give us the best sample quality and highest test log-likelihood. However, models with high test log-likelihoods can still yield poor samples, and vice versa. To see why, consider pathological cases in which our model is comprised almost entirely of noise, or our model simply memorizes the training set. Therefore, we turn to *likelihood-free training* with the hope that optimizing a different objective will allow us to disentangle our desiderata of obtaining high likelihoods as well as high-quality samples.
 13 | 
 14 | Recall that maximum likelihood required us to evaluate the likelihood of the data under our model $$p_\theta$$. A natural way to set up a likelihood-free objective is to consider the *two-sample test*, a statistical test that determines whether or not a finite set of samples from two distributions are from the same distribution *using only samples from $$P$$ and $$Q$$*. Concretely, given $$S_1 = \{\mathbf{x} \sim P\}$$ and $$S_2 = \{\mathbf{x} \sim Q\}$$, we compute a test statistic $$T$$ according to the difference in $$S_1$$ and $$S_2$$ that, when less than a threshold $$\alpha$$, accepts the null hypothesis that $$P = Q$$. 
 15 | 
 16 | Analogously, we have in our generative modeling setup access to our training set $$S_1 = \mathcal{D} = \{\mathbf{x} \sim p_{\textrm{data}} \}$$ and $$S_2 = \{\mathbf{x} \sim p_{\theta} \}$$. The key idea is to train the model to minimize a *two-sample test objective* between $$S_1$$ and $$S_2$$. But this objective becomes extremely difficult to work with in high dimensions, so we choose to optimize a surrogate objective that instead *maximizes some distance* between $$S_1$$ and $$S_2$$.
 17 | 
 18 | GAN Objective
 19 | ==============
 20 | 
 21 | We thus arrive at the generative adversarial network formulation. There are two components in a GAN: (1) a generator and (2) a discriminator. The generator $$G_\theta$$ is a directed latent variable model that deterministically generates samples $$\mathbf{x}$$ from $$\mathbf{z}$$, and the discriminator $$D_\phi$$ is a function whose job is to distinguish samples from the real dataset and the generator. The image below is a graphical model of $$G_\theta$$ and $$D_\phi$$. $$\mathbf{x}$$ denotes samples (either from data or generator), $$\mathbf{z}$$ denotes our noise vector, and $$\mathbf{y}$$ denotes the discriminator's prediction about $$\mathbf{x}$$.
 22 | 
 23 | <figure>
 24 | <center><img src="gan.png" alt="drawing" width="300" class="center"/></center>
 25 | <!-- <figcaption>
 26 | Graphical model of generator $$G_\theta$$ and discriminator $$D_\phi$$.
 27 |  </figcaption> -->
 28 | </figure>
 29 | 
 30 | 
 31 | The generator and discriminator both play a two player minimax game, where the generator minimizes a two-sample test objective ($$p_{\textrm{data}} = p_\theta$$) and the discriminator maximizes the objective ($$p_{\textrm{data}} \neq p_\theta$$). Intuitively, the generator tries to fool the discriminator to the best of its ability by generating samples that look indistinguishable from $$p_{\textrm{data}}$$. 
 32 | 
 33 | Formally, the GAN objective can be written as:
 34 | 
 35 | {% math %}
 36 | \min_{\theta} \max_{\phi} V(G_\theta, D_\phi) = \mathbb{E}_{\mathbf{x} \sim \textbf{p}_{\textrm{data}}}[\log D_\phi(\textbf{x})] + 
 37 | \mathbb{E}_{\mathbf{z} \sim p(\textbf{z})}[\log (1-D_\phi(G_\theta(\textbf{z})))]
 38 | {% endmath %}
 39 | 
 40 | Let's unpack this expression. We know that the discriminator is maximizing this function with respect to its parameters $$\phi$$, where given a fixed generator $$G_\theta$$ it is performing binary classification: it assigns probability 1 to data points from the training set $$\mathbf{x} \sim p_{\textrm{data}}$$, and assigns probability 0 to generated samples $$\mathbf{x} \sim p_G$$. In this setup, the optimal discriminator is:
 41 | 
 42 | {% math %}
 43 | D^*_{G}(\mathbf{x}) = \frac{p_{\textrm{data}}(\mathbf{x})}{p_{\textrm{data}}(\mathbf{x}) + p_G(\mathbf{x})}
 44 | {% endmath %}
 45 | 
 46 | On the other hand, the generator minimizes this objective for a fixed discriminator $$D_\phi$$. And after performing some algebra, plugging in the optimal discriminator $$D^*_G(\cdot)$$ into the overall objective $$V(G_\theta, D^*_G(\mathbf{x}))$$ gives us:
 47 | 
 48 | {% math %}
 49 | 2D_{\textrm{JSD}}[p_{\textrm{data}}, p_G] - \log 4
 50 | {% endmath %}
 51 | 
 52 | The $$D_{\textrm{JSD}}$$ term is the *Jenson-Shannon Divergence*, which is also known as the symmetric form of the KL divergence:
 53 | 
 54 | {% math %}
 55 | D_{\textrm{JSD}}[p, q] = \frac{1}{2} \left( D_{\textrm{KL}}\left[p, \frac{p+q}{2} \right] + D_{\textrm{KL}}\left[q, \frac{p+q}{2} \right] \right)
 56 | {% endmath %}
 57 | 
 58 | The JSD satisfies all properties of the KL, and has the additional perk that $$D_{\textrm{JSD}}[p,q] = D_{\textrm{JSD}}[q,p]$$. With this distance metric, the optimal generator for the GAN objective becomes $$p_G = p_{\textrm{data}}$$, and the optimal objective value that we can achieve with optimal generators and discriminators $$G^*(\cdot)$$ and $$D^*_{G^*}(\mathbf{x})$$ is $$-\log 4$$.
 59 | 
 60 | 
 61 | GAN training algorithm
 62 | ==============
 63 | 
 64 | Thus, the way in which we train a GAN is as follows:
 65 | 
 66 | For epochs $$1, \ldots, N$$ do:
 67 | 1. Sample minibatch of size $$m$$ from data: $$\mathbf{x}^{(1)}, \ldots, \mathbf{x}^{(m)} \sim \mathcal{D}$$
 68 | 2. Sample minibatch of size $$m$$ of noise: $$\mathbf{z}^{(1)}, \ldots, \mathbf{z}^{(m)} \sim p_z$$
 69 | 3. Take a gradient *descent* step on the generator parameters $$\theta$$:
 70 | 	{% math %}
 71 | 	\triangledown_\theta V(G_\theta, D_\phi) = \frac{1}{m} \triangledown_\theta \sum_{i=1}^m \log \left(1 - D_\phi(G_\theta(\mathbf{z}^{(i)})) \right)
 72 | 	{% endmath %} 
 73 | 4. Take a gradient *ascent* step on the discriminator parameters $$\phi$$:
 74 | 	{% math %}
 75 | 	\triangledown_\phi V(G_\theta, D_\phi) = \frac{1}{m} \triangledown_\phi \sum_{i=1}^m \left[\log D_\phi(\mathbf{x}^{(i)}) + \log (1 - D_\phi(G_\theta(\mathbf{z}^{(i)}))) \right]
 76 | 	{% endmath %} 
 77 | 
 78 | 
 79 | Challenges
 80 | ==============
 81 | 
 82 | Although GANs have been successfully applied to several domains and tasks, working with them in practice is challenging because of their: (1) unstable optimization procedure, (2) potential for mode collapse, (3) difficulty in evaluation.
 83 | 
 84 | During optimization, the generator and discriminator loss often continue to oscillate without converging to a clear stopping point. Due to the lack of a robust stopping criteria, it is difficult to know when exactly the GAN has finished training. Additionally, the generator of a GAN can often get stuck producing one of a few types of samples over and over again (mode collapse). Most fixes to these challenges are empirically driven, and there has been a significant amount of work put into developing new architectures, regularization schemes, and noise perturbations in an attempt to circumvent these issues. Soumith Chintala has a nice [link](https://github.com/soumith/ganhacks) outlining various tricks of the trade to stabilize GAN training.
 85 | 
 86 | 
 87 | Selected GANs
 88 | ==============
 89 | 
 90 | Next, we focus our attention to a few select types of GAN architectures and explore them in more detail. 
 91 | 
 92 | ### f-GAN
 93 | The [f-GAN](https://arxiv.org/abs/1606.00709) optimizes the variant of the two-sample test objective that we have discussed so far, but using a very general notion of distance: the $$f divergence$$. Given two densities $$p$$ and $$q$$, the $$f$$-divergence can be written as: 
 94 | 
 95 | {% math %}
 96 | D_f(p,q) = \mathbb{E}_{\mathbf{x}\sim q}\left[f \left(\frac{p(\mathbf{x})}{q(\mathbf{x})} \right) \right]
 97 | {% endmath %}
 98 | where $$f$$ is any convex[^1], lower-semicontinuous[^2] function with $$f(1) = 0$$. Several of the distance "metrics" that we have seen so far fall under the class of f-divergences, such as KL, Jenson-Shannon, and total variation. 
 99 | 
100 | To set up the f-GAN objective, we borrow two commonly used tools from convex optimization[^3]: the Fenchel conjugate and duality. Specifically, we obtain a lower bound to any f-divergence via its Fenchel conjugate: 
101 | 
102 | {% math %}
103 | D_f(p,q) \geq \sup_{T \in \mathcal{T}} \left(\mathbb{E}_{x \sim p}[T(\mathbf{x})] - \mathbb{E}_{x \sim q}[f^*(T(\mathbf{x}))] \right)
104 | {% endmath %}
105 | 
106 | Therefore we can choose any f-divergence that we desire, let $$p = p_{\textrm{data}}$$ and $$q = p_G$$, parameterize $$T$$ by $$\phi$$ and $$G$$ by $$\theta$$, and obtain the following fGAN objective:
107 | 
108 | {% math %}
109 | \min_\theta \max_\phi F(\theta,\phi) =  \mathbb{E}_{x \sim p_{\textrm{data}}}[T_\phi(\mathbf{x})] - \mathbb{E}_{x \sim p_{G_\theta}}[f^*(T_\phi(\mathbf{x}))]
110 | {% endmath %}
111 | 
112 | Intuitively, we can think about this objective as the generator trying to minimize the divergence estimate, while the discriminator tries to tighten the lower bound.
113 | 
114 | ### BiGAN
115 | We won't worry too much about the [BiGAN](https://arxiv.org/abs/1605.09782) in these notes. However, we can think about this model as one that allows us to infer latent representations even within a GAN framework.
116 | 
117 | ### CycleGAN
118 | [CycleGAN](https://arxiv.org/abs/1703.10593) is a type of GAN that allows us to do unsupervised image-to-image translation, from two domains $$\mathcal{X} \leftrightarrow \mathcal{Y}$$.
119 | 
120 | Specifically, we learn two conditional generative models: $$G: \mathcal{X} \leftrightarrow \mathcal{Y}$$ and $$F: \mathcal{Y} \leftrightarrow \mathcal{X}$$. There is a discriminator $$D_\mathcal{Y}$$ associated with $$G$$ that compares the true $$Y$$ with the generated samples $$\hat{Y} = G(X)$$. Similarly, there is another discriminator $$D_\mathcal{X}$$ associated with $$F$$ that compares the true $$X$$ with the generated samples $$\hat{X} = F(Y)$$. The figure below illustrates the CycleGAN setup:
121 | 
122 | <figure>
123 | <center><img src="cyclegan_gendisc.png" alt="drawing" width="300" class="center"/></center>
124 | <!-- <figcaption>
125 | Graphical model of generator $$G_\theta$$ and discriminator $$D_\phi$$.
126 |  </figcaption> -->
127 | </figure>
128 | 
129 | CycleGAN enforces a property known as *cycle consistency*, which states that if we can go from $$X$$ to $$\hat{Y}$$ via $$G$$, then we should also be able to go from $$\hat{Y}$$ to $$X$$ via $$F$$. The overall loss function can be written as:
130 | 
131 | {% math %}
132 | \min_{F, G, D_\mathcal{X}, D_\mathcal{Y}} \mathcal{L}_{GAN}(G, D_\mathcal{Y}, X, Y) + \mathcal{L}_{GAN}(F, D_\mathcal{X}, X, Y) + \lambda \left(\mathbb{E}_X [||F(G(X)) - X||_1] + \mathbb{E}_Y [||G(F(Y)) - Y||_1] \right)
133 | {% endmath %}
134 | 
135 | Footnotes
136 | ==============
137 | [^1]: In this context, convex means a line joining any two points that lies above the function.
138 | [^2]: The function value at any point $$\mathbf{x}_0$$ is close to or greater than $$f(\mathbf{x}_0)$$.
139 | [^3]: This [book](http://web.stanford.edu/~boyd/cvxbook/) is an excellent resource to learn more about these topics.
140 | 


--------------------------------------------------------------------------------
/gan/index.tex:
--------------------------------------------------------------------------------
  1 | \section{Generative Adversarial Networks}
  2 | 
  3 | % We begin our study with the autoregressive generative models. As before, we assume we are given access to a dataset $\mathcal{D}$  of $n$-dimensional datapoints $\mathbf{x}$. For simplicity, we assume the datapoints are binary, i.e.,  $\mathbf{x} \in \{0,1\}^n$.
  4 | 
  5 | \section{Representation}
  6 | 
  7 | By the chain rule of probability, we can factorize the joint distribution over the $n$-dimensions as:
  8 | \[
  9 | \begin{equation}
 10 | p(\mathbf{x}) = \prod\limits_{i=1}^{n}p(x_i \vert x_1, x_2, \ldots, x_{i-1}) = \prod\limits_{i=1}^{n} p(x_i \vert \mathbf{x}_{<i})
 11 | \end{equation}
 12 | \label{eq:chain_rule}
 13 | \]
 14 | where $\mathbf{x}_{<i}=[x_1, x_2, \ldots, x_{i-1}]$ denotes the vector of random variables with index less than $i$. If we allow for every conditional $p(x_i \vert \mathbf{x}_{<i})$ to be specified in a tabular form, then such a representation is fully general and can represent any possible distribution over $n$ random variables. However, the space complexity for such a representation grows exponentially with $n$. 
 15 | 
 16 | To see why, let us consider the conditional for the last dimension, given by $p(x_n \vert \mathbf{x}_{<n})$. In order to fully specify this conditional, we need to specify a probability for $2^{n-1}$ configurations of the variables $x_1, x_2, \ldots, x_{n-1}$.  Since the probabilities should sum to 1, the total number of parameters for specifying this conditional is given by $2^{n-1} -1$. Hence, a tabular representation for the conditionals is impractical for learning the joint distribution in (\ref{eq:chain_rule}) . 
 17 | 
 18 | In an \textit{autoregressive generative model}, the conditionals are specified as parameterized functions with a fixed number of parameters. That is, we assume the conditional distributions $p(x_i \vert \mathbf{x}_{<i})$ to correspond to a Bernoulli random variable and learn a function that maps the preceeding random variables $x_1, x_2, \ldots, x_{i-1}$ to the mean of this distribution. Hence, we have:
 19 | \[
 20 | p_{\theta_i}(x_i \vert \mathbf{x}_{<i}) = \mathrm{Bern}(f_i(x_1, x_2, \ldots, x_{i-1}))
 21 | \]
 22 | where $\theta_i$ denotes the set of parameters used to specify the mean function $f_i: \{0,1\}^{i-1}\rightarrow [0,1]$.  The term \textit{autoregressive} originates from the literature on time-series models where observations from the previous time-steps are used to predict the value at the current time step. Here, we are predicting the distribution for the $i$-th random variable using the values of the preceeding random variables in the sequence $x_1, x_2, \ldots, x_n$.
 23 | 
 24 | The number of parameters of an autoregressive generative model are given by $\sum_{i=1}^n \vert \theta_i \vert$. As we shall see in the examples below, the number of parameters are much fewer than the tabular setting considered previously. Unlike the tabular setting however, an autoregressive generative model cannot represent all possible distributions. Its expressiveness is limited by the fact that we are limiting the conditional distributions to correspond to a Bernoulli random variable with a restricted class of parameterized functions specifying the mean.
 25 | 
 26 | In the simplest case, we can specify the function as a linear combination of the input elements followed by a sigmoid non-linearity (to restrict the output to lie between 0 and 1). This gives us the formulation of a \textit{fully-visible sigmoid belief network} (FVSBN):
 27 | \[
 28 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\alpha^{(i)}_0 + \alpha^{(i)}_1 x_1 + \ldots + \alpha^{(i)}_{i-1} x_{i-1})  
 29 | \]
 30 | where $\sigma$ denotes the sigmoid function and $\theta_i=\{\alpha^{(i)}_0,\alpha^{(i)}_1, \ldots, \alpha^{(i)}_{i-1}\}$ denote the parameters of the mean function. The conditional for variable $i$ requires $i$
 31 |  parameters, and hence the total number of parameters in the model is given by $\sum_{i=1}^ni= O(n^2)$.  Note that the number of parameters are much fewer than the exponential parameters required in the tabular case.
 32 | 
 33 | A natural way to increase the expressiveness of an autoregressive generative model is to use more flexible parameterizations for the mean function e.g., multi-layer perceptrons (MLP). In the case of 1-hidden layer neural networks, the mean function for variable $i$ can be expressed as:
 34 | \[
 35 | \mathbf{h}_i = \sigma(A_i \mathbf{x_{<i}} + \mathbf{c}_i)\\
 36 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 37 | \]
 38 | where $\mathbf{h}_i \in \mathbb{R}^d$ denotes the hidden layer activations for the MLP and$\theta_i = \{A_i \in \mathbb{R}^{d\times (i-1)},  \mathbf{c}_i \in \mathbb{R}^d, \boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d, b_i \in \mathbb{R}\}$ are the set of parameters for the mean function $\mu_i(\cdot)$.  The total number of parameters in this model is dominated by the matrices $A_i$ and given by $O(n^2 d)$. 
 39 | 
 40 | The Neural Autoregressive Density Estimation (NADE) provides an efficient MLP parameterization that shares parameters used for evaluating the hidden layer activations.
 41 | \[
 42 | \mathbf{h}_i = \sigma(W_{., <i} \mathbf{x_{<i}} + \mathbf{c})\\
 43 | f_i(x_1, x_2, \ldots, x_{i-1}) =\sigma(\boldsymbol{\alpha}^{(i)}\mathbf{h}_i +b_i )  
 44 | \]
 45 | where $\theta=\{W\in \mathbb{R}^{d\times n}, \mathbf{c} \in \mathbb{R}^d, \{\boldsymbol{\alpha}^{(i)}\in \mathbb{R}^d\}^n_{i=1}, \{b_i \in \mathbb{R}\}^n_{i=1}\}$is the full set of parameters for the mean functions $f_1(\cdot), f_2(\cdot), \ldots, f_n(\cdot)$. The weight matrix $W$ and the bias vector $\mathbf{c}$ are shared across the conditionals. Sharing parameters offers two benefits:
 46 | \begin{enumerate}
 47 | \item The total number of parameters from $O(n^2 d)$ to $O(nd)$ [readers are encouraged to check!].
 48 | \item The hidden unit activations can be evaluated in $O(nd)$ time via the following recursive strategy:
 49 | \[
 50 | \mathbf{h}_i = \sigma(\mathbf{a}_i)\\
 51 | \mathbf{a}_{i+1} = \mathbf{a}_{i} + W[., i]x_i
 52 | \]
 53 | with the base case given by $\mathbf{a}_1=\mathbf{c}$.
 54 | \end{enumerate}
 55 | 
 56 | 
 57 | \section{Learning and inference}
 58 | 
 59 | Recall that learning a generative model involves optimizing the closeness between the data and model distributions. One commonly used notion of closeness in the KL divergence between the data and the model distributions.
 60 | 
 61 | $$
 62 | \begin{align*}
 63 | \min_{\theta\in \mathcal{M}}d_{KL}(p_{\mathrm{data}}, p_{\theta}) &= \mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\mathrm{data}}(\mathbf{x}) - \log p_{\theta}(\mathbf{x})\right].
 64 | \end{align*}
 65 | $$
 66 | Before moving any further, we make two comments about the KL divergence. First, we note that the KL divergence between any two distributions is asymmetric. As we navigate through this chapter, the reader is encouraged to think what could go wrong if we decided to optimize the reverse KL divergence instead. Secondly, the KL divergences heavily penalizes model distribution $p_\theta$ which place little mass on any datapoint that has a non-zero probability under $p_{\mathrm{data}}$. In the extreme case, if the density $p_\theta(\mathbf{x})$ evaluates to zero for a datapoint sampled from $p_{\mathrm{data}}$, the objective evaluates to $+\infty$. 
 67 | 
 68 | Since $p_{\mathrm{data}}$ does not depend on $\theta$, we can equivalently recover the optimal parameters via maximizing likelihood estimation:
 69 | 
 70 | $$
 71 | \begin{align*}
 72 | \max_{\theta\in \mathcal{M}}\mathbb{E}_{\mathbf{x} \sim p_{\mathrm{data}} }\left[\log p_{\theta}(\mathbf{x})\right].
 73 | \end{align*}
 74 | $$
 75 | Here, $\log p_{\theta}(\mathbf{x})$ is referred to as the log-likelihood of the datapoint $\mathbf{x}$ with respect to the model distribution $p_\theta$. 
 76 | 
 77 | To approximate the expectation over the unknown $p_{\mathrm{data}}$, we make an assumption: points in the dataset $\mathcal{D}$ are sampled i.i.d. from $p_{\mathrm{data}}$. This allows us to obtain an unbiased Monte Carlo estimate of the objective:
 78 | 
 79 | $$
 80 | \begin{align}
 81 | \max_{\theta\in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\log p_{\theta}(\mathbf{x}) = \mathcal{L}(\theta \vert \mathcal{D}).
 82 | \end{align}
 83 | \label{eq:mle}
 84 | \tag{2}
 85 | $$
 86 | 
 87 | The maximum likelihood estimation (MLE) objective has an intuitive interpretation: pick the model parameters $\theta \in \mathcal{M}$ that maximize the log-probability of the observed datapoints in $\mathcal{D}$. 
 88 | 
 89 | In practice, we optimize the MLE objective using mini-batch gradient ascent. The algorithm operates in iterations. At every iteration $t$, we sample a mini-batch $\mathcal{B}_t$  of datapoints sampled randomly from the dataset ($\vert \mathcal{B}_t\vert < \vert \mathcal{D} \vert$) and compute gradients of the objective evaluated for the mini-batch. These parameters at iteration $t+1$ are then given via the following update rule:
 90 | \[
 91 | \theta^{(t+1)} = \theta^{(t)} + r_t \nabla_\theta\mathcal{L}(\theta^{(t)} \vert \mathcal{B}_t)
 92 | \]
 93 | where $\theta^{(t+1)}$ and $\theta^{(t)}$ are the parameters at iterations $t+1$ and $t$ respectively, and $r_t$ is the learning rate at iteration $t$.  Typically, we only specify the initial learning rate $r_1$ and update the rate based on a schedule.  [Variants](http://cs231n.github.io/optimization-1/) of stochastic gradient ascent, such as RMS prop and Adam, employ modified update rules that work slightly better in practice. 
 94 | 
 95 | From a practical standpoint, we must think about how to choose hyperaparameters (such as the initial learning rate) and a stopping criteria for the gradient descent. For both these questions, we follow the standard practice in machine learning of monitoring the objective on a validation dataset. Consequently, we choose the hyperparameters with the best performance on the validation dataset and stop updating the parameters when the validation log-likelihoods stop improving.
 96 | 
 97 | Now that we have a well-defined objective and optimization procedure, the only remaining task is to evaluate the objective in the context of an autoregressive generative model. To this end, we substitute the factorization of the joint distribution in Eq.$~\ref{eq:chain_rule}$
 98 | in the MLE objective in Eq.$~\ref{eq:mle}$ to get:
 99 | \[
100 | \max_{\theta \in \mathcal{M}}\frac{1}{\vert D \vert} \sum_{\mathbf{x} \in\mathcal{D} }\sum_{i=1}^n\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})
101 | \]where $\theta = \{\theta_1, \theta_2, \ldots, \theta_n\}$ now denotes the collective set of parameters for the conditionals.
102 | 
103 | Inference in an autoregressive model is straightforward. For density estimation of an arbitrary point $\mathbf{x}$, we simply evaluate the log-conditionals $\log p_{\theta_i}(x_i \vert \mathbf{x}_{<i})$ for each $i$ and add these up to obtain the log-likelihood assigned by the model to $\mathbf{x}$. Since we know conditioning vector $\mathbf{x}$, each of the conditionals can be evaluated in parallel. Hence, density estimation is efficient on modern hardware.
104 | 
105 | Sampling from an autoregressive model is a sequential procedure. Here, we first sample $x_1$, then we sample $x_2$ conditioned on the sampled $x_1$, followed by $x_3$ conditioned on both $x_1$ and $x_2$ and so on until we sample $x_n$ conditioned on the previously sampled $\mathbf{x}_{<n}$. For applications requiring real-time generation of high-dimensional data such as audio synthesis, the sequential sampling can be an expensive process.
106 | 
107 | 
108 | TODO: add NADE samples figure
109 | 
110 | Finally, an autoregressive model does not directly learn unsupervised representations of the data. In the next few set of lectures, we will look at latent variable models (e.g., variational autoencoders) which explicitly learn latent representations of the data.
111 | 
112 | 
113 | TODO: Autoregressive generative models based on Autoencoders, RNNs, and CNNs.
114 | MADE, Char-RNN, Pixel-CNN, Wavenet
115 | 


--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | layout: post
 3 | title: Contents
 4 | ---
 5 | These notes form a concise introductory course on deep generative models.
 6 | They are based on Stanford [CS236](https://deepgenerativemodels.github.io/), taught by [Stefano Ermon](http://cs.stanford.edu/~ermon/) and [Aditya Grover](http://aditya-grover.github.io/), and have been written by [Aditya Grover](http://aditya-grover.github.io/), with the [help](https://github.com/deepgenerativemodels/notes/commits/master) of many students and course staff.
 7 | {% marginnote 'mn-id-whatever' 'The notes are still **under construction**!
 8 | Since these notes are brand new, you will find several typos. If you do, please let us know, or submit a pull request with your fixes to our [Github repository](https://github.com/deepgenerativemodels/notes).'%}
 9 | You too may help make these notes better by submitting your improvements to us via [Github](https://github.com/deepgenerativemodels/notes).
10 | 
11 | 
12 | 1. [Introduction](introduction/)
13 | 
14 | 2. [Autoregressive Models](autoregressive/)
15 | 
16 | 3. [Variational Autoencoders](vae/) 
17 | 
18 | 4. [Normalizing Flow Models](flow/) 
19 | 
20 | 5. [Generative Adversarial Networks](gan/)
21 | 
22 | 


--------------------------------------------------------------------------------
/introduction/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | layout: post
  3 | title: Introduction
  4 | ---
  5 | 
  6 | Intelligent agents are constantly generating, acquiring, and processing
  7 | data. This data could be in the form of *images* that we capture on our
  8 | phones, *text* messages we share with our friends, *graphs* that model
  9 | interactions on social media, *videos* that record important events,
 10 | etc. Natural agents excel at discovering patterns, extracting
 11 | knowledge, and performing complex reasoning based on the data they observe. How
 12 | can we build artificial learning systems to do the same?
 13 | 
 14 | In this course, we will study generative models that view the world under the lens of probability.
 15 | In such a worldview, we can think of any kind of
 16 | observed data, say $$\mathcal{D}$$, as a finite set of samples from an
 17 | underlying distribution, say $$p_{\mathrm{data}}$$. At its very core, the
 18 | goal of any generative model is then to approximate this data
 19 | distribution given access to the dataset $$\mathcal{D}$$. The hope is that
 20 | if we are able to *learn* a good generative model, we can use the
 21 | learned model for downstream *inference*.
 22 | 
 23 | Learning
 24 | --------
 25 | 
 26 | We will be primarily interested in parametric approximations to the data
 27 | distribution, which summarize all the information about the dataset $$\mathcal{D}$$ in
 28 | a finite set of parameters. In contrast with non-parametric models,
 29 | parametric models scale more efficiently with large datasets but are
 30 | limited in the family of distributions they can represent.
 31 | 
 32 | In the parametric setting, we can think of the task of learning a
 33 | generative model as picking the parameters within a family of model
 34 | distributions that minimizes some notion of distance[^1] between the
 35 | model distribution and the data distribution.
 36 |  
 37 | <img src="learning_1.png" alt="drawing" width="200" class="center"/>
 38 | 
 39 | 
 40 | <img src="learning_2.png" alt="drawing" width="300" class="center"/>
 41 | <!-- ![given](learning_1.png =100x20)
 42 | ![goal](learning_2.png =100x20) -->
 43 | 
 44 | 
 45 | For instance, we might be given access to a dataset of dog images $$\mathcal{D}$$ and
 46 | our goal is to learn the parameters of  a generative model $$\theta$$ within a model family $$\mathcal{M}$$ such that
 47 | the model distribution $$p_\theta$$ is close to the data distribution over dogs
 48 | $$p_{\mathrm{data}}$$. Mathematically, we can specify our goal as the
 49 | following optimization problem: $$$$\begin{equation}
 50 | \min_{\theta\in \mathcal{M}}d(p_{\mathrm{data}}, p_{\theta})
 51 | \label{eq:learning_gm}
 52 | \tag{1}
 53 | \end{equation}$$$$where $$p_{\mathrm{data}}$$ is accessed via the dataset
 54 | $$\mathcal{D}$$ and $$d(\cdot)$$ is a notion of distance between probability distributions.
 55 | 
 56 | As we navigate through this course, it is interesting to take note of
 57 | the difficulty of the problem at hand. A typical image from a modern
 58 | phone camera has a resolution of approximately $$700 \times 1400$$ pixels.
 59 | Each pixel has three channels: R(ed), G(reen) and B(lue) and each
 60 | channel can take a value between 0 to 255. Hence, the number of possible
 61 | images is given by $$256^{700 \times 1400 \times 3}\approx 10 ^{800000}$$.
 62 | In contrast, ImageNet, one of the largest publicly available datasets,
 63 | consists of only about 15 million images. Hence, learning a generative
 64 | model with such a limited dataset is a highly underdetermined problem.
 65 | 
 66 | Fortunately, the real world is highly structured and automatically
 67 | discovering the underlying structure is key to learning generative
 68 | models. For example, we can hope to learn some basic artifacts about
 69 | dogs even with just a few images: two eyes, two ears, fur etc. Instead
 70 | of incorporating this prior knowledge explicitly, we will hope the model
 71 | learns the underlying structure directly from data. There is no free
 72 | lunch however, and indeed successful learning of generative models will
 73 | involve instantiating the optimization problem in
 74 | $$(\ref{eq:learning_gm})$$ in a suitable way. In this course, we will be
 75 | primarily interested in the following questions:
 76 | 
 77 | * What is the representation for the model family $$\mathcal{M}$$?
 78 | * What is the objective function $$d(\cdot)$$?
 79 | * What is the optimization procedure for minimizing $$d(\cdot)$$?
 80 | 
 81 | In the next few set of lectures, we will take a deeper dive into certain
 82 | families of generative models. For each model family, we will note how
 83 | the representation is closely tied with the choice of learning objective
 84 | and the optimization procedure.
 85 | 
 86 | Inference
 87 | ---------
 88 | 
 89 | For a discriminative model such as logistic regression, the fundamental
 90 | inference task is to predict a label for any given datapoint. Generative
 91 | models, on the other hand, learn a joint distribution over the entire
 92 | data.[^2]
 93 | 
 94 | While the range of applications to which generative models have been
 95 | used continue to grow, we can identify three fundamental inference
 96 | queries for evaluating a generative model.:
 97 | 
 98 | 1.  *Density estimation:* Given a datapoint $$\mathbf{x}$$, what is the
 99 |     probability assigned by the model, i.e., $$p_\theta(\mathbf{x})$$?
100 | 
101 | 2.  *Sampling:* How can we *generate* novel data from the model
102 |     distribution, i.e.,
103 |     $$\mathbf{x}_{\mathrm{new}} \sim p_\theta(\mathbf{x})$$?
104 | 
105 | 3.  *Unsupervised representation learning:* How can we learn meaningful
106 |     feature representations for a datapoint $$\mathbf{x}$$?
107 | 
108 | Going back to our example of learning a generative model over dog
109 | images, we can intuitively expect a good generative model to work as
110 | follows. For density estimation, we expect $$p_\theta(\mathbf{x})$$ to be
111 | high for dog images and low otherwise. Alluding to the name *generative
112 | model*, sampling involves generating novel images of dogs beyond the
113 | ones we observe in our dataset. Finally, representation learning can
114 | help discover high-level structure in the data such as the breed of
115 | dogs.
116 | 
117 | In light of the above inference tasks, we note two caveats. First,
118 | quantitative evaluation of generative models on these tasks is itself
119 | non-trivial (in particular, sampling and representation learning) and an
120 | area of active research. Some quantitative metrics exist, but these
121 | metrics often fail to reflect desirable qualitative attributes in the
122 | generated samples and the learned representations. Secondly, not all
123 | model families permit efficient and accurate inference on all these
124 | tasks. Indeed, the trade-offs in the inference capabilities of the
125 | current generative models have led to the development of very diverse approaches as
126 | we shall see in this course.
127 | 
128 | 
129 | ## Footnotes
130 | 
131 | [^1]: As we shall see later, functions that do not satisfy all
132 |     properties of a distance metric are also used in practice, e.g., KL
133 |     divergence.
134 | 
135 | [^2]: Technically, a probabilistic discriminative model is also a
136 |     generative model of the labels conditioned on the data. However, the
137 |     usage of the term generative models is typically reserved for high
138 |     dimensional data.
139 | 


--------------------------------------------------------------------------------
/introduction/learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/introduction/learning.png


--------------------------------------------------------------------------------
/introduction/learning_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/introduction/learning_1.png


--------------------------------------------------------------------------------
/introduction/learning_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/introduction/learning_2.png


--------------------------------------------------------------------------------
/vae/klgap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/vae/klgap.png


--------------------------------------------------------------------------------
/vae/vae.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepgenerativemodels/notes/bd2303339eaaea884870125b473cc1ae8c980d51/vae/vae.png


--------------------------------------------------------------------------------