├── Gemfile
├── assets
│   ├── css
│   │   └── style.scss
│   └── images
│       ├── bg_hr.png
│       ├── blacktocat.png
│       ├── icon_download.png
│       └── sprite_download.png
├── script
│   ├── bootstrap
│   ├── cibuild
│   └── release
├── thumbnail.png
├── .gitignore
├── README.md
├── images
│   ├── toward-controlled-generation-of-text
│   │   ├── architecture.png
│   │   └── learning-process.png
│   ├── style-transfer-in-text-exploration-and-evaluation
│   │   └── architecture.png
│   ├── semi-supervised-learning-with-deep-generative-models
│   │   └── architecture.png
│   ├── improved-neural-text-attribute-transfer-with-non-parallel-data
│   │   └── architecture.png
│   ├── unsupervised-machine-translation-using-monolingual-corpora-only
│   │   └── architecture.png
│   ├── sequence-to-better-sequence-continuous-revision-of-combinatorial-structures
│   │   └── model.png
│   ├── style-transfer-from-non-parallel-text-by-cross-alignment
│   │   └── cross-alignment-training.png
│   ├── domain-adaptation-meets-disentangled-representation-learning-and-style-transfer
│   │   └── architecture.png
│   └── natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations
│       └── architecture.png
├── _config.yml
├── create_review.py
├── jekyll-theme-slate.gemspec
├── _reviews
│   ├── causal-embeddings-for-recommendation.md
│   ├── a-neural-probabilistic-language-model.md
│   ├── disentangled-representations-for-manipulation-of-sentiment-in-text.md
│   ├── jtav-jointly-learning-social-media-content-representation-by-fusing-textual-acoustic-and-visual-features.md
│   ├── jade-joint-autoencoders-for-disentanglement.md
│   ├── generating-sentences-by-editing-prototypes.md
│   ├── improved-neural-text-attribute-transfer-with-non-parallel-data.md
│   ├── improved-techniques-for-training-gans.md
│   ├── style-transfer-through-back-translation.md
│   ├── sequence-to-better-sequence-continuous-revision-of-combinatorial-structures.md
│   ├── wasserstein-autoencoders.md
│   ├── controlling-linguistic-style-aspects-in-neural-language-generation.md
│   ├── word-translation-without-parallel-data.md
│   ├── controlling-politeness-in-neural-machine-translation-via-side-constraints.md
│   ├── adversarially-regularized-autoencoders.md
│   ├── supervised-learning-of-universal-sentence-representations-from-natural-language-inference-data.md
│   ├── natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations.md
│   ├── domain-adaptation-meets-disentangled-representation-learning-and-style-transfer.md
│   ├── semi-supervised-learning-with-deep-generative-models.md
│   ├── fader-networks-manipulating-images-by-sliding-attributes.md
│   ├── learning-to-generate-reviews-and-discovering-sentiment.md
│   ├── generating-sentences-from-a-continuous-space.md
│   ├── style-transfer-in-text-exploration-and-evaluation.md
│   ├── unsupervised-machine-translation-using-monolingual-corpora-only.md
│   ├── adversarial-learning-for-neural-dialogue-generation.md
│   ├── a-hierarchical-neural-autoencoder-for-paragraphs-and-documents.md
│   ├── improved-variational-autoencoders-for-text-modeling-using-dilated-convolutions.md
│   ├── multispace-variational-encoderdecoders-for-semisupervised-labeled-sequence-transduction.md
│   ├── toward-controlled-generation-of-text.md
│   ├── infovae-information-maximizing-variational-autoencoders.md
│   ├── beam-search-strategies-for-neural-machine-translation.md
│   ├── natural-language-processing-almost-from-scratch.md
│   ├── adversarial-generation-of-natural-language.md
│   ├── infogan-interpretable-representation-learning-by-information-maximizing-generative-adversarial-nets.md
│   └── style-transfer-from-non-parallel-text-by-cross-alignment.md
├── _layouts
│   └── default.html
├── _sass
│   ├── rouge-github.scss
│   └── jekyll-theme-slate.scss
├── LICENSE
└── index.md
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | gemspec
4 |
--------------------------------------------------------------------------------
/assets/css/style.scss:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 |
4 | @import "jekyll-theme-slate";
5 |
--------------------------------------------------------------------------------
/script/bootstrap:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 |
5 | gem install bundler
6 | bundle install
7 |
--------------------------------------------------------------------------------
/thumbnail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/thumbnail.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _site/
2 | .sass-cache/
3 | .jekyll-cache/
4 | .jekyll-metadata
5 | Gemfile.lock
6 | .vscode
7 |
--------------------------------------------------------------------------------
/assets/images/bg_hr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/assets/images/bg_hr.png
--------------------------------------------------------------------------------
/script/cibuild:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 |
5 | bundle exec jekyll build
6 | gem build jekyll-theme-slate.gemspec
7 |
--------------------------------------------------------------------------------
/assets/images/blacktocat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/assets/images/blacktocat.png
--------------------------------------------------------------------------------
/assets/images/icon_download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/assets/images/icon_download.png
--------------------------------------------------------------------------------
/assets/images/sprite_download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/assets/images/sprite_download.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Research Review Notes
2 |
3 | Access the website at the link below:
4 |
5 | https://vineetjohn.github.io/research-review-notes/
6 |
7 |
--------------------------------------------------------------------------------
/images/toward-controlled-generation-of-text/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/toward-controlled-generation-of-text/architecture.png
--------------------------------------------------------------------------------
/images/toward-controlled-generation-of-text/learning-process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/toward-controlled-generation-of-text/learning-process.png
--------------------------------------------------------------------------------
/images/style-transfer-in-text-exploration-and-evaluation/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/style-transfer-in-text-exploration-and-evaluation/architecture.png
--------------------------------------------------------------------------------
/images/semi-supervised-learning-with-deep-generative-models/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/semi-supervised-learning-with-deep-generative-models/architecture.png
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | title: Research Review Notes
2 | description: Summaries of academic research papers
3 | show_downloads: false
4 | google_analytics:
5 | theme: jekyll-theme-slate
6 | include: ['_reviews']
7 | github:
8 | is_project_page: True
9 |
--------------------------------------------------------------------------------
/images/improved-neural-text-attribute-transfer-with-non-parallel-data/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/improved-neural-text-attribute-transfer-with-non-parallel-data/architecture.png
--------------------------------------------------------------------------------
/images/unsupervised-machine-translation-using-monolingual-corpora-only/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/unsupervised-machine-translation-using-monolingual-corpora-only/architecture.png
--------------------------------------------------------------------------------
/images/sequence-to-better-sequence-continuous-revision-of-combinatorial-structures/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/sequence-to-better-sequence-continuous-revision-of-combinatorial-structures/model.png
--------------------------------------------------------------------------------
/images/style-transfer-from-non-parallel-text-by-cross-alignment/cross-alignment-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/style-transfer-from-non-parallel-text-by-cross-alignment/cross-alignment-training.png
--------------------------------------------------------------------------------
/images/domain-adaptation-meets-disentangled-representation-learning-and-style-transfer/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/domain-adaptation-meets-disentangled-representation-learning-and-style-transfer/architecture.png
--------------------------------------------------------------------------------
/images/natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vineetjohn/research-review-notes/HEAD/images/natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations/architecture.png
--------------------------------------------------------------------------------
/create_review.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import subprocess
4 |
5 | initial_layout = [
6 | "---",
7 | "layout: default",
8 | "permalink: 'reviews/{}.html'",
9 | "title: '{}'",
10 | "---\n",
11 | "# {}",
12 | "---\n",
13 | "## Idea\n\n",
14 | "## Method\n\n",
15 | "## Observations\n\n"
16 | ]
17 |
18 |
19 | def main():
20 | paper_name = sys.argv[1]
21 | formatted_name = subprocess.check_output(
22 | ["filename-formatter", paper_name]).decode("utf-8").strip()
23 | file_contents = "\n".join(initial_layout)
24 |
25 | with open("_reviews/{}.md".format(formatted_name), 'w') as f:
26 | f.write(file_contents.format(formatted_name, paper_name, paper_name))
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/jekyll-theme-slate.gemspec:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 |
3 | Gem::Specification.new do |s|
4 | s.name = "jekyll-theme-slate"
5 | s.version = "0.1.0"
6 | s.license = "CC0-1.0"
7 | s.authors = ["Jason Costello", "GitHub, Inc."]
8 | s.email = ["opensource+jekyll-theme-slate@github.com"]
9 | s.homepage = "https://github.com/pages-themes/slate"
10 | s.summary = "Slate is a Jekyll theme for GitHub Pages"
11 |
12 | s.files = `git ls-files -z`.split("\x0").select do |f|
13 | f.match(%r{^((_includes|_layouts|_sass|assets)/|(LICENSE|README)((\.(txt|md|markdown)|$)))}i)
14 | end
15 |
16 | s.platform = Gem::Platform::RUBY
17 | s.add_runtime_dependency "jekyll", "~> 3.5"
18 | s.add_runtime_dependency "jekyll-seo-tag", "~> 2.0"
19 | end
20 |
--------------------------------------------------------------------------------
/script/release:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Tag and push a release.
3 |
4 | set -e
5 |
6 | # Make sure we're in the project root.
7 |
8 | cd $(dirname "$0")/..
9 |
10 | # Make sure the darn thing works
11 |
12 | bundle update
13 |
14 | # Build a new gem archive.
15 |
16 | rm -rf jekyll-theme-slate-*.gem
17 | gem build -q jekyll-theme-slate.gemspec
18 |
19 | # Make sure we're on the master branch.
20 |
21 | (git branch | grep -q 'master') || {
22 | echo "Only release from the master branch."
23 | exit 1
24 | }
25 |
26 | # Figure out what version we're releasing.
27 |
28 | tag=v`ls jekyll-theme-slate-*.gem | sed 's/^jekyll-theme-slate-\(.*\)\.gem$/\1/'`
29 |
30 | # Make sure we haven't released this version before.
31 |
32 | git fetch -t origin
33 |
34 | (git tag -l | grep -q "$tag") && {
35 | echo "Whoops, there's already a '${tag}' tag."
36 | exit 1
37 | }
38 |
39 | # Tag it and bag it.
40 |
41 | gem push jekyll-theme-slate-*.gem && git tag "$tag" &&
42 | git push origin master && git push origin "$tag"
43 |
--------------------------------------------------------------------------------
/_reviews/causal-embeddings-for-recommendation.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/causal-embeddings-for-recommendation.html'
4 | title: 'Causal Embeddings for Recommendation'
5 | ---
6 |
7 | # Causal Embeddings for Recommendation
8 | ---
9 |
10 | ## Idea
11 | The authors argue that recommendation systems that optimize for time-spent are indirect optimization methods compared to approaches that just predict items based on a user's past. They learn a recommendation policy that tries to infer the desired outcome from organic user behavior.
12 |
13 | The idea presented is a riff on simple matrix factorization methods and uses the outcomes of randomized recommendations to create user and item representations.
14 |
15 | ## Background
16 | Previous approaches to item recommendations are broadly classified into 2 categories:
17 | 1. Item-to-item similarity systems, which learn embeddings for items and use a distance metric like cosine similarity to identify similar items
18 | 2. User-item sequence embeddings, which try to predict the next item that the user intends to purchase.
19 |
20 | ## Method
21 | * The authors attempt to jointly factorize the matrix of control observations and the matrix of treatment observations.
22 | * They utilize an algorithm they call 'CausE' to generate the recommendations.
23 |
24 | ## Observations
25 | * The proposed algorithm outperformed the baselines it was compared against on the MovieLens and Netflix datasets.
26 |
--------------------------------------------------------------------------------
/_reviews/a-neural-probabilistic-language-model.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/a-neural-probabilistic-language-model.html"
4 | title: 'A Neural Probabilistic Language Model'
5 | ---
6 |
7 | # A Neural Probabilistic Language Model
8 | ---
9 |
10 | ## Idea
11 |
12 | This is the seminal paper on neural language modeling that first proposed learning distributed representations of words. A clear distinction is made between predictions in a discrete vocabulary space and predictions in a continuous space, i.e. the curse of dimensionality. The proposed solution is to learn real-valued word feature vectors jointly with the probability function of word sequences in the corpus.
13 |
14 | ## Method
15 |
16 | * The probability function is expressed as the product of the conditional probabilities of each word given the words that precede it.
17 | * In contrast to future works like Word2Vec, this paper considers the next word's probability distribution to be conditioned only on a window of $n$ words that precede it.
18 | * The neural architecture used for the experiments comprises an embedding layer, a hidden $tanh$ layer and a $softmax$ output layer.
19 | * The system was benchmarked on the Brown and AP News corpora against n-gram models, with perplexity as the performance metric (lower is better).
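In symbols, the windowed factorization described in the first two bullets looks as follows (an illustrative restatement, not copied from the paper):

$$\hat{P}(w_1, \dots, w_T) = \prod_{t=1}^{T} \hat{P}(w_t \mid w_{t-n+1}, \dots, w_{t-1})$$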
20 |
21 | ## Observations
22 |
23 | The model had test perplexity gains of 24% on Brown and 8% on AP News compared to the state-of-the-art n-gram model (a smoothed trigram).
24 |
--------------------------------------------------------------------------------
/_reviews/disentangled-representations-for-manipulation-of-sentiment-in-text.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/disentangled-representations-for-manipulation-of-sentiment-in-text.html"
4 | title: 'Disentangled Representations for Manipulation of Sentiment in Text'
5 | ---
6 |
7 | # Disentangled Representations for Manipulation of Sentiment in Text
8 |
9 | ## Idea
10 |
11 | The main idea of the paper is to change the style (sentiment) of a body of text while retaining its content.
12 |
13 | ## Method
14 |
15 | * This system uses a CNN for text encoding and an RNN for decoding.
16 | * MMD (Maximum Mean Discrepancy) is a metric that measures the distance between the means of 2 different probability distributions in a kernel feature space. It is estimated in this paper using a Gaussian kernel (see the sketch after this list).
17 | * In the case of sentiment, 2 distinct probability distributions $P_{source}$ and $P_{target}$ are learned. The style transfer is achieved by traversing the manifold between these 2 distributions.
18 | * The initial training phase just uses an autoencoder-like setup to recreate the original sentences.
19 | * After the initial training, the CNN is trained to classify sentiment, thus, causing the distribution of the encoded sentences to diverge based on whether the sentence is positive or negative.
20 | * Decoding is done by conditioning the start of the sentence on a start-of-sentence token and the sentence's encoding. The sentence generation ends when an EOS (end-of-sentence) token is generated.
21 | * The traversal across the manifold between the positive and negative distributions is done to transfer the sentiment of text.
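As a rough illustration of the MMD estimate with a Gaussian kernel (a minimal NumPy sketch, not the authors' code; array shapes and the bandwidth `sigma` are assumptions):

```python
import numpy as np

def gaussian_kernel(x, y, sigma=1.0):
    # x: (n, d), y: (m, d) arrays of sentence encodings; returns the (n, m) kernel matrix.
    sq_dists = ((x[:, None, :] - y[None, :, :]) ** 2).sum(axis=-1)
    return np.exp(-sq_dists / (2 * sigma ** 2))

def mmd(x, y, sigma=1.0):
    # Biased estimate of MMD^2 between samples x ~ P_source and y ~ P_target.
    return (gaussian_kernel(x, x, sigma).mean()
            + gaussian_kernel(y, y, sigma).mean()
            - 2 * gaussian_kernel(x, y, sigma).mean())
```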
22 |
23 | ## Observations
24 |
25 | * It is not clear how the representations are disentangled. It seems like the sentence encoding itself encodes information about the sentiment and hence, the representations rely on the entanglement to generate the manifold which is traversed.
26 | * No quantitative evaluations were performed, citing the lack of suitable evaluation metrics.
27 |
--------------------------------------------------------------------------------
/_reviews/jtav-jointly-learning-social-media-content-representation-by-fusing-textual-acoustic-and-visual-features.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/jtav-jointly-learning-social-media-content-representation-by-fusing-textual-acoustic-and-visual-features.html'
4 | title: 'JTAV: Jointly Learning Social Media Content Representation by Fusing Textual, Acoustic, and Visual Features'
5 | ---
6 |
7 | # JTAV: Jointly Learning Social Media Content Representation by Fusing Textual, Acoustic, and Visual Features
8 | ---
9 |
10 | ## Idea
11 | The authors propose to learn the 'content' of social media, by fusing together 3 distinct content modes: textual, acoustic and visual.
12 | They claim that jointly learning features from all 3 of these outperforms systems that rely on single- or bi-modal learning.
13 | Their approach involves learning single-modal features and then fusing together the features learned from different modes.
14 |
15 | ## Background
16 | * Bi-modal learning has already been explored in previous work.
17 | * This comprises learning representations of each single-modal attribute independently.
18 | * These representations/embeddings can then be fused by learning a common embedding space.
19 |
20 | ## Method
21 | An attention-based network attBiGRU is used for text information, a DCRNN for acoustic information, and a fine-tuned general framework called DenseNet for visual features.
22 |
23 | Text information is divided into two parts:
24 | * Protagonist
25 | * Supporting Players
26 | where the supporting players attend to the representation of the protagonist.
27 | Here, the protagonist is simply the largest body of text in the instance of data (e.g. blog post, song) and the supporting players refer to the auxiliary pieces of text (e.g. comments, reviews).
28 |
29 | ## Observations
30 | * The addition of text and audio doesn't seem to give a huge boost over the performance already achieved by images alone (i.e. 0.58 vs. 0.62).
31 |
--------------------------------------------------------------------------------
/_reviews/jade-joint-autoencoders-for-disentanglement.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/jade-joint-autoencoders-for-disentanglement.html"
4 | title: 'JADE - Joint Autoencoders for Dis-Entanglement'
5 | ---
6 |
7 | # JADE: Joint Autoencoders for Dis-Entanglement
8 | ---
9 |
10 | ## Idea
11 | The authors motivate the idea of using an auxiliary dataset that has abundant labeled samples to supplement a dataset that has only a few labeled samples, as long as the two datasets have at least one factor of variation in common. This is done by disentangling the content representation from the style using a variational autoencoder.
12 |
13 | ## Method
14 | * The authors make the assumption that the distributions $X$, with limited data, and $Y$ with abundant data, have the generative priors $P(z_1, z_2)$ and $P(z_3, z_4)$ respectively.
15 | * Assuming that $z_2$ and $z_4$ are the content representations of each dataset respectively, we want to train a classifier to distinguish between the two, and train encoders for each dataset that encode $z_2$ and $z_4$ in such a way that they are indistinguishable from each other.
16 | * This forces the content to be stored specifically in $z_2$ and $z_4$. Now, since the set of $Y$ labeled examples is abundant, we have several $(z_4, label)$ pairs to train a new content classifier on, and since $z_2$ and $z_4$ are interchangeable, we can use this dataset to augment the scarce set of $(z_2, label)$ pairs.
17 |
18 | ## Observations
19 | * Even though the objective is to augment training in data-scarce regimes with abundant data from another dataset, there are assumptions about the other dataset, including the very presence of such a dataset and the requirement that there be at least one factor of variation in common.
20 | * It would have been interesting if the authors had done an exploratory evaluation on the pancreatic and breast cancer datasets as well, as motivated in the introduction, instead of evaluating only on MNIST and SVHN.
21 |
22 |
--------------------------------------------------------------------------------
/_reviews/generating-sentences-by-editing-prototypes.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/generating-sentences-by-editing-prototypes.html'
4 | title: 'Generating Sentences by Editing Prototypes'
5 | ---
6 |
7 | # Generating Sentences by Editing Prototypes
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors propose a prototype-then-edit model that involves modifying a source sentence by attending to a random prototype from the source sequences and conditioning generation on a randomly sampled edit vector.
13 |
14 | ## Method
15 |
16 | * The formulation described in the paper stems from the observation that sentences in a large corpus can be viewed as minor transformations of the other sentences in the corpus.
17 | * The method involves sampling a random prototype from the training set to condition on. To avoid an intractable sum over all the possible sentences in the training set (each of which has a uniform probability of being picked), only prototypes $x'$ in the neighbourhood of the final output $x$ are considered, following the rationale that only a subset of the total prototypes could possibly be edited into $x$.
18 | * The neighbourhood of candidate prototypes is computed using a word-level distance metric, the Jaccard distance (see the sketch after this list).
19 | * The prototype vector is attended to while decoding.
20 | * The other component of the latent space is the random vector that is analogous to $z$ in a VAE setup. Unlike most applications of VAE, the sampling is done from a prior that is not a simple Gaussian distribution.
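A minimal sketch of the word-level Jaccard distance used to define the prototype neighbourhood (whitespace tokenization here is an assumption for illustration):

```python
def jaccard_distance(sentence_a, sentence_b):
    # 1 - |intersection| / |union| of the two sentences' word sets.
    words_a, words_b = set(sentence_a.split()), set(sentence_b.split())
    union = words_a | words_b
    if not union:
        return 0.0
    return 1.0 - len(words_a & words_b) / len(union)
```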
21 |
22 | ### Experiments
23 | * A vocab size of $10K$ was used and the named-entities (proper nouns) were replaced by their base common nouns.
24 | * The corpora used for the experiments were the Yelp reviews corpus and the BillionWord corpus.
25 | * The encoder uses a 3-layer bidirectional LSTM and the decoder uses a 3-layer LSTM with attention.
26 |
27 | ## Observations
28 | * The perplexity results outperform a vanilla neural language model and a 5-gram Kneser-Ney language model used in an ensemble on the Yelp corpus.
29 | * Qualitative results indicate that multiple edits performed on a single sentence prevent degeneration into generic sentences.
30 |
--------------------------------------------------------------------------------
/_reviews/improved-neural-text-attribute-transfer-with-non-parallel-data.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/improved-neural-text-attribute-transfer-with-non-parallel-data.html"
4 | title: 'Improved Neural Text Attribute Transfer with Non-parallel Data'
5 | ---
6 |
7 | # Improved Neural Text Attribute Transfer with Non-parallel Data
8 | ---
9 |
10 | ## Idea
11 | The authors attempt the disentanglement of sentiment and content in text, using a cycle loss and a collaborative classifier.
12 |
13 |
14 | ## Method
15 | * Single encoder, decoder and classifier used.
16 | * There is no adversarial discriminator used. Only a collaborative classifier is needed for this method.
17 | * The encoder-decoder model (not a VAE) also uses attention to align the encoder and decoder outputs.
18 | * The model works only for single attribute transfer.
19 | * Decoding is done using a sequence of hidden states as opposed to a single latent vector as most other methods use.
20 | * As shown in the architecture below, a sample sentence $x_i$ with label $i$ is transferred to $x_j$ with label $j$, and then transferred back to $x_i$, using the cycle loss from the [CycleGAN paper](https://arxiv.org/abs/1703.10593) (see the sketch after this list). This is supposed to help with content preservation.
21 | * Another content-preservation regularization is to leave the nouns in the original sentence unchanged when transferring.
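A schematic of the cycle-loss idea as I understand it; `encode`, `decode` and `distance` are placeholders for the paper's components, not the authors' implementation:

```python
def cycle_loss(x_i, label_i, label_j, encode, decode, distance):
    # Transfer x_i from attribute i to attribute j, then back to i,
    # and penalize how far the round trip drifts from the original.
    x_j = decode(encode(x_i), label_j)          # forward transfer
    x_i_cycled = decode(encode(x_j), label_i)   # back-transfer
    return distance(x_i, x_i_cycled)
```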
22 |
23 | ### Model Architecture
24 |
25 | 
26 |
27 | ### Experiments
28 | * The Yelp and Amazon review data sets were used for the evaluation, with maximum review lengths of 17 and 7 respectively.
29 | * The method seems to outperform the model in the [Cross-Alignment paper](https://arxiv.org/abs/1705.09655) in sentiment transfer and content preservation, but the perplexity scores are much worse.
30 |
31 |
32 | ## Observations
33 | * Doing an ablation experiment on the attention mechanism might have been interesting, because although attention has been very successful for translation and dialogue tasks, if the structure of the sentence does not change much from encoder to decoder, the benefit of attention might not be substantial.
34 |
--------------------------------------------------------------------------------
/_reviews/improved-techniques-for-training-gans.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/improved-techniques-for-training-gans.html'
4 | title: 'Improved Techniques for Training GANs'
5 | ---
6 |
7 | # Improved Techniques for Training GANs
8 | ---
9 |
10 | ## Idea
11 |
12 | GANs are trained in a game-theoretic setting where both the discriminator and the generator are trained to minimize their individual losses, thereby potentially impacting the loss of the other player and leading to training collapse.
13 |
14 | This paper tries to list the techniques/tricks that can be used to stabilize and improve GAN training.
15 |
16 |
17 | ## Method
18 |
19 | The paper describes several different methods of improving the convergence of GANs:
20 | * **Feature Matching:** Instead of directly training the generator to maximize the discriminator's loss on its output, train it to match the expected feature values on an intermediate layer of the discriminator, encouraging the generated distribution to stay close to the real one.
21 | * **Minibatch Discrimination:** The collapse of GANs can be avoided to a better extent by having the discriminator look at multiple examples simultaneously as opposed to training it with each data-point in isolation.
22 | * **Historical Averaging:** This is adding a loss term that computes the divergence of the current model parameters from the average model parameters from the previous $t$ epochs.
23 | * **One-sided Label Smoothing:** Instead of using the discrete targets 0 and 1 in a classification problem, label smoothing sets the targets to 0.1 and 0.9, which reduces the vulnerability of neural networks to adversarial samples. In one-sided label smoothing, only the positive labels are smoothed (see the sketch after this list).
24 | * **Virtual batch normalization:** To avoid adding batch-bias by performing batch-norm over every batch, a preset batch is chosen as a reference batch and each data-point is normalized with respect to that.
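A minimal sketch of the one-sided variant (the 0.9 value follows the bullet above; the array-based target construction is purely illustrative):

```python
import numpy as np

def discriminator_targets(n_real, n_fake, positive_target=0.9):
    # One-sided label smoothing: soften only the positive (real) targets;
    # the fake targets stay at 0.
    real_targets = np.full(n_real, positive_target)
    fake_targets = np.zeros(n_fake)
    return real_targets, fake_targets
```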
25 |
26 | The authors also propose using a pre-trained model (e.g. the Inception model) and measuring the entropy of its predictions as a proxy for how confident the model is about the generated examples, which gives an estimate of image quality.
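This metric is what is now commonly called the Inception score; in its usual form (stated here from general knowledge, not quoted from the paper):

$$\mathrm{IS} = \exp\Big(\mathbb{E}_{x \sim p_g}\, D_{KL}\big(p(y \mid x)\,\|\,p(y)\big)\Big)$$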
27 |
--------------------------------------------------------------------------------
/_reviews/style-transfer-through-back-translation.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/style-transfer-through-back-translation.html'
4 | title: 'Style Transfer Through Back-Translation'
5 | ---
6 |
7 | # Style Transfer Through Back-Translation
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors state that learning a language translation model better preserves the meaning of a sentence and reduces its stylistic properties, allowing for adversarial generation of text in a different style from the original. They also claim to have improved the automatic evaluation of style transfer and the manual evaluation of content preservation.
13 |
14 | ## Method
15 |
16 | * The approach is based on [previous work](https://arxiv.org/abs/1610.05461), which concludes that machine translation strips sentences of their stylistic content; hence, translating to another language and back is used as a preliminary step before conditioning generation on a new style. They use author style as the framework for evaluation.
17 | * The authors use a multi-decoder model i.e. one model per style, similar to previous work done for text style transfer.
18 | * The outputs of the decoders are sent to a classifier, with the objective of checking if the generated sentence has the attributes that the decoder needs to include while producing it. This is an additional training signal to guide the decoder.
19 | * The model loss is a linear combination of the reconstruction and classifier losses.
20 | * An attention vector is also used for alignment between the original sentence and the style transferred decoded sentence.
21 |
22 |
23 | ## Observations
24 |
25 | * Content preservation was evaluated manually. Although style transfer is straightforward to evaluate automatically, content preservation seems more difficult to find an automated metric for.
26 | * One of the main motivations given for style transfer in text is to alleviate bias by synthesizing training data. However, if that was the objective, wouldn't it be better to simply train the model with the bias such that it isn't able to differentiate between two distributions of the same content with different style distributions?
27 | * This model seems to have performed better for longer sentences, and also outperformed the cross-alignment paper for style transfer on political slant and sentiment.
28 |
--------------------------------------------------------------------------------
/_reviews/sequence-to-better-sequence-continuous-revision-of-combinatorial-structures.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/sequence-to-better-sequence-continuous-revision-of-combinatorial-structures.html"
4 | title: Sequence to Better Sequence - Continuous Revision of Combinatorial Structures
5 | ---
6 |
7 | # Sequence to Better Sequence: Continuous Revision of Combinatorial Structures
8 | ---
9 |
10 | ## Idea
11 | The authors present a recurrent network based variational auto-encoder that learns to transform sequences to maximize a given objective.
12 |
13 | ## Method
14 | * The objective is to revise the decoded sentence to obtain a maximum value of the outcome variable, while still preserving the semantics of the original sentence. This is done using unaligned corpora.
15 | * Instead of needing to solve a combinatorial space problem of re-arranging the words of the sentence, the improved sentence generation is approximated by shifting the latent representation of the sentences, as parameterized by neural networks.
16 | * The model comprises the original distribution $x$, the latent distribution $z$, and the outcome to be maximized $y$. Based on this, the model needs to come up with a corresponding $x^{\star}$ which maximizes the outcome.
17 | * The mapping of $x$ to $z$ and back to $x$ is parameterized by a VAE that is trained on reconstruction loss, as well as a KL-divergence loss to constrain the generative prior to a Gaussian distribution. Both the encoder and decoder are RNN networks.
18 | * The outcome $y$ is predicted from $z$ using a feed-forward network, and trained using a mean-squared loss.
19 | * To enforce the invariance of the outcome predictor to already improved sequences, a cyclical loss is applied to ensure that $F(z)$ and $F(E(D(z)))$ are exactly the same, where $F$ is the outcome-predicting feed-forward network, $E$ is the encoder and $D$ is the decoder.
20 | * During every training iteration, the latent representation $z$ is optimized to maximize the outcome $y$ (a schematic sketch follows this list).
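A schematic of the latent-space revision step; the gradient callable, step size and number of iterations are assumptions for illustration, not the paper's settings:

```python
def revise_latent(z, outcome_gradient, step_size=0.1, n_steps=20):
    # Gradient ascent on the outcome predictor F in latent space:
    # outcome_gradient(z) returns dF/dz for the feed-forward outcome network F.
    for _ in range(n_steps):
        z = z + step_size * outcome_gradient(z)
    return z
```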
21 |
22 | ### Experiments
23 | The authors evaluate this model on 2 tasks: revising sentence positivity, and modifying modern text into a Shakespearean style.
24 |
25 | ### Model
26 | 
27 |
--------------------------------------------------------------------------------
/_reviews/wasserstein-autoencoders.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/wasserstein-autoencoders.html'
4 | title: 'Wasserstein Autoencoders'
5 | ---
6 |
7 | # Wasserstein Autoencoders
8 | ---
9 |
10 | ## Idea
11 | Wasserstein Autoencoders (WAE) are proposed as an alternative to Variational Autoencoders (VAE) as a method of getting the encoded data distribution to match the prior, which can be used for generative modeling. The main idea is to use the Wasserstein distance metric to penalize the distance between the encoded distribution and the prior, as opposed to the KL-divergence term used in VAEs.
12 |
13 | ## Background
14 | * Optimal Transport (OT) cost is a way of measuring the distance between probability distributions.
15 | * f-divergences like the KL-divergence are apparently a stronger notion of distance and often max out, providing no useful gradients, something that OT costs are supposed to address with their weaker topology.
16 | * Classic f-divergences include the Kullback-Leibler divergence (KL-divergence) and the Jensen-Shannon divergence.
17 |
18 | ## Method
19 | * The authors propose 2 separate regularizers, (1) the Wasserstein adversarial loss, and (2) a maximum mean discrepancy regularization, which is known to work well to match standard normal distributions in high dimensions. These approaches are called WAE-GAN and WAE-MMD respectively.
20 | * The WAE-MMD model eliminates adversarial training entirely and optimizes a min-min objective. A positive definite kernel is required to compute pairwise distances between the $n$ samples of the prior and encoded distributions.
21 | * The WAE-GAN model optimizes the same adversarial objective as the regular GAN, except that the objective is defined on the latent space instead of the actual input/output space. The divergence is calculated by $D_{JS}$ which is the Jensen-Shannon divergence. This is evaluated empirically by averaging the divergence of $n$ samples.
22 | * The authors explain the difference between VAEs and WAEs as being the absence of the mutual information term $\mathbb{I}_Q(X, Z)$ in WAEs.
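For reference, the penalized objective has the general form below (reconstructed from my memory of the WAE formulation, so the notation may differ slightly from the paper): a reconstruction cost $c$ plus a latent divergence $\mathcal{D}_Z$ between the aggregate posterior $Q_Z$ and the prior $P_Z$, weighted by $\lambda$; WAE-GAN and WAE-MMD differ only in the choice of $\mathcal{D}_Z$.

$$D_{WAE}(P_X, P_G) = \inf_{Q(Z \mid X)} \; \mathbb{E}_{P_X} \mathbb{E}_{Q(Z \mid X)}\big[c(X, G(Z))\big] + \lambda \, \mathcal{D}_Z(Q_Z, P_Z)$$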
23 |
24 | ## Observations
25 | * The WAE-GAN algorithm seems faster as it only needs to do $O(n)$ computations, as opposed to WAE-MMD, which requires $O(n^2)$.
26 | * "Typically Z has no more than 100 dimensions and $P_Z$ is Gaussian"
27 | * WAE-GAN seems to have performed better on the quantitative image quality metrics
28 |
--------------------------------------------------------------------------------
/_reviews/controlling-linguistic-style-aspects-in-neural-language-generation.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/controlling-linguistic-style-aspects-in-neural-language-generation.html"
4 | title: 'Controlling Linguistic Style Aspects in Neural Language Generation'
5 | ---
6 |
7 | # Controlling Linguistic Style Aspects in Neural Language Generation
8 |
9 | ## Idea
10 |
11 | The authors control multiple aspects of the text, i.e. content and style,
12 | via attribute vectors and generate modified bodies of text using a
13 | conditioned neural language model.
14 |
15 | ## Background
16 |
17 | As opposed to the method used in [Toward Controlled Generation of Text](https://arxiv.org/abs/1703.00955), this one doesn't use variational auto-encoders.
18 |
19 | ## Method
20 |
21 | - The authors rely on a conditioned LSTM-based neural language model
22 | to generate sentences with modified content and style. A movie
23 | review dataset from Rotten Tomatoes is used for the task.
24 |
25 | - Multiple style and content attributes can be manipulated and used
26 | for training/generation at the same time i.e. combinations of the
27 | attributes can also be used.
28 |
29 | - The labels attributed to the training set are known entities.
30 |
31 | - Simple rules, heuristics and lexicons are used to derive some of the
32 | labels that the training data is annotated with.
33 |
34 | - The system of multi-label conditioning is compared to dedicated
35 | language models for each attribute.
36 |
37 | - A few of the metrics like sentiment and professional reviews are
38 | evaluated using human annotations.
39 |
40 | ## Observations
41 |
42 | - The dedicated language models work better than the single
43 | conditioned language models but they don't scale to multiple
44 | attributes, which the authors cite as a desirable property to have.
45 |
46 | - The authors state that sentiment was difficult to model since the
47 | perplexity of sentences conditioned on sentiment is lower compared
48 | to sentences conditioned on other attributes. This could also be
49 | attributed to the fact that it takes fewer changes to model
50 | sentiment than it does to model the other attributes.
51 |
52 | - Although the authors use manual annotation to label the sentiment of
53 | the generated sentences, a good generalizable sentiment analysis
54 | classifier might have been quicker.
55 |
--------------------------------------------------------------------------------
/_reviews/word-translation-without-parallel-data.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/word-translation-without-parallel-data.html'
4 | title: 'Word Translation Without Parallel Data'
5 | ---
6 |
7 | # Word Translation Without Parallel Data
8 | ---
9 |
10 | ## Idea
11 | The main idea of the paper is to separately learn word embeddings for 2 different languages in an unsupervised fashion, using a method like Word2Vec, and then align the embedding spaces of the 2 languages using adversarial training. This allows similar words in each language to be mapped to roughly the same point on the manifold. This manifold can then be queried using one language's words, and the closest word of the other language in the embedding space should be the translated word. In this way, the authors propose building an unsupervised bilingual dictionary.
12 |
13 | ## Method
14 | * The Procrustes method uses aligned words as anchor points and tries to minimize an energy function that corresponds to a spring system between the anchor points.
15 | * The authors come up with a custom distance metric (CSLS) that uses the mean cosine similarity of a word to its K nearest words in the other language to mitigate the hubness problem of embeddings, i.e. some words are the nearest neighbours of many points while others are not the nearest neighbour of any point (see the sketch after this list).
16 | * The model enforces an additional constraint to ensure that the linear transformation from the source language embedding manifold to the target language embedding manifold is an orthogonal matrix.
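A rough sketch of the CSLS criterion as described above (array names and the choice of `k` are mine, not the paper's):

```python
import numpy as np

def csls(src_emb, tgt_emb, k=10):
    # src_emb: (n_src, d), tgt_emb: (n_tgt, d); rows assumed L2-normalized,
    # so dot products are cosine similarities.
    cos = src_emb @ tgt_emb.T
    # Mean similarity of each source word to its k nearest target words,
    # and of each target word to its k nearest source words.
    r_src = np.sort(cos, axis=1)[:, -k:].mean(axis=1)
    r_tgt = np.sort(cos, axis=0)[-k:, :].mean(axis=0)
    # Higher CSLS score means a better translation candidate.
    return 2 * cos - r_src[:, None] - r_tgt[None, :]
```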
17 |
18 | ### Experiments
19 | * FastText pre-trained vectors were used to map the words into the embedding space.
20 | * The authors also evaluate this method on a low-resource language pair, i.e. English-Esperanto, and achieve results comparable to supervised methods.
21 |
22 | ## Observations
23 | * This method, albeit unsupervised, outperformed several supervised methods of generating bilingual lexicons, and should probably be considered a stepping stone towards non-parallel neural machine translation capabilities.
24 | * The authors don't elaborate on how exactly they extract word-translation pairs for evaluation while accounting for polysemy.
25 | * The adversarial objective to align the 2 monolingual embedding spaces seems to be adept at learning cross-lingual embeddings without parallel data, and the Procrustes refinement approach ensures high-quality word mappings.
26 |
27 |
--------------------------------------------------------------------------------
/_reviews/controlling-politeness-in-neural-machine-translation-via-side-constraints.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/controlling-politeness-in-neural-machine-translation-via-side-constraints.html"
4 | title: Controlling Politeness in Neural Machine Translation via Side Constraints
5 | ---
6 |
7 | # Controlling Politeness in Neural Machine Translation via Side Constraints
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors intend to control politeness while translating one language
13 | to another by using honorifics. The politeness is not grammatically
14 | implied in the source text, but should be predicted in the target text.
15 |
16 | The term 'side constraints' in this paper is just another way of saying
17 | attributes that the generated text is conditioned on.
18 |
19 | ## Background
20 |
21 | The method relies on the [attention model](https://arxiv.org/abs/1409.0473) for
22 | neural translation. Each word $y_i$ is predicted based on 3 variables:
23 | the recurrent state $s_i$, the previously predicted word $y_{i - 1}$ and
24 | the context vector $c_i$.
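Written out in the standard attention-model notation (which may differ cosmetically from the paper), this is:

$$p(y_i \mid y_1, \dots, y_{i-1}, x) = g(y_{i-1}, s_i, c_i), \qquad c_i = \sum_j \alpha_{ij} h_j$$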
25 |
26 | The model relies on a GRU-backed bi-directional sequence autoencoder and
27 | the hidden states from both directions are combined to form the
28 | annotation vector. The weight of each of these annotation vectors is
29 | computed via a single layer feed-forward neural network. The weights are
30 | computed through an alignment model $\alpha_{ij}$, which models the
31 | probability that $x_j$ is aligned with $y_i$.
32 |
33 | A beam search decoder is used for the generation phase.
34 |
35 | ## Method
36 |
37 | - The authors use a classifier first to automatically annotate the
38 | target language sentences as either formal or informal.
39 |
40 | - The data used was movie dialogues from OpenSubtitles
41 |
42 | - To avoid over-dependence on side-constraints and to learn to ignore
43 | side-constraints when they're irrelevant, the authors use a
44 | controlling coefficient for how many neutral sentences are marked
45 | with a politeness feature. They also control how many labelled
46 | training instances are marked. Both coefficients are 0.5 in their
47 | experiments.
48 |
49 | - The entire system is parameterized by an attentional encoder-decoder
50 | NMT model.
51 |
52 | ## Observations
53 |
54 | Side constraints can be applied to other phenomena too, given that we
55 | already possess a classifier that can be used to label the training set
56 | without manual annotations.
57 |
58 |
--------------------------------------------------------------------------------
/_reviews/adversarially-regularized-autoencoders.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/adversarially-regularized-autoencoders.html"
4 | title: 'Adversarially Regularized Autoencoders'
5 | ---
6 |
7 | # Adversarially Regularized Autoencoders
8 | ---
9 |
10 | ## Idea
11 | The authors attempt to learn disentangled representations for text using adversarially regularized autoencoders.
12 |
13 |
14 | ## Method
15 | * The ARAE proposed in this paper is a direct enhancement to a VAE.
16 | * The adversarial discriminator/critic in this model provides a regularization that causes the generator to enforce a low Wasserstein distance metric between the true samples and the generated samples.
17 | * Similar to several other papers, the generator is conditioned to reconstruct the unaligned text samples. It's denoted by $p_{\Psi}(x\|y,c)$, where $c$ is the content embedding and $y$ is the label.
18 |
19 | ### Experiments
20 | * They emulate the experimental setup of [Style Transfer from Non-Parallel Text by Cross-Alignment](https://arxiv.org/abs/1705.09655) by using the Yelp positive/negative reviews corpus.
21 | * 2 decoders are used, 1 each for positive and negative review decoding.
22 | * 4 metrics are automatically tested, namely:
23 | * Transfer strength using a pre-trained classifier (FastText, which isn't actually a classifier, but an embedding method)
24 | * BLEU score
25 | * Perplexity and Reverse Perplexity, both of which are assessed using an RNN model. (This is just $2^{\mathcal{L}_{rec}}$)
26 | * Human evaluations were also conducted to assess Transfer, Similarity and Naturalness.
27 | * Similarly the authors also evaluate topic transfer, which tries to convert snippets of text from Politics, Science and Music to each other.
28 |
29 |
30 | ## Observations
31 | * The model seems to outperform the model presented in the 'Style Transfer from Non-Parallel Text by Cross-Alignment' paper on both Transfer Strength and Content Preservation (as evaluated by BLEU scores)
32 | * The authors do not state which BLEU score they use.
33 | * The only novel offering this paper seems to provide by way of method is the usage of the Wasserstein-1 distance metric as a measure of distribution proximity as opposed to the Jensen-Shannon distance used in GANs.
34 | * The authors state multiple times that they're operating on a discrete space and this makes it difficult to estimate a sequence loss, but this estimation is usually done using a softmax approximation for the output space and an item-wise cross-entropy loss. Discrete autoencoders already exist.
35 |
--------------------------------------------------------------------------------
/_reviews/supervised-learning-of-universal-sentence-representations-from-natural-language-inference-data.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: 'reviews/supervised-learning-of-universal-sentence-representations-from-natural-language-inference-data.html'
4 | title: 'Supervised Learning of Universal Sentence Representations from Natural Language Inference Data'
5 | ---
6 |
7 | # Supervised Learning of Universal Sentence Representations from Natural Language Inference Data
8 |
9 | ---
10 |
11 | ## Idea
12 |
13 | * The authors state the need for general purpose sentence embeddings similar to how word embeddings are used in several NLP tasks today.
14 | * They train their model on the Stanford Natural Language Inference (SNLI) corpus.
15 | * The authors claim an improvement over the [SkipThought method](https://arxiv.org/abs/1506.06726) in learning generalized sentence embeddings that can be used for downstream tasks.
16 |
17 |
18 | ## Method
19 |
20 | * The inspiration to train on a supervised task is taken from the fact that models trained on ImageNet, despite being trained primarily for object recognition, learn features that can be used for general computer vision tasks.
21 | * After obtaining separate representations of the premise and hypothesis (2 sentences), the combined representation is a concatenation of the following (see the sketch after this list):
22 | * Individual sentence representations
23 | * Element-wise product of the two representations
24 | * Element-wise subtraction of the two representations
25 | * They try out the following architectural choices:
26 | * LSTM and GRU recurrent encoders
27 | * Only last hidden state used
28 | * BiLSTM network with mean/max pooling
29 | * All hidden states used
30 | * Self-attentive network
31 | * Hierarchical convolutional network
32 | * The model is evaluated on a variety of tasks, including binary & multi-label classification, entailment, semantic relatedness, unsupervised textual similarity, paraphrase detection, image & caption retrieval.
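A minimal sketch of the combined premise/hypothesis representation described above (vector-level illustration only; the function name is mine):

```python
import numpy as np

def combine_premise_hypothesis(u, v):
    # u, v: encodings of the premise and hypothesis sentences.
    # Concatenate the two encodings, their element-wise product,
    # and their element-wise difference.
    return np.concatenate([u, v, u * v, u - v])
```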
33 |
34 |
35 | ## Observations
36 |
37 | * This seems to be a very empirical paper. The main question being answered is which neural network architecture, task, and dataset yield generalizable sentence embeddings.
38 | * The BiLSTM with 4096 units and max pooling works best for the dev set, compared with other approaches.
39 | * The authors observe that model convergence is quicker on Adam, but the results aren't as good.
40 | * Combining the data from SNLI + MultiNLI improved the reported results.
41 | * This approach outperformed SkipThought vectors on several benchmarks while being easier to train.
42 |
43 |
--------------------------------------------------------------------------------
/_reviews/natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations.html"
4 | title: Natural Language Multitasking Analyzing and Improving Syntactic Saliency of Hidden Representations
5 | ---
6 |
7 | # Natural Language Multitasking: Analyzing and Improving Syntactic Saliency of Hidden Representations
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors claim that training an autoencoded representation on multiple linguistic tasks improves the preservation of syntax in the representation.
13 |
14 | The authors also claim that this leads to the ability to perform Word2Vec-style arithmetic that causes syntactic modifications to sentences.
15 |
16 |
17 | ## Method
18 |
19 | * The main aspect of the method described is to use multiple decoders of the latent representation, one per task, which seems to cause the latent space to become more expressive in terms of capturing syntactic structures of the language.
20 | * The authors make it clear that they're focusing on the learned representation and not the objectives that are being used to train the representation.
21 | * The tasks being trained on are:
22 | * Reconstruction
23 | * POS-tagging
24 | * EN $\rightarrow$ FR translation
25 | * EN $\rightarrow$ DE translation
26 | * LSTM RNNs are used for both the encoder and the decoders. A character-based RNN is used.
27 | * The authors come up with 14 popular sentence templates that they try to cluster the sentences on, based on their latent representations.
28 | * The loss metric of this clustering is the absolute number of sentences that were clustered incorrectly into one of the other 13 clusters.
29 | * The clustering error drops with the inclusion of multiple tasks that the hidden representation needs to generalize to.
30 |
31 |
32 | ### Architecture
33 |
34 | 
35 |
36 |
37 | ## Observations
38 |
39 | * The assumption is made that since the task-specific multi-decoder models receive more training input, they should need fewer training examples compared to the main model.
40 | * The authors attempt interpolated generation from the space between 2 sentences' latent representations, but the results aren't great. This could be attributed mainly to their usage of a character based language model.
41 | * Very few of the arithmetic operations on the representation spaces actually work, and when they do, the sentences are very short ($<5$ words)
42 |
--------------------------------------------------------------------------------
/_reviews/domain-adaptation-meets-disentangled-representation-learning-and-style-transfer.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/domain-adaptation-meets-disentangled-representation-learning-and-style-transfer.html"
4 | title: 'Domain Adaptation Meets Disentangled Representation Learning and Style Transfer'
5 | ---
6 |
7 | # Domain Adaptation Meets Disentangled Representation Learning and Style Transfer
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors intend to learn both common and specific representations of
13 | distinct handwriting styles, and then:
14 | - Use the common styles for domain adaptation.
15 | - Use the specific styles for style transfer.
16 |
17 | ## Method
18 |
19 | - The data set comprises source images $S$, target images $T$, and
20 | a set of labels for only the source images $Y_S$.
21 |
22 | - The general idea is to learn the representations of $S_C$ (content
23 | of source), $S_S$ (style of source) and similarly, $T_C$ and $T_S$
24 | for the target respectively.
25 |
26 | - Semantic consistency and entropy are the two metrics that are used
27 | to bring the content representations together into the same
28 | distribution.
29 |
30 | - $F_S$ and $F_T$ are both feature extractors that try to disambiguate
31 | style from content.
32 |
33 | - The generators $G_S$ and $G_T$ are given the style of their
34 | respective networks with content from both the source and target
35 | sets. The idea is to train them such that the network is encouraged
36 | to maintain a separation of responsibilities between the style
37 | components $S$ and the content components $C$.
38 |
39 | - $D_F$ is the adversarial component that tries to distinguish between
40 | $C_S$ and $C_T$, which the main network tries to make
41 | indistinguishable.
42 |
43 | - $F_C$ is a standard classifier that needs to be well trained on the
44 | supervised source data $X_S$ and corresponding labels $Y_S$.
45 |
46 | - The source discriminator $D_S$ and the target discriminator $D_T$
47 | both try to predict which among the real image of their domain and the
48 | style-transferred image is the genuine one.
49 |
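A minimal sketch of the adversarial content-alignment idea around $D_F$ described above (hypothetical dimensions and module shapes; not the paper's implementation): $D_F$ is trained to tell source content codes from target ones, while the feature extractors are trained to fool it.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

content_dim = 128                                   # assumed size of the content code
D_F = nn.Sequential(nn.Linear(content_dim, 64), nn.ReLU(), nn.Linear(64, 1))

def d_f_loss(c_s, c_t):
    # D_F tries to distinguish source content codes (label 1) from target ones (label 0).
    logits = torch.cat([D_F(c_s), D_F(c_t)])
    labels = torch.cat([torch.ones(len(c_s), 1), torch.zeros(len(c_t), 1)])
    return F.binary_cross_entropy_with_logits(logits, labels)

def alignment_loss(c_t):
    # The feature extractors are updated so that target content codes look like source ones.
    return F.binary_cross_entropy_with_logits(D_F(c_t), torch.ones(len(c_t), 1))
```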
50 | ### Architecture
51 |
52 | 
53 |
54 | ## Observations
55 |
56 | - The interesting aspect is the transfer learning of the labels done
57 | via cross-alignment of the two data sets.
58 |
59 | - The closest comparison I could derive in NLP would be to learn
60 | paraphrases of sentences by stripping each of style and relying only
61 | on the content information in $C_S$ and $C_T$ for comparison.
62 |
--------------------------------------------------------------------------------
/_reviews/semi-supervised-learning-with-deep-generative-models.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/semi-supervised-learning-with-deep-generative-models.html"
4 | title: 'Semi-supervised Learning with Deep Generative Models'
5 | ---
6 |
7 | # Semi-supervised Learning with Deep Generative Models
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors aim to tackle the issue of insufficient labeled data, which in many domains prompts the use of semi-supervised learning.
13 |
14 | The authors present a stochastic variational inference algorithm that allows for joint optimization of both model and variational parameters, and that is scalable to large data sets.
15 |
16 | ## Background
17 |
18 | The simplest algorithm for semi-supervised learning is a self-training scheme in which the model is bootstrapped with the labeled training data, and the predictions made with high confidence are used as additional labeled examples in an iterative process. This method is heuristic and prone to errors because poor predictions might be reinforced.
19 |
20 | ## Method
21 |
22 | The authors wish to use a variational autoencoder to compress the input representation and then classify the compressed representation.
23 |
24 | The encoder parameters are $\phi$ and the decoder parameters are $\theta$.
25 |
26 | There are 2 components in the overall model:
27 | * $M_1$: A latent discriminative model, which uses a Gaussian inference network $q_{\phi}(z\|x)$ to infer the latent variable $z$.
28 | * $M_2$: A generative semi-supervised model which infers both $z$ and $y$, assuming that $q_{\phi}(z,y\|x)$ has the factorized form $q_{\phi}(z\|x) q_{\phi}(y\|x)$ specified as a Gaussian distribution for $z$ and a multinomial distribution for $y$.
29 |
30 | The inference network $q_{\phi}(z\|x)$ is used on both the labeled and unlabeled data sets. The approximate posterior learned by the encoder is then used as a feature extractor to train the classifier.
31 |
32 | $q_{\phi}(z,y\|x)$ can be treated as a continuous-discrete mixture model, since $z$ is a continuous Gaussian distribution and $y$ is a multinomial distribution.
33 |
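A minimal sketch of the factorized inference network for $M_2$ described above (toy layer sizes are assumptions; not the paper's code): one shared body produces Gaussian parameters for $q_{\phi}(z\|x)$ and multinomial parameters for $q_{\phi}(y\|x)$.

```python
import torch
import torch.nn as nn

class M2Inference(nn.Module):
    def __init__(self, x_dim=784, y_dim=10, z_dim=50, h_dim=500):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(x_dim, h_dim), nn.Softplus())
        self.z_mu = nn.Linear(h_dim, z_dim)      # mean of the Gaussian q(z|x)
        self.z_logvar = nn.Linear(h_dim, z_dim)  # log-variance of the Gaussian q(z|x)
        self.y_logits = nn.Linear(h_dim, y_dim)  # parameters of the multinomial q(y|x)

    def forward(self, x):
        h = self.body(x)
        return self.z_mu(h), self.z_logvar(h), torch.softmax(self.y_logits(h), dim=-1)
```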
34 | ### Experiment Setup
35 | * Evaluated on MNIST
36 | * AdaGrad was the learning algorithm used.
37 |
38 | ### Architecture
39 | 
40 |
41 |
42 | ## Observations
43 | * This paper contains one of the first neural approaches to style/content disentanglement.
44 | * It is not clear to me why the probability distribution of the generated variable $x$ in model $M_2$ needs to be explicitly modeled as a Gaussian or Bernoulli distribution, since we already have the ground truth for $x$
45 | * There is no regularization in the current model to ensure that content doesn't leak into the style representation.
46 |
--------------------------------------------------------------------------------
/_reviews/fader-networks-manipulating-images-by-sliding-attributes.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/fader-networks-manipulating-images-by-sliding-attributes.html"
4 | title: 'Fader Networks: Manipulating Images by Sliding Attributes'
5 | ---
6 |
7 | # Fader Networks: Manipulating Images by Sliding Attributes
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors attempt to disentangle facial features from images and
13 | re-generate images after tuning (fader knobs) certain continuous-valued
14 | attributes of the image like age, expression, gender etc. This is an
15 | encoder-decoder architecture.
16 |
17 | The major difference touted compared to existing methods is that
18 | adversarial training is used to learn the latent space, as opposed to
19 | the decoder output, thus, helping the latent space become invariant to
20 | the attributes (conditioning labels).
21 |
22 | ## Method
23 |
24 | - The attributes, which are binary at training time, can be treated as
25 | continuous during image generation.
26 |
27 | - The data set is pictures of actors with certain attributes, like
28 | 'smile', 'glasses', 'mouth-open' etc.
29 |
30 | - The architecture comprises 3 main components: the encoder, the
31 | discriminator and the decoder. The discriminator is the adversarial
32 | component.
33 |
34 | - The discriminator is trained with a single objective in mind: to
35 | correctly identify the attributes, given an encoded image
36 | representation.
37 |
38 | - The encoder-decoder is trained with 2 objectives in mind:
39 |
40 | - The decoder being able to reconstruct the original input, given
41 | the encoded representation and the true attributes.
42 |
43 | - The encoded representation making it difficult for the
44 | discriminator to ascertain which attributes are present in the
45 | original image.
46 |
47 | - Without the adversarial component, the decoder learns to ignore the
48 | true attributes, and changing these at test time for conditioned
49 | generation makes no difference to the decoder output, which we don't
50 | want.
51 |
52 | - The cost attributed by the discriminator to the encoder loss is
53 | gradually increased from 0 over the course of the training.
54 |
55 | - The encoded image representation is generated by a convolutional
56 | network.
57 |
58 | - Augmentation of the face images is done by flipping the images
59 | horizontally.
60 |
61 | - The generated images were evaluated qualitatively and quantitatively
62 | for naturalness.
63 |
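A minimal sketch of the encoder-decoder objective described above (the modules `E`, `D` and `Dis` and the attribute encoding are hypothetical placeholders; not the paper's implementation): a reconstruction term plus a term that rewards latents from which the discriminator cannot recover the true attributes, weighted by a coefficient annealed up from 0.

```python
import torch.nn.functional as F

def encoder_decoder_loss(x, attrs, E, D, Dis, lam):
    """x: images, attrs: float tensor of binary attributes,
    E(x) -> latent, D(latent, attrs) -> reconstruction, Dis(latent) -> attribute logits,
    lam: discriminator weight, gradually increased from 0 during training."""
    z = E(x)
    recon = F.mse_loss(D(z, attrs), x)
    # Push the latent towards being uninformative about the attributes:
    # reward the encoder when the discriminator predicts the flipped attributes.
    fool = F.binary_cross_entropy_with_logits(Dis(z), 1.0 - attrs)
    return recon + lam * fool
```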
64 | ## Observations
65 |
66 | - The objective is to make the attribute inputs to the decoder the only source of
67 | information about those attributes, so that the latent representation carries none of it.
68 |
69 | - A benefit of not attaching the adversarial network to the decoder output is
70 | that backpropagation can occur even for discrete objectives, like
71 | text sequence prediction.
72 |
--------------------------------------------------------------------------------
/_reviews/learning-to-generate-reviews-and-discovering-sentiment.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/learning-to-generate-reviews-and-discovering-sentiment.html"
4 | title: Learning to Generate Reviews and Discovering Sentiment
5 | ---
6 |
7 | # Learning to Generate Reviews and Discovering Sentiment
8 | ---
9 |
10 | ## Idea
11 |
12 | Given sufficient representation capacity, the neurons in a network can
13 | disentangle higher level features like sentiment from an otherwise
14 | incoherent latent representation. Authors identify a neuron that
15 | encapsulates almost all of the sentiment attribute of a body of text,
16 | the value of which can be tuned to modify the sentiment of generative
17 | functions conditioned on the representation.
18 |
19 | ## Method
20 |
21 | - Character level language modelling is used for benchmarking, trained
22 | on the Amazon product review dataset (82 million product reviews).
23 |
24 | - The neural network architecture used is a single-layer
25 | multiplicative LSTM (mLSTM) 4096 units wide. Training comprised 1 epoch
26 | over mini-batches of 128 subsequences of length 256, amounting to 1 million weight
27 | updates. Training took 1 month across 4 GPUs.
28 |
29 | - Model states are initialized to zeros. Tanh is applied to bound
30 | values between -1 and 1.
31 |
32 | - For each byte, the model updates its hidden state and predicts a
33 | probability distribution over the next possible byte.
34 |
35 | - A logistic regression classifier is trained on the latent representation
36 | for different NLP tasks (relatedness, classification, paraphrasing).
37 | An L1 penalty is used for text classification instead of L2. (The claim is
38 | that this performed better with less data.)
39 |
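A small sketch of the probing setup above: an L1-penalized logistic regression trained on precomputed hidden-state features (random placeholder data here; feature extraction from the mLSTM is assumed to have happened elsewhere).

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
# Placeholder 4096-dimensional "mLSTM features"; real features would come from the model.
train_feats, train_labels = rng.normal(size=(1000, 4096)), rng.integers(0, 2, 1000)
test_feats, test_labels = rng.normal(size=(200, 4096)), rng.integers(0, 2, 200)

clf = LogisticRegression(penalty="l1", solver="liblinear", C=1.0)
clf.fit(train_feats, train_labels)
print(clf.score(test_feats, test_labels))
# With real features, inspecting clf.coef_ shows the weight mass concentrating on a
# few units (e.g. the "sentiment unit" discussed below).
```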
40 | ## Observations
41 |
42 | - Outperforms state-of-the-art sentiment analysis systems, but doesn't
43 | perform as well on subjective-objective and opinion polarity tasks.
44 | Sets a new baseline for movie reviews in the Stanford Sentiment
45 | Treebank.
46 |
47 | - L1 regularization is known to reduce sample complexity when there
48 | are many irrelevant features or outliers, since it is less sensitive
49 | to them.
50 |
51 | - A single unit within the mLSTM directly corresponds to most of the
52 | sentiment classification. The sentiment unit achieves close to state
53 | of the art sentiment classification results and the improvement on
54 | adding the remaining 4095 units of the mLSTM representation is
55 | minor.
56 |
57 | - Since the model is trained on Amazon reviews, it didn't generalize
58 | well to Yelp-style document reviews commenting about hospitality,
59 | location and ambience.
60 |
61 | - The model performs reasonably well on paraphrase detection, but not
62 | on semantic relatedness.
63 |
64 | - A couple of things remain unclear:
65 |
66 | - Why does the sentiment get captured in such a predictable
67 | manner? Could be some property of the mLSTM. Previous work by
68 | Karpathy shows that certain units are activated for different
69 | syntactic requirements in code.
70 |
71 | - Cross-domain training still seems unrealistic.
72 |
--------------------------------------------------------------------------------
/_reviews/generating-sentences-from-a-continuous-space.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/generating-sentences-from-a-continuous-space.html"
4 | title: Generating Sentences from a Continuous Space
5 | ---
6 |
7 | # Generating Sentences from a Continuous Space
8 | ---
9 |
10 | ## Idea
11 |
12 | The paper presents an alternative strategy to language modeling using RNNs. The authors attempt to use this to impute missing words in a sentence, to interpolate between the latent representations of 2 sentences, and also generate sentences by sampling from the space of the latent representation's prior probability.
13 |
14 | ## Background
15 |
16 | * The authors differentiate between types of sentence embeddings. Sequence autoencoders use RNNs as encoders and decoders and are just used to regenerate the original text. Skip-Thought models are similar, but the target sentence is different from the original sentence. Paragraph Vector models simply try to predict the words that are present in a given sentence.
17 |
18 | * Variational autoencoders impose a prior distribution on the latent representation, but a standard autoencoder does not.
19 |
20 | * The latent representation is usually parameterized by a diagonal Gaussian distribution.
21 |
22 | ## Method
23 |
24 | * The prior distribution of the latent representation acts as a regularizer.
25 |
26 | * Unclear what a 'global latent representation' is. Intuitively, each sentence would have its own representation.
27 |
28 | * The authors suggest KL-term annealing, which involves having a cost function like
29 |
30 | $$L(\theta; x) = \alpha (-KL(q_{\theta}(z|x)||p(z))) + E_{q_{\theta}(z|x)}[\log p_{\theta}(x|z)]$$
31 |
32 | where the value of $\alpha$ is raised from 0 to 1 over the course of training. This can be thought of as a steady progression from a standard autoencoder to a VAE (a small schedule sketch follows this list).
33 |
34 | * Word-level dropout used to force the decoder to rely primarily on the latent space for the sentence generation.
35 |
36 | * A beam search strategy is used by the decoder with beam-size 15.
37 |
38 | * Training sequences are read from right to left, to shorten the word dependencies.
39 |
40 | * An alternative to qualitative evaluation is presented by the usage of an adversarial classifier, which is partially similar to what a GAN does. Non-differentiability of the discrete RNN decoder network disallows usage of the adversarial criterion during training.
41 |
42 | * Model is compared against RNNLMs.
43 |
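A small sketch of the KL-term annealing mentioned in the method above (the linear schedule is an assumption; the paper mainly requires $\alpha$ to rise from 0 to 1):

```python
import torch

def kl_weight(step, annealing_steps):
    """Linearly anneal the KL weight alpha from 0 to 1."""
    return min(1.0, step / annealing_steps)

def vae_loss(recon_nll, mu, logvar, alpha):
    # Analytic KL divergence between the diagonal-Gaussian posterior and a standard normal prior.
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_nll + alpha * kl
```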
44 | ## Observations
45 |
46 | * The paper doesn't talk about the presence of dead-zones in the latent space. This should be more of a problem due to the discrete nature of word representations. It also doesn't explain why it is the KL-divergence term, rather than the reconstruction error, that ends up dominating. Perhaps several experiments with a fixed KL-divergence weight would be better for arriving at a correct loss-function balance.
47 |
48 | * It is difficult to train models for which the KL-divergence term dominates.
49 |
50 | * A word-dropout of about 75% seems to work best from qualitative assessments. (These could very well have been cherry-picked)
51 |
52 | * RNNLMs seem to favour very generic sentences, and are less likely to be diverse.
53 |
--------------------------------------------------------------------------------
/_layouts/default.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
54 |
55 |
56 |
64 |
65 | {% if site.google_analytics %}
66 |
74 | {% endif %}
75 |
76 |
77 |
--------------------------------------------------------------------------------
/_reviews/style-transfer-in-text-exploration-and-evaluation.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/style-transfer-in-text-exploration-and-evaluation.html"
4 | title: 'Style Transfer in Text: Exploration and Evaluation'
5 | ---
6 |
7 | # Style Transfer in Text: Exploration and Evaluation
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors cite lack of parallel corpora and reliable evaluation metrics as the roadblocks for style transfer in natural language processing.
13 |
14 | They aim to learn separate content representations and style representations, as is the case with pretty much any work dealing with style transfer in computer vision or natural language processing.
15 |
16 | They measure 2 aspects of style transfer, namely transfer strength and content preservation.
17 |
18 | ## Method
19 |
20 | * The base method compared against is an autoencoder framework
21 | * The authors employ 2 models:
22 | * A multi-decoder [seq2seq model](http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural) that uses different decoders for different styles.
23 | * Style-embeddings augmented decoder (single decoder) to generate outputs in different styles.
24 | * Adversarial objectives are applied to the content representation of both models. The objective is dissimilar to most adversarial objectives as it tries to maximize the entropy of the style label predicted from the content representation, by minimizing $$-\sum_{i=1}^M\sum_{j=1}^N H(P(j|Encoder(x_i; \theta_e); \theta_c))$$ where $M$ is the size of the training data and $N$ is the number of distinct styles (a sketch of this term follows this list).
25 |
26 | * Similar to the [persona-based neural conversation model](https://arxiv.org/abs/1603.06155), a style embedding is learned for each different style. The conditional generation is done using recurrent neural networks, with the inputs being the recurrent network's current state and the style embedding to apply.
27 | * The style embeddings matrix is not directly parameterized by the encoder, but the learning algorithm propagates changes based on how well it combines with the content representation to reconstruct the original text.
28 | * The methods are evaluated in the following manner:
29 | * Transfer strength is evaluated using a simple classifier
30 | * Content preservation is evaluated by computing the cosine distance between the original and the generated text embeddings.
31 |
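A minimal sketch of the entropy-maximization term above (illustrative only; `style_logits` is a hypothetical (batch, $N$) tensor produced by the style classifier applied to the content representation):

```python
import torch

def entropy_maximization_loss(style_logits):
    """Minimizing this loss maximizes the entropy of the style prediction,
    i.e. it makes the content representation uninformative about style."""
    p = torch.softmax(style_logits, dim=-1)
    entropy = -(p * torch.log(p + 1e-8)).sum(dim=-1)
    return -entropy.mean()
```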
32 | ### Architecture
33 |
34 | 
35 |
36 |
37 | ## Observations
38 |
39 | * The authors don't explain:
40 | * Why is a vanilla autoencoder the base model being compared with? Its objective does not optimize for transferring style.
41 | * What ratings qualify as a positive/negative review?
42 | * What kind of decoding strategy was used while predicting sequences? (e.g. greedy search, beam search)
43 | * Which dictionary was used to filter sentimentally polar words for the evaluation?
44 | * The results indicate that the models proposed by the authors in general perform better than the autoencoder in terms of transfer strength.
45 | * Although most of the generated sentences have some semblance of syntactic structure, the semantics are poor.
46 | * The solution seems generalizable to multi-class problems, but the authors have conducted evaluations on only binary-class problems.
47 |
--------------------------------------------------------------------------------
/_reviews/unsupervised-machine-translation-using-monolingual-corpora-only.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/unsupervised-machine-translation-using-monolingual-corpora-only.html"
4 | title: 'Unsupervised Machine Translation Using Monolingual Corpora Only'
5 | ---
6 |
7 | # Unsupervised Machine Translation Using Monolingual Corpora Only
8 | ---
9 |
10 | ## Idea
11 | The authors propose a new neural machine translation system that uses non-parallel text from 2 different languages and learns to translate simply by training a reconstruction model along with a discriminator that aligns the latent spaces of the language models learnt for both languages.
12 |
13 | ## Method
14 | * The core idea used is that the model should be able to reconstruct a sentence in a language given a noisy version of that sentence in the same language.
15 | * The proxy for the noisy version of the sentence is provided by a [previous work](https://arxiv.org/abs/1710.04087) which uses word-embedding space alignment to provide an unsupervised word-by-word translation framework. The result of this work is a generated bilingual lexicon, which can then be used to create a word-by-word translation in the target language, which can be considered 'noisy'.
16 | * From the above point, we can state that the encoder in this model is similar to a denoising autoencoder.
17 | * The source and target sentence latent representations are constrained to lie in the same space by an adversarial discriminator that is trained to differentiate one language from the other solely using their latent embeddings.
18 | * A single encoder and decoder is used for both domains and the parameters are shared. The attention weight parameters are also shared.
19 | * The input to the decoder is a sequence of hidden states as opposed to a single fixed-size vector.
20 | * The encoder and decoder are 3 layered bidirectional LSTMs. The decoder uses greedy decoding.
21 | * While training the reconstruction tasks on the monolingual corpora, noise is added to the input data before feeding it into the encoder. This noisy data is supposed to resemble the noisy word-by-word translations from the other language that the model is expected to translate during inference.
22 | * Word dropout and sentence shuffling have been used for regularization (a sketch of the noise model follows this list).
23 | * The discriminator is trained to recognize which language the encoded representation is from. The input to the discriminator is a series of hidden LSTM states.
24 | * Since this model operates on non-parallel corpora, an additional training signal is how well it can translate and then re-translate a piece of text back to the original language.
25 | * From the ablation studies, it was noted that the most important component was the inclusion of the pretrained bilingual dictionary.
26 |
27 |
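A small sketch of the noise model described above (the drop probability and shuffle window are assumed hyper-parameters, not values from the paper):

```python
import random

def add_noise(tokens, p_drop=0.1, k=3):
    """Drop each word with probability p_drop, then shuffle words locally so that
    no word moves more than roughly k positions from its original place."""
    kept = [t for t in tokens if random.random() > p_drop]
    keys = [i + random.uniform(0, k) for i in range(len(kept))]
    return [t for _, t in sorted(zip(keys, kept))]

print(add_noise("the cat sat on the mat".split()))
```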
28 | ## Architecture
29 |
30 | 
31 |
32 |
33 | ## Observations
34 | * This is one of the first forays into completely unsupervised machine translation. The method achieves a BLEU score of $32.76$ on an English-French translation task.
35 | * First off, the title of the paper is very misleading. The title should say 'non-parallel' corpora instead of 'monolingual' corpora.
36 | * The authors state that the discriminator is an MLP operating on the output of the encoder, which is a sequence of states. It is not clear if the states are simply flattened to be fed into the MLP or whether some other feature extractor is used.
37 |
--------------------------------------------------------------------------------
/_reviews/adversarial-learning-for-neural-dialogue-generation.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/adversarial-learning-for-neural-dialogue-generation.html"
4 | title: Adversarial Learning for Neural Dialogue Generation
5 | ---
6 |
7 | # Adversarial Learning for Neural Dialogue Generation
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors formulate this dialogue model as a reinforcement learning problem. The network used is a Generative Adversarial Network. The discriminator objective is the same as a Turing test predictor's, i.e. it classifies whether the dialogue response is human- or machine-generated. The goal is to improve the generator to the point where the discriminator has trouble distinguishing between human and machine-generated responses.
13 |
14 | ## Method
15 |
16 | * The generator network is a neural seq2seq model, and the discriminator is similar to a Turing test evaluator.
17 | * The generation task is not formulated as an NMT task. Instead, it tries to maximize the likelihood of a response $y = \{y_1, y_2, ..., y_T\}$ given a history of previous sentences $x$.
18 | * The generator defines the policy by which each word of the output sentence $y$ is generated using a softmax over the space of the vocabulary.
19 | * The discriminator uses a hierarchical neural autoencoder to generate a vector representation of an entire sequence of conversation i.e. $\{x, y\}$. This vector representation is then fed into a binary classifier which predicts whether the sentences were human- or machine-generated.
20 | * The generator is trained to maximize the expected reward of the generated utterance using the [REINFORCE algorithm](https://link.springer.com/chapter/10.1007/978-1-4615-3618-5_2).
21 | * The vanilla REINFORCE model doesn't assign rewards to each generated word, but rather assigns equal reward to all the tokens within a predicted sequence of words.
22 | * However, to assign rewards to partially decoded sequences, the discriminator must also be capable of generating classifications for partial sequences. Two methods are proposed to solve this:
23 | * Using a Monte-Carlo search to decode $N (= 5)$ candidate completions given a partial sequence and using the average discriminator score of the $N$ complete sequences as the classification for the partial sequence (see the sketch after this list).
24 | * Training the discriminator to directly also be able to classify partial sequences.
25 | * The Monte-Carlo search strategy was found to be more effective.
26 | * Teacher forcing is used to essentially short-circuit the distance between the generator and the true sequence.
27 | * The generative model is trained using [seq2seq](http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural) and an [attention mechanism](https://arxiv.org/abs/1409.0473). The discriminator is also pre-trained using part of the training data and generating sequences by beam-search and sampling.
28 | * Intuitively, low accuracy of a reasonably well trained discriminator would imply that the quality of generated sentences has improved significantly.
29 |
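A minimal sketch of the Monte-Carlo strategy above (the `rollout` and `discriminator` callables are hypothetical placeholders): the reward for a partial response is the discriminator score averaged over $N$ sampled completions.

```python
def partial_sequence_reward(partial, rollout, discriminator, n=5):
    """rollout(partial) samples one completed response; discriminator(full) returns
    the probability that the completed dialogue turn is human-generated."""
    completions = [rollout(partial) for _ in range(n)]
    return sum(discriminator(c) for c in completions) / n
```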
30 | ## Observations
31 |
32 | * The authors report that the responses generated by their system are more interactive, interesting, and non-repetitive. It'd be interesting to see how they quantify this. UPDATE: The source for this claim is human evaluations, which of course, could be subjective.
33 | * It's also observed that the system yielded better results when the context i.e. the $x$ preceding utterances were limited to 2.
34 | * The hierarchical neural model is the architecture of choice for the discriminator (evaluator).
--------------------------------------------------------------------------------
/_reviews/a-hierarchical-neural-autoencoder-for-paragraphs-and-documents.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/a-hierarchical-neural-autoencoder-for-paragraphs-and-documents.html"
4 | title: A Hierarchical Neural Autoencoder for Paragraphs and Documents
5 | ---
6 |
7 | # A Hierarchical Neural Autoencoder for Paragraphs and Documents
8 | ---
9 |
10 | ## Idea
11 |
12 | This work attempts to use a neural autoencoder to build hierarchical paragraph representations using sentence embeddings and decode the latent representation back into the original paragraph. This is an LSTM based model and different levels of LSTM are used to encode compositionality of token-to-token and sentence-to-sentence relations.
13 |
14 | ## Method
15 | * 3 different autoencoder models are experimented with. The first is a simple version that treats all the document tokens as a single sequence, just like a [Seq2Seq model](http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural). The second is a hierarchical autoencoder and the third a hierarchical autoencoder with an attention mechanism.
16 | * The construction of a hierarchical autoencoder is simple (see the sketch after this list). We assume that we have word embeddings per token. The final hidden state obtained after operating an LSTM over the sequence of tokens in a sentence is taken to be the sentence embedding. Similarly, the final hidden state obtained after operating an LSTM over the sequence of sentence embeddings is the paragraph embedding. The decoding is done in a similar manner: first the sentence embeddings are obtained by unrolling the paragraph vector, and then word vectors are obtained by unrolling the sentence embeddings.
17 | * End of document $e_D$ and end of sentence $e_S$ tokens are treated as word embeddings, to signify the end of a sequence.
18 | * Sequences are predicted using a softmax function, over the space of the vocabulary.
19 | * Attention is computed by allowing the decoder, at each step, to peek at every hidden state generated during the encoding phase. Each input sentence is characterized by a strength indicator $v_i$, which is computed from the hidden state at the last decoder step and the encoder hidden state of sentence $i$. $v_i$ is normalized to create the attention weight $a_i$ of sentence $i$. The attention vector is the sum of the encoder hidden states of the source sentences, each weighted by its $a_i$.
20 | * This attention vector $m_t$ is added as another parameter to the decoder LSTM, in addition to current input $e^s_t$ and previous hidden state of the decoder $h^s_{t-1} (dec)$.
21 | * Training done using stochastic gradient descent with mini-batches.
22 | * Model tested on a hotel reviews corpus and Wikipedia, using the ROUGE and BLEU metrics.
23 | * 1000 dimensional word embeddings used with LSTMs of size 1000. 4 layers of encoding and decoding LSTMs are used.
24 | * Input documents are reversed, similar to the original Seq2Seq paper.
25 | * At most 1.5 times the number of input words are allowed to be generated by the decoder. Unclear how this is controlled.
26 | * ROUGE and BLEU metrics don't evaluate coherence, so a custom grid evaluation metric is used. It measures the degree of preservation of the text order.
27 |
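A minimal sketch of the hierarchical encoding described above (single-layer LSTMs and a batch of one are illustrative simplifications of the paper's 4-layer, 1000-unit setup):

```python
import torch
import torch.nn as nn

word_lstm = nn.LSTM(input_size=1000, hidden_size=1000, batch_first=True)
sent_lstm = nn.LSTM(input_size=1000, hidden_size=1000, batch_first=True)

def encode_paragraph(sentences):
    """sentences: list of (1, n_tokens, 1000) word-embedding tensors, one per sentence."""
    sent_embeds = []
    for s in sentences:
        _, (h, _) = word_lstm(s)                 # final hidden state = sentence embedding
        sent_embeds.append(h[-1])                # shape (1, 1000)
    stacked = torch.stack(sent_embeds, dim=1)    # (1, n_sentences, 1000)
    _, (h, _) = sent_lstm(stacked)
    return h[-1]                                 # paragraph embedding, shape (1, 1000)
```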
28 | ## Observations
29 | * Unclear whether a softmax to predict an end-of-sentence token or a binary classifier to predict end-of-sentence is actually used in the model.
30 | * As expected, Hierarchical + Attention > Hierarchical > Vanilla Seq2Seq, in terms of evaluation metrics.
31 | * Also as expected, performance is better on the hotel reviews corpus, because the format is more consistent.
32 |
--------------------------------------------------------------------------------
/_reviews/improved-variational-autoencoders-for-text-modeling-using-dilated-convolutions.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/improved-variational-autoencoders-for-text-modeling-using-dilated-convolutions.html"
4 | title: Improved Variational Autoencoders for Text Modeling using Dilated Convolutions
5 | ---
6 |
7 | # Improved Variational Autoencoders for Text Modeling using Dilated Convolutions
8 | ---
9 |
10 | ## Idea
11 |
12 | The paper presents an alternative architecture to LSTM based VAEs. As
13 | shown in an [earlier paper](https://arxiv.org/abs/1511.06349), LSTM-VAEs don't have
14 | a significant advantage over LSTM language models. The authors address
15 | this by using a dilated CNN decoder to vary the conditioning context of
16 | the decoder. The hypothesis is that the typical collapse of the loss
17 | function in favor of the KL-divergence term could be addressed by
18 | varying the contextual capacity of the decoder.
19 |
20 | ## Method
21 |
22 | - The authors use a typical LSTM-based encoder model, and a dilated
23 | CNN as the decoder of the VAE.
24 |
25 | - The architecture of the encoder doesn't matter as long as the
26 | posterior of the latent representation resembles a Gaussian with
27 | unit variance.
28 |
29 | - The idea of dilated CNNs was introduced with the intention of
30 | supplying varying contexts of words as features. As opposed to dense
31 | convolutions, dilated convolution skip time-steps to increase the
32 | receptive field of the operation, without increasing the
33 | computational costs. Dilations effectively introduce holes in a
34 | convolutional operation to be able to expand quickly.
35 |
36 | - It is okay for the posterior (latent representation) to not
37 | completely mimic the Gaussian prior. This ensures that the
38 | latent space offers good generative properties.
39 |
40 | - Residual blocks are used for faster convergence and to enable
41 | building deeper architectures.
42 |
43 | - Predictions at each step of the decoder are conditioned on the
44 | convolutional features concatenated with the latent variable $z$.
45 | Context, unlike in typical CNN architectures, is restricted to only
46 | words that appear in previous time-steps (a causal-convolution sketch follows this list).
47 |
48 | - The Gumbel softmax function is used as a continuous approximation of
49 | an otherwise discrete latent variable, in the framework for
50 | semi-supervised text classification.
51 |
52 | - For unsupervised clustering, the authors still use a discrete label
53 | $y$ to encode some information about an unlabeled text $x$, and the
54 | discrete label is then used for clustering.
55 |
56 | - The authors use an LSTM encoder to obtain the latent representation
57 | $z$, followed by the dilated CNN to decode. The LSTM encoder is
58 | shared by the classifier (discriminator), since the final hidden
59 | state is fed to an MLP architecture to obtain a classification.
60 |
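A minimal sketch of a causal dilated convolution of the kind the decoder is built from (channel count, kernel size and dilation are assumptions; not the paper's exact configuration): left-only padding keeps each position conditioned solely on earlier time-steps, and the dilation widens the receptive field cheaply.

```python
import torch.nn as nn
import torch.nn.functional as F

class CausalDilatedConv(nn.Module):
    def __init__(self, channels=512, kernel_size=3, dilation=2):
        super().__init__()
        self.pad = (kernel_size - 1) * dilation
        self.conv = nn.Conv1d(channels, channels, kernel_size, dilation=dilation)

    def forward(self, x):                      # x: (batch, channels, time)
        x = F.pad(x, (self.pad, 0))            # pad only on the left (the past)
        return self.conv(x)                    # output has the same length as the input
```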
61 | ## Observations
62 |
63 | - The large CNN model (LCNN) performed marginally better than the LSTM
64 | language model as long as the encoder was pre-trained using the LSTM
65 | language model. So, this approach still requires a pre-trained LSTM
66 | language model in order to outperform it.
67 |
68 | - It could be argued, as the authors do, that the dilated CNN
69 | architecture to incorporate a larger context of text helps improve
70 | language modelling and text classification performance, as evaluated
71 | by negative log-likelihood of the predicted sequences and
72 | perplexity.
73 |
--------------------------------------------------------------------------------
/_reviews/multispace-variational-encoderdecoders-for-semisupervised-labeled-sequence-transduction.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/multispace-variational-encoderdecoders-for-semisupervised-labeled-sequence-transduction.html"
4 | title: 'Multi-space Variational Encoder-Decoders for Semi-supervised Labeled Sequence Transduction'
5 | ---
6 |
7 | # Multi-space Variational Encoder-Decoders for Semi-supervised Labeled Sequence Transduction
8 |
9 | ## Idea
10 |
11 | The general idea seems similar to style-transfer in text. Labeled
12 | sequence transduction is just a roundabout way of saying that a source
13 | text $x^{(s)}$ is to be transformed into a target text $x^{(t)}$ such
14 | that $x^{(t)}$ is conditioned on the labels $y^{(t)}$.
15 |
16 | ## Background
17 |
18 | The morphological re-inflection problem tries to change a sequence of
19 | characters of an inflected word. For example, convert 'playing' into
20 | 'played', given a set of labels $y^{(t)}$ such that
21 | $y^{(t)}\_{pos}=\text{verb}$ and $y^{(t)}\_{tense}=\text{past}$
22 |
23 | ## Method
24 |
25 | - MSVEDs (system proposed by this paper) use multiple discrete and
26 | continuous latent variables in a character based recurrent network
27 | to model the transduction of inflected words into their re-inflected
28 | forms.
29 |
30 | - The basic architecture emulates a variational autoencoder. The
31 | encoder and decoder parameters are learned in the standard way, by
32 | maximizing the variational lower bound on the marginal
33 | log-likelihood of the data. The neural network parameterizes both
34 | the encoder and decoder weights and back-propagation is done using
35 | the VAE re-parameterization trick.
36 |
37 | - No explicit modeling seems to have been done to disentangle the
38 | latent representation $z$ from the labels $y$
39 |
40 | - Both encoder and decoder architectures are RNNs.
41 |
42 | - The paper also describes a semi-supervised version of the
43 | architecture in which the labels $y$ are inferred directly by a
44 | discriminative classifier on the target sequence $x^{(t)}$. This
45 | discriminative classifier is trained on labeled instances and is
46 | parameterized by an MLP that shares the initial layers of weights
47 | with the encoder of the network. This MLP culminates in a Gumbel
48 | Softmax layer. The Gumbel softmax layer is used to obtain a
49 | continuous approximation of the latent variables, and the
50 | probability distribution varies with the temperature $\tau$:
51 | $\tau \rightarrow 0$ approaches a one-hot encoded softmax, while larger $\tau$ gives
52 | a less confident softmax output (a small sketch follows this list).
53 |
54 | - The KL-term weight is gradually annealed from 0 to a maximum $\lambda_m$, to
55 | prevent the latent representation from collapsing into the Gaussian
56 | prior.
57 |
58 | - Recurrent dropout is used in the form of omitting random words, and
59 | forcing the decoder to rely on the latent representation instead of
60 | just the ground-truth at each time-step.
61 |
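A small sketch of the temperature behaviour of the Gumbel softmax mentioned above, using PyTorch's built-in relaxation (illustrative; not the paper's own implementation):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)                                  # hypothetical label logits
soft = F.gumbel_softmax(logits, tau=1.0, hard=False)         # differentiable, spread-out sample
near_onehot = F.gumbel_softmax(logits, tau=0.1, hard=False)  # low temperature -> near one-hot
```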
62 | ## Observations
63 |
64 | - The experimental observations show that the bi-directional
65 | encoder/decoder model works better than the single directional
66 | model.
67 |
68 | - The results also indicate a performance boost when the model is
69 | trained using unlabeled sequences, but the performance
70 | gains grow more slowly as more data is added.
71 |
72 | - The conventional method performs well when the re-inflection just
73 | involves suffixing the original word, but the MSVED offers better
74 | generalized performance, when the inflected word needs to be
75 | lemmatized and then augmented.
76 |
--------------------------------------------------------------------------------
/_sass/rouge-github.scss:
--------------------------------------------------------------------------------
1 | .highlight table td { padding: 5px; }
2 | .highlight table pre { margin: 0; }
3 | .highlight .cm {
4 | color: #777772;
5 | font-style: italic;
6 | }
7 | .highlight .cp {
8 | color: #797676;
9 | font-weight: bold;
10 | }
11 | .highlight .c1 {
12 | color: #777772;
13 | font-style: italic;
14 | }
15 | .highlight .cs {
16 | color: #797676;
17 | font-weight: bold;
18 | font-style: italic;
19 | }
20 | .highlight .c, .highlight .cd {
21 | color: #777772;
22 | font-style: italic;
23 | }
24 | .highlight .err {
25 | color: #a61717;
26 | background-color: #e3d2d2;
27 | }
28 | .highlight .gd {
29 | color: #000000;
30 | background-color: #ffdddd;
31 | }
32 | .highlight .ge {
33 | color: #000000;
34 | font-style: italic;
35 | }
36 | .highlight .gr {
37 | color: #aa0000;
38 | }
39 | .highlight .gh {
40 | color: #797676;
41 | }
42 | .highlight .gi {
43 | color: #000000;
44 | background-color: #ddffdd;
45 | }
46 | .highlight .go {
47 | color: #888888;
48 | }
49 | .highlight .gp {
50 | color: #555555;
51 | }
52 | .highlight .gs {
53 | font-weight: bold;
54 | }
55 | .highlight .gu {
56 | color: #aaaaaa;
57 | }
58 | .highlight .gt {
59 | color: #aa0000;
60 | }
61 | .highlight .kc {
62 | color: #000000;
63 | font-weight: bold;
64 | }
65 | .highlight .kd {
66 | color: #000000;
67 | font-weight: bold;
68 | }
69 | .highlight .kn {
70 | color: #000000;
71 | font-weight: bold;
72 | }
73 | .highlight .kp {
74 | color: #000000;
75 | font-weight: bold;
76 | }
77 | .highlight .kr {
78 | color: #000000;
79 | font-weight: bold;
80 | }
81 | .highlight .kt {
82 | color: #445588;
83 | font-weight: bold;
84 | }
85 | .highlight .k, .highlight .kv {
86 | color: #000000;
87 | font-weight: bold;
88 | }
89 | .highlight .mf {
90 | color: #009999;
91 | }
92 | .highlight .mh {
93 | color: #009999;
94 | }
95 | .highlight .il {
96 | color: #009999;
97 | }
98 | .highlight .mi {
99 | color: #009999;
100 | }
101 | .highlight .mo {
102 | color: #009999;
103 | }
104 | .highlight .m, .highlight .mb, .highlight .mx {
105 | color: #009999;
106 | }
107 | .highlight .sb {
108 | color: #d14;
109 | }
110 | .highlight .sc {
111 | color: #d14;
112 | }
113 | .highlight .sd {
114 | color: #d14;
115 | }
116 | .highlight .s2 {
117 | color: #d14;
118 | }
119 | .highlight .se {
120 | color: #d14;
121 | }
122 | .highlight .sh {
123 | color: #d14;
124 | }
125 | .highlight .si {
126 | color: #d14;
127 | }
128 | .highlight .sx {
129 | color: #d14;
130 | }
131 | .highlight .sr {
132 | color: #009926;
133 | }
134 | .highlight .s1 {
135 | color: #d14;
136 | }
137 | .highlight .ss {
138 | color: #990073;
139 | }
140 | .highlight .s {
141 | color: #d14;
142 | }
143 | .highlight .na {
144 | color: #008080;
145 | }
146 | .highlight .bp {
147 | color: #797676;
148 | }
149 | .highlight .nb {
150 | color: #0086B3;
151 | }
152 | .highlight .nc {
153 | color: #445588;
154 | font-weight: bold;
155 | }
156 | .highlight .no {
157 | color: #008080;
158 | }
159 | .highlight .nd {
160 | color: #3c5d5d;
161 | font-weight: bold;
162 | }
163 | .highlight .ni {
164 | color: #800080;
165 | }
166 | .highlight .ne {
167 | color: #990000;
168 | font-weight: bold;
169 | }
170 | .highlight .nf {
171 | color: #990000;
172 | font-weight: bold;
173 | }
174 | .highlight .nl {
175 | color: #990000;
176 | font-weight: bold;
177 | }
178 | .highlight .nn {
179 | color: #555555;
180 | }
181 | .highlight .nt {
182 | color: #000080;
183 | }
184 | .highlight .vc {
185 | color: #008080;
186 | }
187 | .highlight .vg {
188 | color: #008080;
189 | }
190 | .highlight .vi {
191 | color: #008080;
192 | }
193 | .highlight .nv {
194 | color: #008080;
195 | }
196 | .highlight .ow {
197 | color: #000000;
198 | font-weight: bold;
199 | }
200 | .highlight .o {
201 | color: #000000;
202 | font-weight: bold;
203 | }
204 | .highlight .w {
205 | color: #bbbbbb;
206 | }
207 | .highlight {
208 | background-color: #f8f8f8;
209 | }
210 |
--------------------------------------------------------------------------------
/_reviews/toward-controlled-generation-of-text.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/toward-controlled-generation-of-text.html"
4 | title: Toward Controlled Generation of Text
5 | ---
6 |
7 | # Toward Controlled Generation of Text
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors aim to disentangle representations of style and content in the latent code of a Variational AutoEncoder. The style is called the **structured code** and is learned by discriminators for each attribute that needs to be disentangled from the latent space.
13 |
14 | ## Method
15 | * This method does not use adversarial training.
16 | * The basic approach can be described as below:
17 | * $x$ is the source corpus
18 | * The encoder is parameterized to generate a latent code $z$, which is a variational latent space that resembles a Gaussian prior. (This is enforced by a KL-divergence loss)
19 | * The structured code $c$ is a known label of the text (discrete or continuous).
20 | * The decoder generator produces the output corpus $\hat{x}$ conditioned on $(z, c)$. It uses greedy decoding.
21 | * A classifier/regressor discriminator predicts the structured code of the output corpus $\hat{x}$ to ensure that it is the same as the one the generator was conditioned on i.e. $G(z, c)$. The discriminator is pretrained.
22 | * Each decoder step in $\hat{x}$ is predicted using a softmax function scaled by a temperature $\tau$. Higher temperatures flatten the softmax distribution for each word prediction and increase word diversity. Conversely, letting $\tau \rightarrow 0$ approaches a hardmax. For their experiments the authors gradually anneal $\tau \rightarrow 0$ (a small sketch follows this list).
23 | * The authors describe 3 separate losses to train their model.
24 | * A reconstruction loss that ensures that the generated sentence $\hat{x}$ is the same as the original sentence $x$. This is equivalent to minimizing the negative log-likelihood of generating $\hat{x}$.
25 | * A discriminator validates if the predicted class/value for $\hat{x}$ is the same as the corresponding class/value for $x$. This is a cross-entropy loss over the probability distribution of the labels. This discriminator loss can be further subdivided into 2 terms.
26 | * Maximize the expected log likelihood of predicting the correct distribution of the structured code $c$ given the labelled examples $X_L$. This happens before the generator model training.
27 | * Maximize the expected log likelihood of predicting the correct distribution of the structured code $c$ given the generated sentences $\hat{x}$. Also minimize the empirically observed Shannon entropy of the observed discriminator prediction $q_D(c'\|\hat{x})$, which reduces uncertainty and increases confidence of the structured code prediction.
28 | * The encoder from loss 1, is used to regenerate the latent distribution $z$ devoid of the structured code from the output distribution $\hat{x}$. The authors call this an **independence constraint**, in that regardless of the structured code $c$ that is currently present in either $x$ or $\hat{x}$, processing either through the generator should produce a consistent $z$. This allows the encoder to encode only latent factors that are independent of the structured code.
29 | * [A wake-sleep algorithm](http://science.sciencemag.org/content/268/5214/1158) is used to alternately train the generator and discriminator.
30 | * The model was applied only to short sentences with length $<15$ words.
31 | * The encoder/decoder setup is implemented using single-layer LSTMs and the discriminator is implemented using a conv-net. The KL term weight is annealed from 0 to 1 during training.
32 |
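A small sketch of the temperature-scaled decoder softmax described above (illustrative; not the authors' code):

```python
import torch

def sample_token(logits, tau):
    """Higher tau flattens the distribution; tau -> 0 approaches a hardmax (argmax)."""
    if tau == 0:
        return logits.argmax(dim=-1)
    probs = torch.softmax(logits / tau, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)
```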
33 | ### Architecture
34 |
35 | 
36 |
37 | ### Learning Process
38 |
39 | 
40 |
41 | ## Observations
42 | * The model performs better than the [S-VAE](http://papers.nips.cc/paper/5352-semi-supervised-learning-with-deep-generative-models) implementation in terms of sentiment accuracy of generated sentences.
43 | * From the reported results, it seems that adding the independence constraint helps the generated sentences retain the content successfully.
44 |
--------------------------------------------------------------------------------
/_reviews/infovae-information-maximizing-variational-autoencoders.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/infovae-information-maximizing-variational-autoencoders.html"
4 | title: 'InfoVAE: Information Maximizing Variational Autoencoders'
5 | ---
6 |
7 | # InfoVAE: Information Maximizing Variational Autoencoders
8 |
9 | ## Idea
10 |
11 | The authors indicate that the variational inference training objective
12 | as defined in the [original VAE paper](https://arxiv.org/abs/1312.6114) is not expressive
13 | enough for a good generative model, but more expressive conditional
14 | distributions end up ignoring the latent space altogether. The authors
15 | wish to address this by proposing a new training objective for
16 | Variational Autoencoders.
17 |
18 | ## Background
19 |
20 | The variational inference lower bound is derived in the original paper
21 | as
22 |
23 | $$\begin{align*}
24 | \mathcal{L}_{ELBO}
25 | &= - D_{KL}(q_{\phi}(z|x) || p_{\theta}(z)) +
26 | \mathbb{E}_{q_{\phi}(z|x)} [\log p_{\theta}(x|z)] \\
27 | &\leq \log p_{\theta}(x)
28 | \end{align*}$$
29 |
30 | where the first term is the KL divergence loss that
31 | encourages the inferred latent space to be similar to a prior usually a
32 | Gaussian distribution and the second term minimizes the negative log
33 | likelihood of observing the data point $x$ given the inferred latent
34 | variable $z$.
35 |
36 | ## Method
37 |
38 | The authors cite 2 problems with the ELBO objective, 'information
39 | preference' and 'exploding latent space':
40 |
41 | - **Information Preference** The original ELBO term can be re-written
42 | as a sum of 2 divergences.
43 |
44 | $$\mathcal{L}_{ELBO} = - D_{KL}(p_{data}(x) || p_{\theta}(x)) - \mathbb{E}_{p_{data}}[D_{KL}(q_{\phi}(z|x) || p_{\theta}(z|x))]$$
45 |
46 | The first divergence becomes 0 when the reconstruction is perfect,
47 | and the second becomes 0 when $x$ and $z$ are independent under
48 | $p_{\theta}$ and $q_{\phi}$ and no information is gained from the
49 | latent code.
50 |
51 | - **Exploding latent space** The learned distribution
52 | $q_{\phi}(z|x_i)$ could be a $\delta$-distribution centered at
53 | $x_i$, making this seem optimal for the reconstruction loss.
54 | However, this is a case of extreme over-fitting, because a
55 | $p_{\theta}$ mapping could be learned for every $q_{\phi}(z|x_i)$
56 | that could lead to good reconstruction. This is not beneficial,
57 | because we want the $q_{\phi}(z)$ to be almost the same as the prior
58 | $p(z)$ and this causes this learning algorithm to learn a bijection,
59 | instead of a generalized representation.
60 |
61 | ### Proposed Solution
62 |
63 | Instead of minimizing the previous KL-divergence
64 | $- D_{KL}(q_{\phi}(z|x) || p_{\theta}(z))$, try to minimize
65 | $- D_{KL}(q_{\phi}(z) || p_{\theta}(z))$ where
66 |
67 | $$q_{\phi}(z) = \int_{x} q_{\phi}(z|x) p_{data}(x) dx$$
68 |
69 | Since this cannot be computed directly, we need to use a likelihood-free
70 | optimization technique. The InfoVAE objective can thus be written as
71 |
72 | $$\mathcal{L}_{InfoVAE} = - \lambda D_{KL}(q_{\phi}(z) || p_{\theta}(z)) + \mathbb{E}_{q_{\phi}(z|x)} [\log p_{\theta}(x|z)]$$
73 |
74 | for any $\lambda > 0$
75 |
76 | ### Optimization Techniques
77 |
78 | - Adversarial training to minimize the Jensen-Shannon divergence
79 | between $q_{\phi}(z)$ and $p(z)$.
80 |
81 | - Stein variational gradient that descends
82 | $D_{KL}(q_{\phi}(z) || p_{\theta}(z))$
83 |
84 | - Maximum mean discrepancy (MMD), computed by comparing all the
85 | moments of the two distributions. This can be done using the kernel trick (a sketch follows this list).
86 |
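A minimal sketch of an MMD estimate between samples from $q_{\phi}(z)$ and the prior (the Gaussian kernel and bandwidth are assumptions; the paper does not prescribe this exact estimator):

```python
import torch

def rbf_kernel(a, b, sigma=1.0):
    return torch.exp(-torch.cdist(a, b).pow(2) / (2 * sigma ** 2))

def mmd(z_q, z_p, sigma=1.0):
    """z_q: samples from the aggregate posterior, z_p: samples from the prior p(z)."""
    return (rbf_kernel(z_q, z_q, sigma).mean()
            + rbf_kernel(z_p, z_p, sigma).mean()
            - 2 * rbf_kernel(z_q, z_p, sigma).mean())
```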
87 | ### Experiments
88 |
89 | The authors use 2 strategies to empirically measure the distance between
90 | $q_{\phi}(z)$ and $p(z)$
91 |
92 | - Calculate the MMD statistic over the entire data
93 |
94 | - Calculate the log determinant of the covariance matrix of the
95 | distribution $q_{\phi}(z)$. When $p(z)$ is the standard
96 | Gaussian and $q_{\phi}(z)$ emulates that distribution,
97 | $\log[\det(\Sigma_{q_{\phi}})] = 0$.
98 |
99 | ## Observations
100 |
101 | - The improvements suggested are only for the more expressive
102 | generators and don't apply to simple conditional distribution
103 | families for $p_{\theta} (x|z)$ like a Gaussian distribution.
104 |
105 | - The MMD optimization seems to perform best empirically.
106 |
--------------------------------------------------------------------------------
/_reviews/beam-search-strategies-for-neural-machine-translation.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/beam-search-strategies-for-neural-machine-translation.html"
4 | title: Beam Search Strategies for Neural Machine Translation
5 | ---
6 |
7 | # Beam Search Strategies for Neural Machine Translation
8 | ---
9 |
10 |
11 | ## Idea
12 |
13 | The standard beam search strategy for Neural Machine Translation (NMT)
14 | is for the decoder to predict the target sequence word-by-word and
15 | maintain a fixed amount of potential word candidates to predict at each
16 | step. The paper proposes a more flexible decoder strategy, by pruning
17 | the search graph, and reducing the number of candidates with the same
18 | partial hypothesis (shared past).
19 |
20 |
21 | ## Background
22 |
23 | The drawbacks of a vanilla beam-search is that it is less adaptive,
24 | because:
25 |
26 | - Some candidates might not be as good as the current best
27 |
28 | - Good candidates might not be considered because they missed out on
29 | the threshold for candidate inclusion marginally. Addressing this by
30 | naively increasing beam search size will result in slowing down the
31 | decoder.
32 |
33 | $$beam\_size \propto model\_accuracy$$
34 |
35 | $$beam\_size \propto \frac{1}{model\_performance}$$
36 |
37 | Standard beam search builds a translation from left-to-right and keeps a
38 | fixed number (beam) of translation candidates with the highest
39 | log-probability at each time step. If an end-of-sequence (EOS) token is
40 | encountered, the beam-size is reduced by 1 and the sequence is added to
41 | the candidate list of translations. When the beam-size eventually
42 | becomes 0, the log-probability of all the sequences weighted by sequence
43 | length is evaluated and the translation with the highest log-probability
44 | score is picked.
45 |
46 |
47 | ## Method
48 |
49 | 4 separate beam search strategies are proposed. One or more of these can
50 | be applied on top of vanilla beam search.
51 |
52 | - Relative threshold pruning: This eliminates candidates that are
53 | beyond a relative distance from the current best candidate (a pruning sketch follows this list).
54 |
55 | $$score(cand) \leq rp * \max_{c \in C} {score(c)}$$
56 |
57 | where $rp$ is the pruning threshold.
58 |
59 | - Absolute threshold pruning: Almost the same as the previous strategy
60 | except that the difference between the best and pruned candidates is
61 | determined by an absolute value $ap$.
62 |
63 | $$score(cand) \leq \max_{c \in C} {score(c)} - ap$$
64 |
65 | - Relative local threshold pruning: This is similar to the first
66 | strategy, but instead of considering the log-probability of the
67 | entire sequence, only the log-probability of the last generated word
68 | is considered.
69 |
70 | $$score_w(cand) \leq rpl * \max_{c \in C} {score_w(c)}$$
71 |
72 | - Maximum candidates per node: This strategy eliminates candidates
73 | based on whether too many current candidates share the same
74 | predecessor words, in an attempt to diversify the predictions.
75 |
76 | $$\text{prune } cand_{pred = P} \text{ if } \sum_{c \,:\, pred(c) = P} 1 > mc$$
77 |
78 | where $mc$ is the maximum candidate threshold for candidates
79 | that share common predecessor words.
80 |
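A small sketch of the relative and absolute threshold pruning rules above applied to a list of scored candidates (the threshold values are placeholders, not the paper's tuned settings):

```python
def prune_candidates(scored, rp=0.6, ap=2.5):
    """scored: list of (candidate, score) pairs; keep candidates that survive both
    the relative-threshold and the absolute-threshold tests from the formulas above."""
    best = max(score for _, score in scored)
    kept = []
    for cand, score in scored:
        passes_relative = score > rp * best
        passes_absolute = score > best - ap
        if passes_relative and passes_absolute:
            kept.append((cand, score))
    return kept
```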
81 | English $\rightarrow$ German translation done at the sub-word level
82 | (reduces the computational complexity of the output softmax). English
83 | $\rightarrow$ Chinese translation done at the word level.
84 |
85 | NMT implementation similar to the [Neural Attention](https://arxiv.org/abs/1409.0473)
86 | paper. Embedding dimension of 620 used, with RNN GRU unit for the latent
87 | representation of 1000 units. SGD is the learning algorithm used. Number
88 | of epochs not discussed, but a batch-size of 64 is used for training.
89 |
90 | Experiments done to choose the most conservative threshold that doesn't
91 | degrade translation accuracy.
92 |
93 | Beam-sizes of 5 to 14 are experimented with.
94 |
95 |
96 | ## Observations
97 |
98 | - Relative pruning worked best for beam-size 5, and absolute pruning
99 | worked best for beam-size 14.
100 |
101 | - For English $\rightarrow$ German, speed-up of 13% for beam-size 5,
102 | and 43% for beam-size 14.
103 |
104 | - For English $\rightarrow$ Chinese, speed-up of 10% for beam-size 5,
105 | and 24% for beam-size 14.
106 |
107 | - Adding diversity to the decoder using the Maximum Candidates pruning
108 | strategy did not improve translation results.
109 |
111 |
--------------------------------------------------------------------------------
/_reviews/natural-language-processing-almost-from-scratch.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/natural-language-processing-almost-from-scratch.html"
4 | title: 'Natural Language Processing (almost) from Scratch'
5 | ---
6 |
7 | # Natural Language Processing (almost) from Scratch
8 | ---
9 |
10 | ## Idea
11 |
12 | The paper attempts to train a generic single learning system for multi-task learning. The tasks include Part-of-Speech (POS) tagging, chunking (CHUNK), Named Entity Recognition (NER) and Semantic Role Labeling (SRL). The authors intend to achieve this without hand-engineering task-specific features, and instead rely on a large amount of unlabeled data. They also wish to avoid baselines that have been created using differently labeled data.
13 |
14 | ## Background
15 | * The state-of-the-art (SoTA) system for POS tagging uses bidirectional sequence decoders (Viterbi algorithm) and maximum entropy classifiers to determine which among a set of pre-defined tags can be attributed to a token.
16 | * Chunking is essentially the same as POS-tagging, but for phrases instead of single words. SoTA for chunking uses pairwise SVM-classifiers, for which the features were word-contexts. Matrix SVD based methods have also been successful.
17 | * For NER, the SoTA is a linear model combined with Viterbi decoding, where the features include the tokens themselves, the POS tags, CHUNK tags, suffixes and prefixes.
18 | * SRL is similar to obtaining an entity-relation model from unstructured data (text). SoTA on SRL are parse trees, CHUNK and POS tags, voice, types of verb etc. in combination with context-window classifiers.
19 |
20 | ## Method
21 | * The system used by the authors is a simple MLP architecture with minimal pre-processing and no task-specific engineered features.
22 | * The first layer extracts word-level features and the second layer extracts sentence-level features. The architecture uses the equivalent of an embedding/lookup layer that models the language by learning dense representations of words. Initial embedding layer size was 50 (increased to 500 in later experiments). Vocab size varied from 100k to 130k words. Sentence level embedding is learnt by using convolutions.
23 | * The authors also propose training a small network, and using the trained embeddings to initialize a larger network, as a form of transfer learning.
24 | * There could be multiple lookup tables, with the feature vector for a word being the concatenation of all the lookup tables entries.
25 | * For some tasks, the training objective is a multi-class softmax probability and for others, the objective is to collectively maximize the probability of the entire sequence rather than the prediction at each step individually.
26 | * In contrast to previous methods, the authors use the $tanh$ non-linearity as the activation function in their neural network architecture. The authors also explain that not having non-linear activations in a multi-layer architecture is the same as having only a single layer.
27 | * For convolutions, the tag prediction is done for the word in the middle of the convolutional window (a sketch of this window approach follows this list). Padding is done to ensure that there are tokens preceding the actual first word, and tokens following the last word.
28 | * The training objective is the maximization of the log-likelihood and the optimizer used is stochastic gradient ascent.
29 | * Sequence-like decoding for each of the tasks is done using the Viterbi algorithm that uses dynamic programming to maximize the likelihood of entire sequences.
30 | * The proposed system eliminates the need of parse-trees for the SRL task. The authors hint at this being a deviation from Chomsky grammars which is hierarchical, and describe this as a Harris grammar which is similar to a set of functions being applied on top of a sequence.
31 | * The authors also point to how features from different tasks can be combined into a smaller shared subspace, and two objectives could be trained alternately: one for the shared (meta) learner and one for the task-specific learners.
32 | * The authors implemented the neural network from scratch in C, without a symbolic computation framework. This is presumably a very time-consuming effort, since every back-propagated gradient has to be derived by hand and then coded.
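
As a rough illustration of the window approach described above (concatenated lookup tables, padding, and a $\tanh$ MLP), here is a minimal PyTorch sketch; it is not the authors' original C implementation, and the table sizes, the capitalization feature and the tag count are assumptions made for the example:

```python
import torch
import torch.nn as nn

class WindowTagger(nn.Module):
    """Predicts a tag for the word in the middle of a fixed-size window."""
    def __init__(self, vocab_size=100_000, caps_size=4, emb_dim=50,
                 window=5, hidden=300, n_tags=45):
        super().__init__()
        # Multiple lookup tables; a word's feature vector is the
        # concatenation of all of its table entries.
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.caps_emb = nn.Embedding(caps_size, 5)   # hypothetical capitalization feature
        feat_dim = (emb_dim + 5) * window
        self.mlp = nn.Sequential(
            nn.Linear(feat_dim, hidden),
            nn.Tanh(),                               # non-linearity between layers
            nn.Linear(hidden, n_tags),
        )

    def forward(self, word_windows, caps_windows):
        # (batch, window) index tensors, already padded so that the first and
        # last words of a sentence still have full left/right contexts.
        feats = torch.cat([self.word_emb(word_windows),
                           self.caps_emb(caps_windows)], dim=-1)
        return self.mlp(feats.flatten(start_dim=1))  # (batch, n_tags) tag scores

tagger = WindowTagger()
scores = tagger(torch.randint(0, 100_000, (8, 5)), torch.randint(0, 4, (8, 5)))
loss = nn.CrossEntropyLoss()(scores, torch.randint(0, 45, (8,)))  # word-level objective
```

The sentence-level (convolutional) variant and the Viterbi decoding over whole sequences are omitted here for brevity.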
33 |
34 | ## Observations
35 | * One pertinent question is whether multi-task learning is only effective if the subtasks are not completely orthogonal to each other. This is not evident from the paper's conclusions because a few of the subtasks are dependent on features extracted from the other tasks.
36 | * The network displays consistently good performance across all tasks, sometimes even surpassing the baselines, especially when provided with additional data, and by using transferred embeddings.
37 |
--------------------------------------------------------------------------------
/_reviews/adversarial-generation-of-natural-language.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/adversarial-generation-of-natural-language.html"
4 | title: Adversarial Generation of Natural Language
5 | ---
6 |
7 | # Adversarial Generation of Natural Language
8 | ---
9 |
10 | ## Idea
11 |
12 | The paper tries to leverage the success of GANs in the computer vision domain to generate language. The authors attempt to address the discrete-space problem without using gradient estimators. They also generate language that adheres to context-free grammars and perform conditional generation using sentence attributes.
13 |
14 | ## Background
15 |
16 | * [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661) comprise a generator that tries to mimic the original distribution $P(x)$ by transforming Gaussian noise $P(z)$ through a generator function $G(z)$, and a discriminator that determines whether samples come from the true distribution $P(x)$ or from $G(z)$.
17 | * Teacher forcing feeds the ground-truth output of the previous time step as one of the inputs to the current time step, which corresponds to MLE training of the model, i.e. during training, the actual ground truth of the previous timestep is used as an additional information signal for the present timestep. The paper also mentions the need to eliminate exposure bias, which arises because the model never sees its own predictions during training, so a bad prediction early in the sequence has a cascading effect on the overall quality of the sequence.
18 | * The paper comments on the Lipschitz constraint from the [Wasserstein GAN](https://arxiv.org/abs/1701.07875) paper as an important finding. This constraint clips the weights of the discriminator network so that they lie in a fixed interval, and the discriminator is trained multiple times per generator update (sketched below). This is supposed to protect against the discriminator easily distinguishing between the one-hot encoded true-data distribution and the dense real-valued predictions. WGANs are supposed to provide better gradients to the generator because the Lipschitz constraint prevents discriminator saturation.
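
A minimal sketch, assuming PyTorch and placeholder generator/critic modules, of what this looks like in practice: the critic's weights are clipped to a small interval after each update, and the critic is updated several times per generator update.

```python
import torch

def wgan_step(generator, critic, real_batch, opt_g, opt_c,
              clip=0.01, n_critic=5, z_dim=64):
    # Several critic updates per generator update.
    for _ in range(n_critic):
        z = torch.randn(real_batch.size(0), z_dim)
        fake = generator(z).detach()
        loss_c = critic(fake).mean() - critic(real_batch).mean()
        opt_c.zero_grad(); loss_c.backward(); opt_c.step()
        # Enforce the Lipschitz constraint by clipping the critic's weights.
        for p in critic.parameters():
            p.data.clamp_(-clip, clip)
    # One generator update against the (clipped) critic.
    z = torch.randn(real_batch.size(0), z_dim)
    loss_g = -critic(generator(z)).mean()
    opt_g.zero_grad(); loss_g.backward(); opt_g.step()
```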
19 |
20 | ## Method
21 |
22 | * Experiments are performed on both recurrent and convolutional models, as well as with curriculum learning.
23 | * Recurrent Model:
24 | * The model architecture itself doesn't seem novel from the point of view of a typical GAN. Teacher-forcing is used at each time step to condition the output of the current timestep on the ground-truth of the last timestep. The paper notes that in a vanilla RNN, the encoding phase requires the current time step to be conditioned on information from the previous timesteps contained in the hidden state, but this doesn't hold true for the generating (decoding) phase, which is a problem teacher-forcing tries to address.
25 | * There is a slight analogy between teacher-forcing and attention, in that teacher forcing peeks at the ground truth from the previous time-step and attention mechanisms peek at different hidden states from the encoding phase.
26 | * Greedy decoding is performed to predict the next character/word.
27 | * Convolutional Model:
28 | * The convolutional model consists of 5 residual blocks with 1D convolutional layers. A residual block involves 2 convolutional operations, followed by adding the original input to the output of the $2^{nd}$ convolutional layer (see the sketch after this list).
29 | * No pooling or dropout is used. Regularization is done using batch-normalization layers.
30 | * [Curriculum Learning](https://dl.acm.org/citation.cfm?id=1553380): As opposed to the timestep by timestep prediction techniques of the previous 2 methods, curriculum learning is used to predict entire sequences.
31 | * Evaluation is done by parsing the generated sentences with a constituency parser to check whether they adhere to the production rules of a context-free grammar (CFG); the true data was itself produced from this CFG.
32 | * The paper also suggests conditional generation of language based on a togglable vector (question vs. statement generation; positive vs negative sentiment generation). The method suggested is concatenating a vector of 1s or 0s to the output convolutional layer depending on whether or not the attribute in question is present. The paper doesn't explain how this is achieved with the LSTM based GAN. Presumably, this concatenation can be done on the fixed sized latent vector obtained after the LSTM is run over the input sequence.
33 | * Learning algorithms used are Adam and SGD. LSTM generator/discriminator training is smoothed.
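
A sketch of one such residual block; the channel count, kernel size and stacking are illustrative assumptions rather than the paper's exact configuration:

```python
import torch
import torch.nn as nn

class ResBlock1D(nn.Module):
    """Two 1D convolutions whose output is added back to the block input."""
    def __init__(self, channels=512, kernel=5):
        super().__init__()
        pad = kernel // 2  # keep the sequence length unchanged
        self.block = nn.Sequential(
            nn.Conv1d(channels, channels, kernel, padding=pad),
            nn.BatchNorm1d(channels),   # batch norm instead of dropout/pooling
            nn.ReLU(),
            nn.Conv1d(channels, channels, kernel, padding=pad),
            nn.BatchNorm1d(channels),
        )

    def forward(self, x):               # x: (batch, channels, seq_len)
        return x + self.block(x)        # residual connection

x = torch.randn(2, 512, 32)
y = nn.Sequential(*[ResBlock1D() for _ in range(5)])(x)  # 5 stacked blocks
```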
34 |
35 | ## Observations
36 |
37 | * WGAN and WGAN-GP (a variant with a gradient penalty discriminator term to avoid saturation) are the only models able to generalize to sequences of length 11 for the LSTM-based GANs. (Tests performed for sequences of length 5 and 11)
38 | * WGAN-GP performed similarly well for the CNN-based GAN.
39 |
--------------------------------------------------------------------------------
/_reviews/infogan-interpretable-representation-learning-by-information-maximizing-generative-adversarial-nets.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/infogan-interpretable-representation-learning-by-information-maximizing-generative-adversarial-nets.html"
4 | title: |
5 | InfoGAN: Interpretable Representation Learning by Information Maximizing
6 | Generative Adversarial Nets
7 | ---
8 |
9 | # InfoGAN: Interpretable Representation Learning by Information Maximizing Generative Adversarial Nets
10 | ---
11 |
12 | ## Idea
13 |
14 | The motivation behind this work is to be able to learn interpretable, disentangled representations from a latent space that would otherwise not exhibit these properties. This is achieved by maximizing the mutual information between a subset of the latent variables and the observable (known) variable.
15 |
16 | ## Background
17 |
18 | Representation learning learns a dense embedding of the entities in our data set which can then be used for downstream tasks. It is an unsupervised form of feature extraction.
19 |
20 | ### Generative Adversarial Networks
21 | * Typically, generative models, i.e. decoders capable of producing a reasonably good approximation of the source data distribution, are good indicators of a well-learned representation, e.g. [GANs](https://arxiv.org/abs/1406.2661) and [VAEs](https://arxiv.org/abs/1312.6114).
22 | * This paper builds off the idea of GANs. The min-max game played by the generator and discriminator in GANs is given by the equation
23 |
24 | $$\min_G \max_D V(D, G) = \mathbb{E}_{x \sim P_{data}}[\log D(x)]
25 | + \mathbb{E}_{z \sim \text{noise}}[\log(1 - D(G(z)))]$$
26 |
27 | * The first term is the expected log-probability the discriminator assigns to samples from the true data distribution being real, and the second term is the expected log-probability it assigns to samples generated from noise being fake. From the perspective of the discriminator, ideally $D(\cdot) = 1$ if the data point is from the true distribution and $D(\cdot) = 0$ if the data point is from the sampled distribution.
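
A small sketch of how the two expectations above are estimated from mini-batches; `D` and `G` here are placeholder callables (a discriminator returning probabilities and a generator returning samples):

```python
import torch

def gan_value(D, G, real, z):
    term_real = torch.log(D(real)).mean()      # E_{x ~ P_data}[log D(x)]
    term_fake = torch.log(1 - D(G(z))).mean()  # E_{z ~ noise}[log(1 - D(G(z)))]
    return term_real + term_fake               # D maximizes this, G minimizes it
```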
28 |
29 | ### Mutual Information
30 | * Mutual information between two random variables $X$ and $Y$ is defined as the amount of information learned about one of them, say $Y$, from the other, say $X$. This can be expressed as the difference of 2 entropy terms like so:
31 |
32 | $$I(X;Y) = H(X) - H(X|Y) = H(Y) - H(Y|X)$$
33 |
34 | * Maximizing the mutual information here, in the context of the first expression would mean that the distribution $P(X\|Y)$ is a lot less uncertain than the distribution $P(X)$ and hence can be, on average, expressed in fewer information bits.
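
As a concrete check of the identity above, here is a tiny example with a discrete joint distribution (the numbers are arbitrary):

```python
import numpy as np

p_xy = np.array([[0.4, 0.1],   # joint P(X, Y) over two binary variables
                 [0.1, 0.4]])
p_x, p_y = p_xy.sum(axis=1), p_xy.sum(axis=0)

H = lambda p: -sum(pi * np.log2(pi) for pi in p if pi > 0)   # entropy in bits
H_x = H(p_x)                                                 # H(X) = 1 bit
H_x_given_y = sum(p_y[j] * H(p_xy[:, j] / p_y[j]) for j in range(2))
print(H_x - H_x_given_y)                                     # I(X;Y) ≈ 0.278 bits
```

Knowing $Y$ cuts the uncertainty about $X$ from 1 bit to roughly 0.72 bits, so the mutual information is about 0.28 bits.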
35 |
36 | ## Method
37 | * In addition to the standard latent variable $z$, the authors introduce a separate latent code $c$ that is meant to be the interpretable part of the latent space.
38 | * The authors propose an information-theoretic regularization to prevent the explicitly modeled latent code $c$ from being bypassed by the generator $G(z,c)$. They formulate this constraint by stating that the mutual information between the latent code and the generator distribution must be high, i.e. $I(c; G(z,c))$ must be maximized. This would imply that $c$ and $G(z,c)$ are strongly dependent, and that $c$ is not lost in the generation process. $I(c; G(z,c))$ can be rewritten such that
39 |
40 | $$I(c; G(z,c)) = H(c) - H(c|G(z,c))$$
41 |
42 | * Since $H(X) = \mathbb{E}[-\log P(X)]$, we can say that
43 |
44 | $$I(c; G(z,c)) = \mathbb{E}_{x \sim G(z,c)}[\mathbb{E}_{c' \sim P(c\|x)}[\log P(c'\|x)]] + H(c)$$
45 |
46 | * Now, since we don't know the posterior $P(c\|x)$, we can use an auxiliary distribution to approximate it and use the KL-divergence between the true and auxiliary distributions to derive a lower bound for the mutual information, which is
47 |
48 | $$I(c; G(z,c)) \geq \mathbb{E}_{c \sim P(c), x \sim G(z,c)}[\log Q(c\|x)] + H(c)$$
49 |
50 | where $Q(c\|x)$ is the distribution that approximates the posterior $P(c\|x)$.
51 | * Let $L_I(G,Q)$ be the lower bound of the mutual information $I(c; G(z,c))$. We want to maximize this (or, minimize the negative lower bound, in practice).
52 | * The authors derive the variational information maximization lower-bound to entangle $c$ with $G(z,c)$ and add it to the minimax game of a vanilla GAN, with a hyperparameter $\lambda$
53 |
54 | $$\min_{G,Q} \max_D V_{InfoGAN}(D,G,Q) = V_{GAN}(D,G) - \lambda L_I(G,Q)$$
55 |
56 | * The implementation uses the training techniques introduced by [DCGAN](https://arxiv.org/abs/1511.06434) to stabilize training.
57 | * The experiments use 3 latent factors: $c_1 \sim \text{Cat}(K=10, p=0.1)$, a categorical distribution representing the one-of-ten possible digit values in MNIST, and $c_2, c_3 \sim \text{Unif}(-1,1)$, which are hoped to capture other semantics of the digits.
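
A hedged sketch of how the lower bound $L_I(G, Q)$ is typically estimated for a categorical code: sample $c$, generate $x = G(z, c)$, and score $\log Q(c\|x)$ with the auxiliary network $Q$. The networks and sizes here are placeholders, not the paper's exact architecture.

```python
import torch
import torch.nn.functional as F

def info_lower_bound(G, Q, batch=32, z_dim=62, n_cat=10):
    c = torch.randint(0, n_cat, (batch,))              # c ~ Cat(K=10, p=0.1)
    c_onehot = F.one_hot(c, n_cat).float()
    z = torch.randn(batch, z_dim)                      # incompressible noise z
    x = G(torch.cat([z, c_onehot], dim=1))             # x ~ G(z, c)
    log_q = F.log_softmax(Q(x), dim=1)                 # auxiliary Q(c|x)
    # E[log Q(c|x)]; H(c) is a constant w.r.t. G and Q, so it is usually dropped.
    return log_q.gather(1, c.unsqueeze(1)).mean()

# In training, -lambda * info_lower_bound(G, Q) is added to the losses of G and Q.
```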
58 |
59 | ## Observations
60 | * The representation learning doesn't seem to be completely
61 | unsupervised, as we need prior information about the latent code to
62 | determine the conditional distribution (e.g. softmax distribution
63 | for categorical labels), but this can just be set to a factored
64 | Gaussian.
65 | * The results seem very convincing as varying each latent code
66 | independently causes changes in digit value, rotation and width,
67 | when changing $c_1$, $c_2$ and $c_3$ respectively.
68 | * However, it is not evident how the network learns such a neat
69 | separation of rotation and width and attributes them independently
70 | to $c_2$ and $c_3$.
71 |
--------------------------------------------------------------------------------
/_reviews/style-transfer-from-non-parallel-text-by-cross-alignment.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | permalink: "reviews/style-transfer-from-non-parallel-text-by-cross-alignment.html"
4 | title: 'Style Transfer from Non-Parallel Text by Cross-Alignment'
5 | ---
6 |
7 | # Style Transfer from Non-Parallel Text by Cross-Alignment
8 | ---
9 |
10 | ## Idea
11 |
12 | The authors aim to perform style transfer on language using non-parallel corpora by separating content from style. They re-align the latent spaces to perform three tasks: sentiment modification, decipherment of word-substitution ciphers, and recovery of word order.
13 |
14 | ## Method
15 |
16 | The authors' method involves learning an encoder that takes a sentence and its original style indicator as input, and maps it to a content representation devoid of style. This representation is then decoded by a style-dependent decoder.
17 |
18 | ### Notation
19 |
20 | * $y \rightarrow$ latent style variable
21 | * $z \rightarrow$ latent content variable
22 | * $x \rightarrow$ data point generated from the conditional distribution
23 | * $P(x\|y,z)$
24 |
25 | ### Formulation
26 |
27 | There are two non-parallel corpora $X_1 = \{x_1^{(1)}, \ldots, x_1^{(n)}\}$, drawn from $p(x_1\|y_1)$, and $X_2 = \{x_2^{(1)}, \ldots, x_2^{(n)}\}$, drawn from $p(x_2\|y_2)$.
28 |
29 | We want to estimate the style-transferred distributions $p(x_1\|x_2; y_1, y_2)$ and $p(x_2\|x_1; y_1, y_2)$.
30 |
31 | The authors propose a constraint that the marginal distributions of $x_1$ and $x_2$ can only be recovered if, for any two different styles $y, y' \in Y$, the distributions $p(x\|y)$ and $p(x\|y')$ are different. This is a fair assumption to make, because if $p(x\|y) = p(x\|y')$, then the style changes would be indiscernible.
32 |
33 | They also prove that if the content $z$ is sampled from a centered isotropic distribution, the styles cannot be recovered from $x$, whereas if $z$ follows a more complex distribution such as a Gaussian mixture, the affine transformation that converts $y, z$ into $x$ can be recovered.
34 |
35 | The reconstruction loss is the same as the one used by an autoencoder:
36 |
37 | $$
38 | \begin{align}
39 | \mathcal{L}_{rec}(\theta_E,\theta_G)
40 | &= \mathbb{E}_{x_1 \sim X_1}[-\log p_G(x_1\|y_1,E(x_1, y_1))] \nonumber \\
41 | &\quad + \mathbb{E}_{x_2 \sim X_2}[-\log p_G(x_2\|y_2,E(x_2, y_2))]
42 | \end{align}
43 | $$
44 |
45 | ### Solution 1: Aligned Autoencoder {#aligned-autoencoder}
46 |
47 | Instead of the KL divergence loss, the authors propose aligning the distributions $P_E(z\|x_1)$ and $P_E(z\|x_2)$ where $E$ is the encoder function. This is done by training an adversarial discriminator to distinguish between the two distributions.
48 |
49 | The adversarial objective is expressed below, where $D(\cdot)$ outputs 1 if it predicts the source distribution to be $X_1$ and 0 if it predicts the source distribution to be $X_2$ (a sketch of this term follows the overall objective below):
50 | $$\begin{align}
51 | \mathcal{L}_{adv}(\theta_E,\theta_D)
52 | &= \mathbb{E}_{x_1 \sim X_1}[-\log D(E(x_1,y_1))] \nonumber \\
53 | &\quad + \mathbb{E}_{x_2 \sim X_2}[-\log(1 - D(E(x_2,y_2)))]
54 | \end{align}$$
55 |
56 | The overall optimization objective combining equations 1 and 2 can be written as
57 |
58 | $$\operatorname*{min}_{E,G} \operatorname*{max}_{D} \mathcal{L}_{rec} - \lambda \mathcal{L}_{adv}$$
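
A rough sketch, under assumed module interfaces, of the adversarial alignment term above: the discriminator $D$ is trained to output 1 on codes from $X_1$ and 0 on codes from $X_2$, while the encoder $E$ is trained against it (the encoder/generator side subtracts $\lambda$ times this term, as in the objective above).

```python
import torch
import torch.nn.functional as F

def adversarial_alignment_loss(E, D, x1, y1, x2, y2):
    z1, z2 = E(x1, y1), E(x2, y2)        # style-free content representations
    d1, d2 = D(z1), D(z2)                # D outputs a probability in (0, 1)
    # -log D(z1) - log(1 - D(z2)): minimized by D, maximized by E.
    return F.binary_cross_entropy(d1, torch.ones_like(d1)) + \
           F.binary_cross_entropy(d2, torch.zeros_like(d2))
```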
59 |
60 | ### Solution 2: Cross-aligned Autoencoder
61 |
62 | This is similar to the previous solution, but instead of trying to align $P_E(z\|x_1)$ and $P_E(z\|x_2)$ with a single adversarial discriminator, two distinct adversarial discriminators are used to align sequences of real and transferred generator hidden states: $D_1$ aligns the distributions of $G(y_1, z_1)$ and $G(y_1, z_2)$, and $D_2$ aligns the distributions of $G(y_2, z_2)$ and $G(y_2, z_1)$. Each discriminator is trained to tell real sequences apart from transferred ones, while the encoder and generator are trained so that the discriminators cannot distinguish the underlying content distributions $P(z_1)$ and $P(z_2)$.
63 |
64 | Professor forcing is used to train both of these discriminators. Professor forcing uses a discriminator to distinguish whether the decoder hidden states result from training-time teacher forcing or from free-running (self-fed) generation, as at test time. This is a generalized version of simply using the final encoder state, as was the case in the Aligned Autoencoder solution.
65 |
66 | The overall optimization objective combining equation 1 and two discriminator versions of equation 2 can be written as:
67 |
68 | $$\operatorname*{min}_{E,G} \operatorname*{max}_{D_1, D_2} \mathcal{L}_{rec} - \lambda (\mathcal{L}_{adv_1} + \mathcal{L}_{adv_2})$$
69 |
70 | ### Learning Process
71 |
72 | 
73 |
74 | ### Experiment Setup
75 |
76 | * As opposed to the simple feed-forward classifier used for $D$ in the aligned autoencoder, $D_1$ and $D_2$ use convolutional nets for [text classification](https://arxiv.org/abs/1408.5882).
77 | * They use Yelp reviews as the data set with rating $>3$ as positive and rating $<3$ as negative examples. Reviews with a sentence count $>10$ and sentences with a word count $>10$ are filtered out. Vocab size used is 10K.
78 | * Style transfer is evaluated using a pre-trained classifier.
79 | * Content preservation was evaluated using human evaluations.
80 |
81 | ## Observations
82 |
83 | * Despite the corpora being non-parallel, the content of both corpora is mostly homogeneous.
84 | * The authors cite, as the reason for not using VAEs for this task, the utility of having rich and unperturbed representations; VAEs do not provide these, because the ELBO objective forces the latent representation to be consistent with a prior distribution.
85 | * The sentiment transfer model succeeds in retaining content 41.5% of the time.
86 | * The model described in the ['Toward Controlled Generation' paper](https://arxiv.org/abs/1703.00955) performed better on the sentiment style-transfer task. The authors attribute this to the fact that its loss objective is directly parameterized by a sentiment classifier. Although the authors claim that their own model's overall transfer quality is better, that metric is obtained from human evaluations and the difference is marginal.
87 | * Amongst the different models, the cross-aligned autoencoder with one discriminator per style performs the best on all tasks.
88 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | CC0 1.0 Universal
2 |
3 | Statement of Purpose
4 |
5 | The laws of most jurisdictions throughout the world automatically confer
6 | exclusive Copyright and Related Rights (defined below) upon the creator and
7 | subsequent owner(s) (each and all, an "owner") of an original work of
8 | authorship and/or a database (each, a "Work").
9 |
10 | Certain owners wish to permanently relinquish those rights to a Work for the
11 | purpose of contributing to a commons of creative, cultural and scientific
12 | works ("Commons") that the public can reliably and without fear of later
13 | claims of infringement build upon, modify, incorporate in other works, reuse
14 | and redistribute as freely as possible in any form whatsoever and for any
15 | purposes, including without limitation commercial purposes. These owners may
16 | contribute to the Commons to promote the ideal of a free culture and the
17 | further production of creative, cultural and scientific works, or to gain
18 | reputation or greater distribution for their Work in part through the use and
19 | efforts of others.
20 |
21 | For these and/or other purposes and motivations, and without any expectation
22 | of additional consideration or compensation, the person associating CC0 with a
23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
25 | and publicly distribute the Work under its terms, with knowledge of his or her
26 | Copyright and Related Rights in the Work and the meaning and intended legal
27 | effect of CC0 on those rights.
28 |
29 | 1. Copyright and Related Rights. A Work made available under CC0 may be
30 | protected by copyright and related or neighboring rights ("Copyright and
31 | Related Rights"). Copyright and Related Rights include, but are not limited
32 | to, the following:
33 |
34 | i. the right to reproduce, adapt, distribute, perform, display, communicate,
35 | and translate a Work;
36 |
37 | ii. moral rights retained by the original author(s) and/or performer(s);
38 |
39 | iii. publicity and privacy rights pertaining to a person's image or likeness
40 | depicted in a Work;
41 |
42 | iv. rights protecting against unfair competition in regards to a Work,
43 | subject to the limitations in paragraph 4(a), below;
44 |
45 | v. rights protecting the extraction, dissemination, use and reuse of data in
46 | a Work;
47 |
48 | vi. database rights (such as those arising under Directive 96/9/EC of the
49 | European Parliament and of the Council of 11 March 1996 on the legal
50 | protection of databases, and under any national implementation thereof,
51 | including any amended or successor version of such directive); and
52 |
53 | vii. other similar, equivalent or corresponding rights throughout the world
54 | based on applicable law or treaty, and any national implementations thereof.
55 |
56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of,
57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
59 | and Related Rights and associated claims and causes of action, whether now
60 | known or unknown (including existing as well as future claims and causes of
61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum
62 | duration provided by applicable law or treaty (including future time
63 | extensions), (iii) in any current or future medium and for any number of
64 | copies, and (iv) for any purpose whatsoever, including without limitation
65 | commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
66 | the Waiver for the benefit of each member of the public at large and to the
67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver
68 | shall not be subject to revocation, rescission, cancellation, termination, or
69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work
70 | by the public as contemplated by Affirmer's express Statement of Purpose.
71 |
72 | 3. Public License Fallback. Should any part of the Waiver for any reason be
73 | judged legally invalid or ineffective under applicable law, then the Waiver
74 | shall be preserved to the maximum extent permitted taking into account
75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
76 | is so judged Affirmer hereby grants to each affected person a royalty-free,
77 | non transferable, non sublicensable, non exclusive, irrevocable and
78 | unconditional license to exercise Affirmer's Copyright and Related Rights in
79 | the Work (i) in all territories worldwide, (ii) for the maximum duration
80 | provided by applicable law or treaty (including future time extensions), (iii)
81 | in any current or future medium and for any number of copies, and (iv) for any
82 | purpose whatsoever, including without limitation commercial, advertising or
83 | promotional purposes (the "License"). The License shall be deemed effective as
84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the
85 | License for any reason be judged legally invalid or ineffective under
86 | applicable law, such partial invalidity or ineffectiveness shall not
87 | invalidate the remainder of the License, and in such case Affirmer hereby
88 | affirms that he or she will not (i) exercise any of his or her remaining
89 | Copyright and Related Rights in the Work or (ii) assert any associated claims
90 | and causes of action with respect to the Work, in either case contrary to
91 | Affirmer's express Statement of Purpose.
92 |
93 | 4. Limitations and Disclaimers.
94 |
95 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
96 | surrendered, licensed or otherwise affected by this document.
97 |
98 | b. Affirmer offers the Work as-is and makes no representations or warranties
99 | of any kind concerning the Work, express, implied, statutory or otherwise,
100 | including without limitation warranties of title, merchantability, fitness
101 | for a particular purpose, non infringement, or the absence of latent or
102 | other defects, accuracy, or the present or absence of errors, whether or not
103 | discoverable, all to the greatest extent permissible under applicable law.
104 |
105 | c. Affirmer disclaims responsibility for clearing rights of other persons
106 | that may apply to the Work or any use thereof, including without limitation
107 | any person's Copyright and Related Rights in the Work. Further, Affirmer
108 | disclaims responsibility for obtaining any necessary consents, permissions
109 | or other rights required for any use of the Work.
110 |
111 | d. Affirmer understands and acknowledges that Creative Commons is not a
112 | party to this document and has no duty or obligation with respect to this
113 | CC0 or use of the Work.
114 |
115 | For more information, please see
116 |
117 |
--------------------------------------------------------------------------------
/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | layout: default
3 | ---
4 |
5 | `2003-10` - [Paper](http://www.jmlr.org/papers/v3/bengio03a.html) - [Notes](reviews/a-neural-probabilistic-language-model.html) - A Neural Probabilistic Language Model
6 |
7 | `2011-08` - [Paper](https://arxiv.org/abs/1103.0398) - [Notes](reviews/natural-language-processing-almost-from-scratch.html) - Natural Language Processing almost from Scratch
8 |
9 | `2014-10` - [Paper](https://arxiv.org/abs/1406.5298) - [Notes](reviews/semi-supervised-learning-with-deep-generative-models.html) - Semi-Supervised Learning with Deep Generative Models
10 |
11 | `2015-06` - [Paper](https://arxiv.org/abs/1506.01057) - [Notes](reviews/a-hierarchical-neural-autoencoder-for-paragraphs-and-documents.html) - A Hierarchical Neural Autoencoder for Paragraphs and Documents
12 |
13 | `2015-11` - [Paper](https://arxiv.org/abs/1511.06349) - [Notes](reviews/generating-sentences-from-a-continuous-space.html) - Generating Sentences from a Continuous Space
14 |
15 | `2016-06` - [Paper](https://arxiv.org/abs/1606.03498) - [Notes](reviews/improved-techniques-for-training-gans.html) - Improved Techniques for Training GANs
16 |
17 | `2016-06` - [Paper](https://arxiv.org/abs/1606.03657) - [Notes](reviews/infogan-interpretable-representation-learning-by-information-maximizing-generative-adversarial-nets.html) - InfoGAN: Interpretable Representation Learning by Information Maximizing Generative Adversarial Nets
18 |
19 | `2016-06` - [Paper](https://aclanthology.info/papers/N16-1005/n16-1005) - [Notes](reviews/controlling-politeness-in-neural-machine-translation-via-side-constraints.html) - Controlling Politeness in Neural Machine Translation via Side Constraints
20 |
21 | `2017-02` - [Paper](https://arxiv.org/abs/1702.08139) - [Notes](reviews/improved-variational-autoencoders-for-text-modeling-using-dilated-convolutions.html) - Improved Variational Autoencoders for Text Modeling using Dilated Convolutions
22 |
23 | `2017-02` - [Paper](https://arxiv.org/abs/1702.01806) - [Notes](reviews/beam-search-strategies-for-neural-machine-translation.html) - Beam Search Strategies for Neural Machine Translation
24 |
25 | `2017-04` - [Paper](https://arxiv.org/abs/1704.01444) - [Notes](reviews/learning-to-generate-reviews-and-discovering-sentiment.html) - Learning to Generate Reviews and Discovering Sentiment
26 |
27 | `2017-05` - [Paper](https://arxiv.org/abs/1705.09655) - [Notes](reviews/style-transfer-from-non-parallel-text-by-cross-alignment.html) - Style Transfer from Non-Parallel Text by Cross-Alignment
28 |
29 | `2017-06` - [Paper](https://arxiv.org/abs/1706.02262) - [Notes](reviews/infovae-information-maximizing-variational-autoencoders.html) - InfoVAE: Information Maximizing Variational Autoencoders
30 |
31 | `2018-01` - [Paper](http://proceedings.mlr.press/v70/mueller17a.html) - [Notes](reviews/sequence-to-better-sequence-continuous-revision-of-combinatorial-structures.html) - Sequence to Better Sequence: Continuous Revision of Combinatorial Structures
32 |
33 | `2017-06` - [Paper](https://arxiv.org/abs/1705.10929) - [Notes](reviews/adversarial-generation-of-natural-language.html) - Adversarial Generation of Natural Language
34 |
35 | `2017-06` - [Paper](https://arxiv.org/abs/1706.04223) - [Notes](reviews/adversarially-regularized-autoencoders.html) - Adversarially Regularized Autoencoders
36 |
37 | `2017-06` - [Paper](https://arxiv.org/abs/1706.00409) - [Notes](reviews/fader-networks-manipulating-images-by-sliding-attributes.html) - Fader Networks: Manipulating Images by Sliding Attributes
38 |
39 | `2017-07` - [Paper](https://arxiv.org/abs/1703.00955) - [Notes](reviews/toward-controlled-generation-of-text.html) - Toward Controlled Generation of Text
40 |
41 | `2017-07` - [Paper](https://arxiv.org/abs/1707.02633) - [Notes](reviews/controlling-linguistic-style-aspects-in-neural-language-generation.html) - Controlling Linguistic Style Aspects in Neural Language Generation
42 |
43 | `2017-08` - [Paper](https://arxiv.org/abs/1704.01691) - [Notes](reviews/multispace-variational-encoderdecoders-for-semisupervised-labeled-sequence-transduction.html) - Multi-space Variational Encoder-Decoders for Semi-supervised Labeled Sequence Transduction
44 |
45 | `2017-09` - [Paper](https://arxiv.org/abs/1705.02364) - [Notes](reviews/supervised-learning-of-universal-sentence-representations-from-natural-language-inference-data.html) - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data
46 |
47 | `2017-09` - [Paper](https://arxiv.org/abs/1701.06547) - [Notes](reviews/adversarial-learning-for-neural-dialogue-generation.html) - Adversarial Learning for Neural Dialogue Generation
48 |
49 | `2017-09` - [Paper](https://arxiv.org/abs/1709.08878) - [Notes](reviews/generating-sentences-by-editing-prototypes.html) - Generating Sentences by Editing Prototypes
50 |
51 | `2017-10` - [Paper](https://arxiv.org/abs/1710.04087) - [Notes](reviews/word-translation-without-parallel-data.html) - Word Translation Without Parallel Data
52 |
53 | `2017-10` - [Paper](https://arxiv.org/abs/1711.00043) - [Notes](reviews/unsupervised-machine-translation-using-monolingual-corpora-only.html) - Unsupervised Machine Translation Using Monolingual Corpora Only
54 |
55 | `2017-11` - [Paper](https://arxiv.org/abs/1711.06861) - [Notes](reviews/style-transfer-in-text-exploration-and-evaluation.html) - Style Transfer in Text: Exploration and Evaluation
56 |
57 | `2017-11` - [Paper](https://arxiv.org/abs/1711.01558) - [Notes](reviews/wasserstein-autoencoders.html) - Wasserstein Auto-Encoders
58 |
59 | `2017-12` - [Paper](https://arxiv.org/abs/1712.10066) - [Notes](reviews/disentangled-representations-for-manipulation-of-sentiment-in-text.html) - Disentangled Representations for Manipulation of Sentiment in Text
60 |
61 | `2017-12` - [Paper](https://arxiv.org/abs/1711.09395) - [Notes](reviews/improved-neural-text-attribute-transfer-with-non-parallel-data.html) - Improved Neural Text Attribute Transfer with Non-parallel Data
62 |
63 | `2017-12` - [Paper](https://arxiv.org/abs/1711.09163) - [Notes](reviews/jade-joint-autoencoders-for-disentanglement.html) - JADE: Joint Autoencoders for Dis-Entanglement
64 |
65 | `2018-01` - [Paper](https://arxiv.org/abs/1712.09025) - [Notes](reviews/domain-adaptation-meets-disentangled-representation-learning-and-style-transfer.html) - Domain Adaptation Meets Disentangled Representation Learning and Style Transfer
66 |
67 | `2018-01` - [Paper](https://arxiv.org/abs/1801.06024) - [Notes](reviews/natural-language-multitasking-analyzing-and-improving-syntactic-saliency-of-hidden-representations.html) - Natural Language Multitasking: Analyzing and Improving Syntactic Saliency of Hidden Representations
68 |
69 | `2018-04` - [Paper](https://arxiv.org/abs/1804.09000) - [Notes](reviews/style-transfer-through-back-translation.html) - Style Transfer Through Back-Translation
70 |
71 | `2018-06` - [Paper](https://arxiv.org/abs/1806.01483) - [Notes](reviews/jtav-jointly-learning-social-media-content-representation-by-fusing-textual-acoustic-and-visual-features.html) - JTAV: Jointly Learning Social Media Content Representation by Fusing Textual, Acoustic, and Visual Features
72 |
73 | `2018-08` - [Paper](https://arxiv.org/abs/1706.07639) - [Notes](reviews/causal-embeddings-for-recommendation.html) - Causal Embeddings for Recommendation
74 |
75 |
76 |
--------------------------------------------------------------------------------
/_sass/jekyll-theme-slate.scss:
--------------------------------------------------------------------------------
1 | @import "rouge-github";
2 |
3 | /*******************************************************************************
4 | MeyerWeb Reset
5 | *******************************************************************************/
6 |
7 | html, body, div, span, applet, object, iframe,
8 | h1, h2, h3, h4, h5, h6, p, blockquote, pre,
9 | a, abbr, acronym, address, big, cite, code,
10 | del, dfn, em, img, ins, kbd, q, s, samp,
11 | small, strike, strong, sub, sup, tt, var,
12 | b, u, i, center,
13 | dl, dt, dd, ol, ul, li,
14 | fieldset, form, label, legend,
15 | table, caption, tbody, tfoot, thead, tr, th, td,
16 | article, aside, canvas, details, embed,
17 | figure, figcaption, footer, header, hgroup,
18 | menu, nav, output, ruby, section, summary,
19 | time, mark, audio, video {
20 | margin: 0;
21 | padding: 0;
22 | border: 0;
23 | font: inherit;
24 | vertical-align: baseline;
25 | }
26 |
27 | /* HTML5 display-role reset for older browsers */
28 | article, aside, details, figcaption, figure,
29 | footer, header, hgroup, menu, nav, section {
30 | display: block;
31 | }
32 |
33 | ol, ul {
34 | list-style: none;
35 | }
36 |
37 | table {
38 | border-collapse: collapse;
39 | border-spacing: 0;
40 | }
41 |
42 | /*******************************************************************************
43 | Theme Styles
44 | *******************************************************************************/
45 |
46 | body {
47 | box-sizing: border-box;
48 | color:#373737;
49 | background: #212121;
50 | font-size: 16px;
51 | font-family: 'Myriad Pro', Calibri, Helvetica, Arial, sans-serif;
52 | line-height: 1.5;
53 | -webkit-font-smoothing: antialiased;
54 | }
55 |
56 | h1, h2, h3, h4, h5, h6 {
57 | margin: 10px 0;
58 | font-weight: 700;
59 | color:#222222;
60 | font-family: 'Lucida Grande', 'Calibri', Helvetica, Arial, sans-serif;
61 | letter-spacing: -1px;
62 | }
63 |
64 | h1 {
65 | font-size: 36px;
66 | font-weight: 700;
67 | }
68 |
69 | h2 {
70 | padding-bottom: 10px;
71 | font-size: 32px;
72 | background: url('../images/bg_hr.png') repeat-x bottom;
73 | }
74 |
75 | h3 {
76 | font-size: 24px;
77 | }
78 |
79 | h4 {
80 | font-size: 21px;
81 | }
82 |
83 | h5 {
84 | font-size: 18px;
85 | }
86 |
87 | h6 {
88 | font-size: 16px;
89 | }
90 |
91 | p {
92 | margin: 10px 0 15px 0;
93 | }
94 |
95 | footer p {
96 | color: #f2f2f2;
97 | }
98 |
99 | a {
100 | text-decoration: none;
101 | color: #0F79D0;
102 | text-shadow: none;
103 |
104 | transition: color 0.5s ease;
105 | transition: text-shadow 0.5s ease;
106 | -webkit-transition: color 0.5s ease;
107 | -webkit-transition: text-shadow 0.5s ease;
108 | -moz-transition: color 0.5s ease;
109 | -moz-transition: text-shadow 0.5s ease;
110 | -o-transition: color 0.5s ease;
111 | -o-transition: text-shadow 0.5s ease;
112 | -ms-transition: color 0.5s ease;
113 | -ms-transition: text-shadow 0.5s ease;
114 | }
115 |
116 | a:hover, a:focus {
117 | text-decoration: underline;
118 | }
119 |
120 | footer a {
121 | color: #F2F2F2;
122 | text-decoration: underline;
123 | }
124 |
125 | em, cite {
126 | font-style: italic;
127 | }
128 |
129 | strong {
130 | font-weight: bold;
131 | }
132 |
133 | img {
134 | position: relative;
135 | margin: 0 auto;
136 | max-width: 739px;
137 | padding: 5px;
138 | margin: 10px 0 10px 0;
139 | border: 1px solid #ebebeb;
140 |
141 | box-shadow: 0 0 5px #ebebeb;
142 | -webkit-box-shadow: 0 0 5px #ebebeb;
143 | -moz-box-shadow: 0 0 5px #ebebeb;
144 | -o-box-shadow: 0 0 5px #ebebeb;
145 | -ms-box-shadow: 0 0 5px #ebebeb;
146 | }
147 |
148 | p img {
149 | display: inline;
150 | margin: 0;
151 | padding: 0;
152 | vertical-align: middle;
153 | text-align: center;
154 | border: none;
155 | }
156 |
157 | pre, code {
158 | color: #222;
159 | background-color: #fff;
160 |
161 | font-family: Monaco, "Bitstream Vera Sans Mono", "Lucida Console", Terminal, monospace;
162 | font-size: 14px;
163 |
164 | border-radius: 2px;
165 | -moz-border-radius: 2px;
166 | -webkit-border-radius: 2px;
167 | }
168 |
169 | pre {
170 | padding: 10px;
171 | box-shadow: 0 0 10px rgba(0,0,0,.1);
172 | overflow: auto;
173 | }
174 |
175 | code {
176 | padding: 3px;
177 | margin: 0 3px;
178 | box-shadow: 0 0 10px rgba(0,0,0,.1);
179 | }
180 |
181 | pre code {
182 | display: block;
183 | box-shadow: none;
184 | }
185 |
186 | blockquote {
187 | color: #666;
188 | margin-bottom: 20px;
189 | padding: 0 0 0 20px;
190 | border-left: 3px solid #bbb;
191 | }
192 |
193 |
194 | ul, ol, dl {
195 | margin-bottom: 15px
196 | }
197 |
198 | ul {
199 | list-style-position: inside;
200 | list-style: disc;
201 | padding-left: 20px;
202 | }
203 |
204 | ol {
205 | list-style-position: inside;
206 | list-style: decimal;
207 | padding-left: 20px;
208 | }
209 |
210 | dl dt {
211 | font-weight: bold;
212 | }
213 |
214 | dl dd {
215 | padding-left: 20px;
216 | font-style: italic;
217 | }
218 |
219 | dl p {
220 | padding-left: 20px;
221 | font-style: italic;
222 | }
223 |
224 | hr {
225 | height: 1px;
226 | margin-bottom: 5px;
227 | border: none;
228 | background: url('../images/bg_hr.png') repeat-x center;
229 | }
230 |
231 | table {
232 | border: 1px solid #373737;
233 | margin-bottom: 20px;
234 | text-align: left;
235 | }
236 |
237 | th {
238 | font-family: 'Lucida Grande', 'Helvetica Neue', Helvetica, Arial, sans-serif;
239 | padding: 10px;
240 | background: #373737;
241 | color: #fff;
242 | }
243 |
244 | td {
245 | padding: 10px;
246 | border: 1px solid #373737;
247 | }
248 |
249 | form {
250 | background: #f2f2f2;
251 | padding: 20px;
252 | }
253 |
254 | /*******************************************************************************
255 | Full-Width Styles
256 | *******************************************************************************/
257 |
258 | .outer {
259 | width: 100%;
260 | }
261 |
262 | .inner {
263 | position: relative;
264 | max-width: 80%;
265 | padding: 20px 10px;
266 | margin: 0 auto;
267 | }
268 |
269 | #forkme_banner {
270 | display: block;
271 | position: absolute;
272 | top:0;
273 | right: 10px;
274 | z-index: 10;
275 | padding: 10px 50px 10px 10px;
276 | color: #fff;
277 | background: url('../images/blacktocat.png') #0090ff no-repeat 95% 50%;
278 | font-weight: 700;
279 | box-shadow: 0 0 10px rgba(0,0,0,.5);
280 | border-bottom-left-radius: 2px;
281 | border-bottom-right-radius: 2px;
282 | }
283 |
284 | #header_wrap {
285 | background: #212121;
286 | background: -moz-linear-gradient(top, #373737, #212121);
287 | background: -webkit-linear-gradient(top, #373737, #212121);
288 | background: -ms-linear-gradient(top, #373737, #212121);
289 | background: -o-linear-gradient(top, #373737, #212121);
290 | background: linear-gradient(top, #373737, #212121);
291 | }
292 |
293 | #header_wrap .inner {
294 | padding: 50px 10px 30px 10px;
295 | }
296 |
297 | #project_title {
298 | margin: 0;
299 | color: #fff;
300 | font-size: 42px;
301 | font-weight: 700;
302 | text-shadow: #111 0px 0px 10px;
303 | }
304 |
305 | #project_tagline {
306 | color: #fff;
307 | font-size: 24px;
308 | font-weight: 300;
309 | background: none;
310 | text-shadow: #111 0px 0px 10px;
311 | }
312 |
313 | #downloads {
314 | position: absolute;
315 | width: 210px;
316 | z-index: 10;
317 | bottom: -40px;
318 | right: 0;
319 | height: 70px;
320 | background: url('../images/icon_download.png') no-repeat 0% 90%;
321 | }
322 |
323 | .zip_download_link {
324 | display: block;
325 | float: right;
326 | width: 90px;
327 | height:70px;
328 | text-indent: -5000px;
329 | overflow: hidden;
330 | background: url(../images/sprite_download.png) no-repeat bottom left;
331 | }
332 |
333 | .tar_download_link {
334 | display: block;
335 | float: right;
336 | width: 90px;
337 | height:70px;
338 | text-indent: -5000px;
339 | overflow: hidden;
340 | background: url(../images/sprite_download.png) no-repeat bottom right;
341 | margin-left: 10px;
342 | }
343 |
344 | .zip_download_link:hover {
345 | background: url(../images/sprite_download.png) no-repeat top left;
346 | }
347 |
348 | .tar_download_link:hover {
349 | background: url(../images/sprite_download.png) no-repeat top right;
350 | }
351 |
352 | #main_content_wrap {
353 | background: #ffffff;
354 | border-top: 1px solid #111;
355 | border-bottom: 1px solid #111;
356 | }
357 |
358 | #main_content {
359 | padding-top: 40px;
360 | }
361 |
362 | #footer_wrap {
363 | background: #212121;
364 | }
365 |
366 |
367 |
368 | /*******************************************************************************
369 | Small Device Styles
370 | *******************************************************************************/
371 |
372 | @media screen and (max-width: 992px) {
373 | img {
374 | max-width: 100%;
375 | }
376 | }
377 |
378 | @media screen and (max-width: 480px) {
379 | body {
380 | font-size:14px;
381 | }
382 |
383 | #downloads {
384 | display: none;
385 | }
386 |
387 | .inner {
388 | min-width: 320px;
389 | max-width: 480px;
390 | }
391 |
392 | #project_title {
393 | font-size: 32px;
394 | }
395 |
396 | h1 {
397 | font-size: 28px;
398 | }
399 |
400 | h2 {
401 | font-size: 24px;
402 | }
403 |
404 | h3 {
405 | font-size: 21px;
406 | }
407 |
408 | h4 {
409 | font-size: 18px;
410 | }
411 |
412 | h5 {
413 | font-size: 14px;
414 | }
415 |
416 | h6 {
417 | font-size: 12px;
418 | }
419 |
420 | code, pre {
421 | font-size: 11px;
422 | }
423 |
424 | }
425 |
426 | @media screen and (max-width: 320px) {
427 | body {
428 | font-size:14px;
429 | }
430 |
431 | #downloads {
432 | display: none;
433 | }
434 |
435 | .inner {
436 | min-width: 240px;
437 | max-width: 320px;
438 | }
439 |
440 | #project_title {
441 | font-size: 28px;
442 | }
443 |
444 | h1 {
445 | font-size: 24px;
446 | }
447 |
448 | h2 {
449 | font-size: 21px;
450 | }
451 |
452 | h3 {
453 | font-size: 18px;
454 | }
455 |
456 | h4 {
457 | font-size: 16px;
458 | }
459 |
460 | h5 {
461 | font-size: 14px;
462 | }
463 |
464 | h6 {
465 | font-size: 12px;
466 | }
467 |
468 | code, pre {
469 | min-width: 240px;
470 | max-width: 320px;
471 | font-size: 11px;
472 | }
473 |
474 | }
475 |
--------------------------------------------------------------------------------