├── data ├── __init__.py ├── silhouettes │ ├── caltech101_silhouettes_28.mat │ └── caltech101_silhouettes_28_split1.mat ├── import_data_alphadigs.py ├── import_data_mnist.py └── data_preprocessing.py ├── models ├── __init__.py ├── autoencoder.py ├── loss_functions.py ├── vrmax.py ├── iwae.py └── vae.py ├── __init__.py ├── network ├── __init__.py ├── classifier.py ├── deterministic_layer.py ├── network.py └── stochastic_layer.py ├── prior ├── __init__.py ├── GMM.py ├── gaussian.py └── swiss_roll.py ├── visualization ├── samples.py ├── __init__.py ├── reconstruction.py └── utils.py ├── load_save.py ├── init_models.py ├── README.md └── exp.py /data/__init__.py: -------------------------------------------------------------------------------- 1 | import data_preprocessing 2 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import loss_functions 2 | import vae 3 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | import network 2 | import models 3 | import data 4 | import visualization 5 | import prior 6 | -------------------------------------------------------------------------------- /network/__init__.py: -------------------------------------------------------------------------------- 1 | import stochastic_layer 2 | import deterministic_layer 3 | import network 4 | import classifier 5 | -------------------------------------------------------------------------------- /data/silhouettes/caltech101_silhouettes_28.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YingzhenLi/vae_renyi_divergence/HEAD/data/silhouettes/caltech101_silhouettes_28.mat -------------------------------------------------------------------------------- /data/silhouettes/caltech101_silhouettes_28_split1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YingzhenLi/vae_renyi_divergence/HEAD/data/silhouettes/caltech101_silhouettes_28_split1.mat -------------------------------------------------------------------------------- /prior/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | class Prior(object): 3 | 4 | def sample(self, num_samples): 5 | raise NotImplementedError() 6 | 7 | def get_name(self): 8 | raise NotImplementedError() 9 | -------------------------------------------------------------------------------- /visualization/samples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from __init__ import plot_images 4 | 5 | def plot_samples(sess, shape, prior, decoder): 6 | """ 7 | Plot the reconstruction of data. 
8 | """ 9 | z = prior.sample(100) 10 | x = decoder.encode(z, sampling = False) 11 | 12 | samples = sess.run(x) 13 | plot_images(samples, shape, '', 'samples') 14 | -------------------------------------------------------------------------------- /visualization/__init__.py: -------------------------------------------------------------------------------- 1 | import reconstruction 2 | import utils 3 | 4 | def plot_images(images, shape, path, filename): 5 | # finally save to file 6 | import matplotlib 7 | matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | 10 | images = utils.reshape_and_tile_images(images, shape) 11 | plt.imshow(images, cmap='Greys') 12 | plt.axis('off') 13 | plt.savefig(path + filename + ".svg", format="svg") 14 | plt.close() 15 | -------------------------------------------------------------------------------- /visualization/reconstruction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from __init__ import plot_images 4 | 5 | def plot_recon(sess, x, shape, encoder, decoder): 6 | """ 7 | Plot the reconstruction of data. 8 | """ 9 | input = tf.placeholder(tf.float32, shape=x.shape) 10 | z = encoder.encode(input, sampling = False) 11 | x_recon = decoder.encode(z, sampling = False) 12 | 13 | input_recon = sess.run(x_recon, feed_dict = {input: x}) 14 | plot_images(input_recon, shape, '', 'recon_sample') 15 | -------------------------------------------------------------------------------- /visualization/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | """ 5 | Function of drawing images, copied from Theano Tutorial. 6 | """ 7 | 8 | def reshape_and_tile_images(array, shape=(28, 28), n_cols=None): 9 | if n_cols is None: 10 | n_cols = int(math.sqrt(array.shape[0])) 11 | n_rows = int(math.ceil(float(array.shape[0])/n_cols)) 12 | 13 | def cell(i, j): 14 | ind = i*n_cols+j 15 | if i*n_cols+j < array.shape[0]: 16 | return array[ind].reshape(*shape, order='C') 17 | else: 18 | return np.zeros(shape) 19 | 20 | def row(i): 21 | return np.concatenate([cell(i, j) for j in range(n_cols)], axis=1) 22 | 23 | return np.concatenate([row(i) for i in range(n_rows)], axis=0) 24 | -------------------------------------------------------------------------------- /prior/GMM.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from __init__ import Prior 4 | 5 | class GMM_diag(Prior): 6 | 7 | def __init__(self, size, Mu_list, Sigma_list, weights = None): 8 | self.size = size 9 | self.Mu_list = Mu_list 10 | self.Sigma_list = Sigma_list 11 | self.num_mixture = Mu_list.shape[0] 12 | if weights is None: 13 | weights = np.ones(self.num_mixture) 14 | self.weights = weights / np.sum(weights) 15 | 16 | def sample(self, num_samples): 17 | # first select the mixture 18 | i = np.random.choice(self.num_mixture, num_samples, p = self.weights) 19 | eps = tf.random_normal([num_samples, self.size]) 20 | output = self.Mu_list[i] + eps * self.Sigma_list[i] 21 | return output 22 | 23 | def get_name(self): 24 | return 'prior_GMM_diag' 25 | 26 | -------------------------------------------------------------------------------- /network/classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from deterministic_layer import Deterministic_Layer 4 | 5 | def 
construct_classifier(layer_sizes, activation='softplus'): 6 | """ 7 | Construct the stochastic layer. 8 | """ 9 | D_layers = [] 10 | for l in xrange(len(layer_sizes) - 1): 11 | if l < len(layer_sizes) - 2: 12 | func = 'relu' 13 | else: 14 | func = activation 15 | D_layers.append(Deterministic_Layer(layer_sizes[l], layer_sizes[l+1], func)) 16 | 17 | classifier = Classifier(D_layers) 18 | return classifier 19 | 20 | class Classifier(object): 21 | 22 | def __init__(self, D_layers): 23 | self.D_layers = D_layers 24 | self.params = [] 25 | for layer in self.D_layers: 26 | self.params = self.params + layer.params 27 | 28 | def encode(self, input): 29 | output = input 30 | for layer in self.D_layers: 31 | output = layer.encode(output) 32 | return output 33 | 34 | def get_name(self): 35 | return 'classifier' 36 | 37 | -------------------------------------------------------------------------------- /prior/gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from __init__ import Prior 4 | 5 | class Gaussian_diag(Prior): 6 | 7 | def __init__(self, size, Mu, Sigma): 8 | self.size = size 9 | self.Mu = Mu 10 | self.Sigma = Sigma 11 | 12 | def sample(self, num_samples): 13 | eps = tf.random_normal([num_samples, self.size]) 14 | output = self.Mu + eps * self.Sigma 15 | return output 16 | 17 | def update(self, samples): 18 | # update the parameters by matching empirical moments 19 | mean, var = tf.nn.moments(samples, axes=[0]) 20 | self.Mu = mean 21 | self.Sigma = tf.sqrt(var) 22 | 23 | def get_name(self): 24 | return 'prior_gaussian_diag' 25 | 26 | class Gaussian_full(Prior): 27 | 28 | def __init__(self, Mu, Sigma): 29 | self.Mu = Mu 30 | self.Sigma = Sigma 31 | 32 | def sample(self, num_samples): 33 | eps = tf.random_normal([num_samples, self.size]) 34 | output = self.Mu + tf.matmul(eps, self.Sigma) 35 | return output 36 | 37 | def get_name(self): 38 | return 'prior_gaussian_full' 39 | 40 | -------------------------------------------------------------------------------- /data/import_data_alphadigs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.io import loadmat 3 | 4 | def read(path, num_per_digit_train = 10, SEED = 0): 5 | 6 | # load data 7 | mat = loadmat(path + 'binaryalphadigs.mat') 8 | img = mat['dat'] 9 | lbl = mat['classlabels'] 10 | 11 | num_class = 36 12 | num_per_digit_train = min(num_per_digit_train, 39) 13 | num_per_digit_test = 39 - num_per_digit_train 14 | num_data_train = num_per_digit_train * num_class 15 | num_data_test = num_per_digit_test * num_class 16 | 17 | rows = 20; cols = 16 18 | img_train = np.zeros([rows*cols, num_data_train]) 19 | lbl_train = np.zeros([num_class, num_data_train]) 20 | img_test = np.zeros([rows*cols, num_data_test]) 21 | lbl_test = np.zeros([num_class, num_data_test]) 22 | 23 | np.random.seed(SEED) 24 | for j in xrange(num_class): 25 | ind = np.random.permutation(range(39)) 26 | for k in xrange(num_per_digit_train): 27 | img_train[:, k * num_class + j] = np.ravel(img[j, ind[k]]) 28 | lbl_train[j, k * num_class + j] = 1 29 | for k in xrange(num_per_digit_test): 30 | img_test[:, k * num_class + j] = np.ravel(img[j, ind[num_per_digit_train + k]]) 31 | lbl_test[j, k * num_class + j] = 1 32 | 33 | return img_train, lbl_train, img_test, lbl_test 34 | 35 | -------------------------------------------------------------------------------- /prior/swiss_roll.py: 
--------------------------------------------------------------------------------
 1 | import math
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | from __init__ import Prior
 5 | 
 6 | class Swiss_Roll(Prior):
 7 | 
 8 |     def __init__(self, size, center, radius):
 9 |         self.size = size
10 |         self.center = center
11 |         self.radius = radius
12 | 
13 |     def sample(self, num_samples):
14 |         # one uniform per 2-D spiral coordinate pair (cf. the numpy helper below);
15 |         # assumes an even latent size
16 |         unit = tf.sqrt(tf.random_uniform([num_samples, self.size / 2]))
17 |         r = unit * self.radius
18 |         theta = np.pi * 4.0 * unit
19 |         # sketch of the previously missing output: x coordinates in the first
20 |         # half of the dimensions, y coordinates in the second half
21 |         output = self.center + tf.concat(1, [r * tf.cos(theta), r * tf.sin(theta)])
22 |         return output
23 | 
24 |     def get_name(self):
25 |         return 'prior_swiss_roll'
26 | 
27 | def sample_z_from_swiss_roll_distribution(batchsize, z_dim, label_indices, n_labels, gpu=False):
28 |     def sample(label, n_labels):
29 |         uni = np.random.uniform(0.0, 1.0) / float(n_labels) + float(label) / float(n_labels)
30 |         r = math.sqrt(uni) * 3.0
31 |         rad = np.pi * 4.0 * math.sqrt(uni)
32 |         x = r * math.cos(rad)
33 |         y = r * math.sin(rad)
34 |         return np.array([x, y]).reshape((2,))
35 | 
36 |     z = np.zeros((batchsize, z_dim), dtype=np.float32)
37 |     for batch in xrange(batchsize):
38 |         for zi in xrange(z_dim / 2):
39 |             z[batch, zi*2:zi*2+2] = sample(label_indices[batch], n_labels)
40 | 
41 |     z = Variable(z)  # note: Variable / to_gpu follow the chainer API; this standalone helper is not used by the tensorflow code above
42 |     if gpu:
43 |         z.to_gpu()
44 |     return z
45 | 
--------------------------------------------------------------------------------
/load_save.py:
--------------------------------------------------------------------------------
 1 | import sys, os
 2 | import numpy as np
 3 | import tensorflow as tf
 4 | 
 5 | def path_name(dataset, alpha, num_samples, backward_pass, extra_string = None):
 6 | 
 7 |     path = 'ckpts/' + dataset + '/'
 8 |     if backward_pass == 'max':
 9 |         folder_name = 'max_k%d' % num_samples
10 |     else:
11 |         folder_name = 'alpha%.2f_k%d' % (alpha, num_samples)
12 |     if extra_string is not None:
13 |         folder_name += '_' + extra_string
14 | 
15 |     path = path + folder_name + '/'
16 |     return path
17 | 
18 | def save_checkpoint(sess, path, checkpoint=1, var_list = None):
19 |     if not os.path.exists(path):
20 |         os.makedirs(path)
21 |     # save model
22 |     fname = path + 'checkpoint%d.ckpt' % checkpoint
23 |     saver = tf.train.Saver(var_list)
24 |     save_path = saver.save(sess, fname)
25 |     print("Model saved in %s" % save_path)
26 | 
27 | def load_checkpoint(sess, path, checkpoint=1):
28 |     # load model
29 |     try:
30 |         fname = path + 'checkpoint%d.ckpt' % checkpoint
31 |         saver = tf.train.Saver()
32 |         saver.restore(sess, fname)
33 |         print("Model restored from %s" % fname)
34 |     except:
35 |         print "Failed to load model from %s" % fname
36 | 
--------------------------------------------------------------------------------
/models/autoencoder.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | import time
 4 | from scipy.misc import logsumexp
 5 | from network.network import construct_network
 6 | 
 7 | np.random.seed(0)
 8 | tf.set_random_seed(0)
 9 | 
10 | def construct_autoencoder(variables_size, hidden_layers, \
11 |         data_type='real', activation='softplus'):
12 |     """
13 |     Construct an auto-encoder, return both encoder and decoder
14 |     """
15 |     layer_sizes = []
16 |     l = 0
17 | 
18 |     # first construct the encoder
19 |     for d_layers in hidden_layers:
20 |         sizes = [variables_size[l]]
21 |         sizes.extend(d_layers)
22 |         sizes.append(variables_size[l+1])
23 |         layer_sizes.append(sizes)
24 |         l += 1
25 |     encoder = construct_network(layer_sizes, 'gaussian', 'real', activation, 'q')
26 |     print 'q network architecture:', layer_sizes
27 |     print 'prob. 
type of q net:', encoder.get_name() 28 | 29 | # then construct the decoder 30 | layer_sizes = [list(reversed(sizes)) for sizes in layer_sizes] 31 | layer_sizes = list(reversed(layer_sizes)) 32 | decoder = construct_network(layer_sizes, 'gaussian', data_type, activation, 'p') 33 | print 'p network architecture:', layer_sizes 34 | print 'prob. type of p net:', decoder.get_name() 35 | 36 | return encoder, decoder 37 | 38 | -------------------------------------------------------------------------------- /init_models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from models.autoencoder import construct_autoencoder 5 | from network.classifier import construct_classifier 6 | from prior.gaussian import Gaussian_diag 7 | from prior.GMM import GMM_diag 8 | 9 | def init_model(variables_size, hidden_layers, data_type, activation = 'softplus'): 10 | 11 | # first initialise models 12 | encoder, decoder = construct_autoencoder(variables_size, hidden_layers, \ 13 | data_type, activation) 14 | 15 | return [encoder, decoder] 16 | 17 | def init_prior_gaussian(output_size, mu = 0.0, sigma = 1.0): 18 | prior = Gaussian_diag(output_size, mu, sigma) 19 | return prior 20 | 21 | def init_prior_GMM(output_size, mu_list = None, sigma_list = None, weights = None): 22 | if weights is None: 23 | num_mixture = 4 24 | weights = np.ones(num_mixture) 25 | num_mixture = weights.shape[0] 26 | if mu_list is None: 27 | mu_list = np.random.randn(num_mixture, output_size) * 1.0 28 | if sigma_list is None: 29 | sigma_list = np.ones([num_mixture, output_size]) 30 | prior = GMM_diag(output_size, mu_list, sigma_list, weights) 31 | return prior 32 | 33 | def init_classifier(layer_sizes): 34 | classifier = construct_classifier(layer_sizes, 'sigmoid') 35 | return classifier 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /data/import_data_mnist.py: -------------------------------------------------------------------------------- 1 | import os, struct 2 | from array import array 3 | import numpy as np 4 | 5 | def read(path, num_per_digit = 0, dataset = "training", seed = 0, digits = None): 6 | """ 7 | Python function for importing the MNIST data set. 
8 | """ 9 | 10 | if dataset is "training": 11 | fname_img = os.path.join(path, 'train-images-idx3-ubyte') 12 | fname_lbl = os.path.join(path, 'train-labels-idx1-ubyte') 13 | elif dataset is "testing": 14 | fname_img = os.path.join(path, 't10k-images-idx3-ubyte') 15 | fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte') 16 | else: 17 | raise ValueError, "dataset must be 'testing' or 'training'" 18 | 19 | flbl = open(fname_lbl, 'rb') 20 | magic_nr, size = struct.unpack(">II", flbl.read(8)) 21 | lbl = array("b", flbl.read()) 22 | flbl.close() 23 | 24 | fimg = open(fname_img, 'rb') 25 | magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16)) 26 | img = array("B", fimg.read()) 27 | fimg.close() 28 | 29 | if digits is None: 30 | digits = np.arange(10) 31 | num_digits = len(digits) 32 | if dataset == 'training': 33 | num_per_digit = 6000 34 | if dataset == 'testing': 35 | num_per_digit = 1000 36 | if num_per_digit > 0 and num_per_digit < len(img) / (rows * cols) / num_digits: 37 | num_data = num_per_digit * num_digits 38 | else: 39 | num_data = len(img) / (rows * cols) 40 | images = np.zeros([rows*cols, num_data]) 41 | labels = np.zeros([10, num_data]) 42 | for j in xrange(num_digits): 43 | ind = [ k for k in xrange(size) if lbl[k] == digits[j] ] 44 | if len(ind) == 0: 45 | raise ValueError, "invalid digits, should be in range 0-9" 46 | if num_per_digit == 0 or num_per_digit > len(ind): 47 | num_per_digit = len(ind) 48 | if seed is None: 49 | ind = ind[:num_per_digit] 50 | else: 51 | np.random.seed(seed) 52 | ind = np.random.permutation(ind)[:num_per_digit] 53 | for i in xrange(num_per_digit): 54 | #print i, num_per_digit, len(ind), i* num_digits + j, ind[i]*rows*cols, (ind[i]+1)*rows*cols 55 | images[:, i * num_digits + j] = img[ ind[i]*rows*cols : (ind[i]+1)*rows*cols ] 56 | labels[lbl[ind[i]], i * num_digits + j] = 1 57 | 58 | # here each column in images corresponds to a datapoint 59 | return images, labels 60 | -------------------------------------------------------------------------------- /network/deterministic_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | np.random.seed(0) 5 | tf.set_random_seed(0) 6 | 7 | def init_weights(input_size, output_size, constant=1.0, seed=123): 8 | """ Glorot and Bengio, 2010's initialization of network weights""" 9 | scale = constant*np.sqrt(6.0/(input_size + output_size)) 10 | if output_size > 0: 11 | return tf.random_uniform((input_size, output_size), 12 | minval=-scale, maxval=scale, 13 | dtype=tf.float32, seed=seed) 14 | else: 15 | return tf.random_uniform([input_size], 16 | minval=-scale, maxval=scale, 17 | dtype=tf.float32, seed=seed) 18 | 19 | class Deterministic_Layer(object): 20 | def __init__(self, input_size, output_size, activation): 21 | self.input_size = input_size 22 | self.output_size = output_size 23 | # activation function 24 | self.name = activation 25 | if activation == 'softplus': 26 | self._activation = tf.nn.softplus 27 | if activation == 'relu': 28 | self._activation = tf.nn.relu 29 | if activation == 'sigmoid': 30 | self._activation = tf.sigmoid 31 | if activation == 'tanh': 32 | self._activation = tf.tanh 33 | if activation == 'linear': 34 | self._activation = lambda x: x 35 | if activation == 'softmax': 36 | self._activation = tf.nn.softmax 37 | # parameters 38 | W = tf.Variable(init_weights(input_size, output_size)) 39 | b = tf.Variable(tf.zeros([output_size])) 40 | #b = tf.Variable(init_weights(output_size, 0)) 41 
| self.params = [W, b]
42 | 
43 |     def encode(self, input):
44 |         return self._activation(tf.matmul(input, self.params[0]) + self.params[1])
45 | 
46 |     def get_name(self):
47 |         return self.name
48 | 
49 | class MLP(object):
50 |     def __init__(self, D_layers):
51 |         self.D_layers = D_layers
52 |         self.params = []
53 |         for layer in self.D_layers:
54 |             self.params = self.params + layer.params
55 | 
56 |     def encode(self, input):
57 |         output = input
58 |         for layer in self.D_layers:
59 |             output = layer.encode(output)
60 | 
61 |         return output
62 | 
63 |     def get_name(self):
64 |         return 'MLP'
65 | 
66 | 
--------------------------------------------------------------------------------
/network/network.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import tensorflow as tf
 3 | import time
 4 | from scipy.misc import logsumexp
 5 | from stochastic_layer import construct_Stoc_Layer
 6 | from deterministic_layer import Deterministic_Layer, MLP
 7 | 
 8 | np.random.seed(0)
 9 | tf.set_random_seed(0)
10 | 
11 | class Network(object):
12 |     """
13 |     A stochastic network containing several stochastic layers.
14 |     """
15 |     def __init__(self, S_layers):
16 |         self.S_layers = S_layers
17 |         self.params = []
18 |         for layer in self.S_layers:
19 |             self.params = self.params + layer.params
20 | 
21 |     def encode(self, input, sampling):
22 |         output = input
23 |         for layer in self.S_layers:
24 |             output, _ = layer.encode(output, sampling)
25 |         return output
26 | 
27 |     def encode_and_log_prob(self, input, eval_output = None):
28 |         # evaluate on eval_output if provided
29 |         if eval_output is None:
30 |             eval_output = [None for layer in self.S_layers]
31 |         output = input
32 |         output_list = []
33 |         l = 0
34 |         for layer in self.S_layers:
35 |             output, logprob = layer.encode_and_log_prob(output, eval_output[l])
36 |             if eval_output[l] is not None:
37 |                 output = eval_output[l]
38 |             output_list.append(output)
39 |             if l == 0:
40 |                 logprob_total = logprob
41 |             else:
42 |                 logprob_total = logprob_total + logprob
43 |             l += 1
44 |         return output_list, logprob_total  # return the sum over all layers, not just the last one
45 | 
46 |     def get_prob_type(self):
47 |         return self.S_layers[-1].get_prob_type()
48 | 
49 |     def get_name(self):
50 |         return 'stochastic_network'
51 | 
52 | def construct_network(layer_sizes, prob_type = 'gaussian', data_type='real', \
53 |         activation='softplus', prefix = 'p'):
54 |     """
55 |     Construct a stochastic network. 
56 | """ 57 | S_layers = [] 58 | l = 0 59 | for sizes in layer_sizes: 60 | if l == len(layer_sizes) - 1: 61 | if data_type == 'real': prob_type = 'gaussian' 62 | if data_type == 'bool': prob_type = 'bernoulli' 63 | print prefix, l, data_type, prob_type 64 | S_layers.append(construct_Stoc_Layer(sizes, prob_type, activation, prefix)) 65 | l += 1 66 | network = Network(S_layers) 67 | return network 68 | 69 | def construct_mlp(layer_sizes, activation = 'softplus'): 70 | """ 71 | Construct a deterministic network 72 | """ 73 | D_layers = [] 74 | L = len(layer_sizes) - 1 75 | for l in xrange(L): 76 | D_layers.append(Deterministic_Layer(layer_sizes[l], layer_sizes[l+1], activation)) 77 | 78 | mlp = MLP(D_layers) 79 | return mlp 80 | 81 | -------------------------------------------------------------------------------- /models/loss_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from scipy.misc import logsumexp 5 | 6 | np.random.seed(0) 7 | tf.set_random_seed(0) 8 | 9 | def reconstruction_loss(input, encoder, decoder, num_samples): 10 | """ 11 | Compute log p(x|z) and log q(z|x) 12 | """ 13 | # compute log_q 14 | x_rep = tf.tile(input, [num_samples, 1]) 15 | z_list, logq = encoder.encode_and_log_prob(x_rep) 16 | # compute log_p 17 | samples = list(reversed(z_list[:-1])) 18 | samples.append(x_rep) 19 | _, logpxz = decoder.encode_and_log_prob(z_list[-1], eval_output = samples) 20 | 21 | return logpxz, logq, z_list 22 | 23 | def reconstruction_mse_loss(input, encoder, decoder, sampling = False): 24 | z = encoder.encode(input, sampling) 25 | input_recon = decoder.encode(z, sampling) 26 | loss = tf.square(input - input_recon) 27 | return tf.reduce_mean(tf.reduce_sum(loss, 1)), z 28 | 29 | def reconstruction_cross_entropy(input, encoder, decoder, sampling = False): 30 | z = encoder.encode(input, sampling) 31 | input_recon = decoder.encode(z, sampling) 32 | loss = -input * tf.log(tf.clip_by_value(input_recon, 1e-9, 1.0)) \ 33 | - (1.0 - input) * tf.log(tf.clip_by_value(1 - input_recon, 1e-9, 1.0)) 34 | return tf.reduce_mean(tf.reduce_sum(loss, 1)), z 35 | 36 | def log_prior(z, prob_type = 'gaussian'): 37 | if prob_type == 'gaussian': 38 | return log_prior_gaussian(z) 39 | if prob_type == 'bernoulli': 40 | return log_prior_bernoulli(z) 41 | if prob_type == 'bernoulli_sym': 42 | return log_prior_bernoulli_sym(z) 43 | if prob_type == 'softmax': 44 | return log_prior_softmax(z, int(z.get_shape()[0])) 45 | 46 | def log_prior_gaussian(z, Mu = 0.0, Sigma = 1.0): 47 | logprob = -(0.5 * np.log(2 * np.pi) + tf.log(Sigma)) \ 48 | - 0.5 * ((z - Mu) / Sigma) ** 2 49 | return tf.reduce_sum(logprob, 1) 50 | 51 | def log_prior_bernoulli(z, Mu = 0.5): 52 | logprob = z * tf.log(tf.clip_by_value(Mu, 1e-9, 1.0)) \ 53 | + (1 - z) * tf.log(tf.clip_by_value(1 - Mu, 1e-9, 1.0)) 54 | return tf.reduce_sum(logprob, 1) 55 | 56 | def log_prior_bernoulli_sym(z, Mu = 0.5): 57 | a = (z + 1.0) / 2.0 58 | logprob = a * tf.log(tf.clip_by_value(Mu, 1e-9, 1.0)) \ 59 | + (1 - a) * tf.log(tf.clip_by_value(1 - Mu, 1e-9, 1.0)) 60 | return tf.reduce_sum(logprob, 1) 61 | 62 | def log_prior_softmax(z, N = 2.0): 63 | # TODO: implement other logits 64 | logprob = z * tf.log(1.0 / float(N)) 65 | return tf.reduce_sum(logprob, 1) 66 | 67 | def classification_cross_entropy(y_pred, y_data): 68 | loss = -y_data * tf.log(tf.clip_by_value(y_pred, 1e-9, 1.0)) \ 69 | - (1 - y_data) * tf.log(tf.clip_by_value(1 - y_pred, 1e-9, 1.0)) 70 | return 
tf.reduce_mean(loss)
71 | 
72 | def classification_cross_entropy_softmax(y_pred, y_data):
73 |     loss = -y_data * tf.log(tf.clip_by_value(y_pred, 1e-9, 1.0))
74 |     return tf.reduce_mean(tf.reduce_sum(loss, 1))
75 | 
76 | def classification_error_one_hot(y_pred, y_data):
77 |     # assume y_data is a one-hot vector
78 |     # and y_pred contains probabilities
79 |     correct_prediction = tf.equal(tf.argmax(y_data,1), tf.argmax(y_pred,1))
80 |     accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
81 |     return 1.0 - accuracy
82 | 
83 | def adversarial_loss(y_p, y_q):
84 |     loss = -tf.log(tf.clip_by_value(y_p, 1e-9, 1.0)) \
85 |            - tf.log(tf.clip_by_value(1 - y_q, 1e-9, 1.0))
86 |     return tf.reduce_mean(tf.reduce_sum(loss, 1))
87 | 
88 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Renyi divergence variational inference applied to variational auto-encoders
 2 | 
 3 | **Update 2: 14 Sept 2016**
 4 | 
 5 | There are two ways to implement IWAE/VAE with other alpha settings (except
 6 | alpha = 1.0, which gives you the vanilla VAE). One is to actually compute the
 7 | energy as a scalar and let tensorflow work out the rest for you. The previous
 8 | naive implementation (see [vae.py](models/vae.py)) did this. The other follows
 9 | [section 4.2 of the paper](http://arxiv.org/pdf/1602.02311v2.pdf#5), in which
10 | you compute the gradients on **a list of unnormalized log importance weights**,
11 | and form the final gradient by computing the weighted average of them.
12 | My internal-use numpy code used this strategy.
13 | 
14 | So as another quick update I also provide the second strategy implementation
15 | in tensorflow. Please see [iwae.py](models/iwae.py) for details -- just a few
16 | lines of changes. This is of almost the same flavor as
17 | [the theano version](https://github.com/yburda/iwae/blob/master/iwae.py#L142),
18 | except that they treated VAE/IWAE as different cases. In contrast, this tensorflow
19 | code **handles both cases in the same way**, as justified by the paper.
20 | 
21 | If you want to compare both solutions, use --loss vae for the first and
22 | --loss iwae for the second. You can specify --alpha for both cases, and
23 | --loss iwae also supports --alpha 1.0 (VAE).
24 | **Some remarks**: First, the runtime of both is roughly the same (I have only
25 | tested them on my laptop). Second, the produced results might differ by
26 | a few nats. This is due to numerical issues with logsumexp.
27 | 
28 | Unfortunately, implementing VR-max in a similar style to iwae.py still
29 | does not give you a runtime advantage. So I still keep the dirty tryout
30 | [vrmax.py](models/vrmax.py). Will come back to this -- again, stay tuned!
31 | 
32 | ======================================
33 | 
34 | **Update 1: 09 Sept 2016**
35 | 
36 | Recently I found that the previous naive implementation in vae.py does not
37 | give you time savings with the max trick, when compared to my internal-use
38 | numpy version. This is probably because tensorflow/theano does not
39 | automatically recognize that it need not compute gradients for the samples I dropped.
40 | 
41 | So as a temporary update I provide a dirty solution (see vrmax.py) that
42 | collects the max-weight samples and repeats the VAE procedure for them.
43 | Yes, I know it's far from optimized, but at least it already gives you
44 | ~2x speed-up on CPUs (and maybe 1.5x~1.7x on GPUs depending on your settings). 
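(In other words: the max trick corresponds to taking alpha -> -infinity in the VR bound,
where the Monte Carlo estimate reduces to max_k log [ p(x, z_k) / q(z_k | x) ]. Only the
sample with the largest importance weight contributes, so only that sample needs a
backward pass -- this is what vrmax.py gathers with argmax before re-running the VAE
computation on it.)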
45 | 46 | Will come back to this issue -- stay tuned! 47 | 48 | ====================================== 49 | 50 | I provide a tensorflow implementation of VAE training with Renyi divergence. 51 | Math details can be found here: 52 | 53 | Yingzhen Li and Richard E. Turner. Renyi divergence variational inference. 54 | (http://arxiv.org/abs/1602.02311) 55 | 56 | I only included some small dataset for testing. To add in more datasets, 57 | download them somewhere else and then add them to the data/ directory. 58 | 59 | For example, you can download: 60 | 61 | MNIST dataset (http://yann.lecun.com/exdb/mnist/) 62 | 63 | and include all the data files in directory data/MNIST/ 64 | 65 | OMNIGLOT (https://github.com/yburda/iwae/tree/master/datasets/OMNIGLOT) 66 | 67 | and include all the data files in directory data/OMNIGLOT/ 68 | 69 | Frey Face (https://github.com/y0ast/Variational-Autoencoder/blob/master/freyfaces.pkl) 70 | 71 | and include all the data files in directory data/freyface/ 72 | 73 | To have a quick test, run 74 | 75 | python exp.py --data [dataset name] --alpha [alpha value] -k [num of samples] 76 | --dimZ [dimension of the latents] 77 | 78 | See exp.py file for more options. In particular, alpha = 1.0 returns 79 | the vanila VAE, alpha = 0.0 gives IWAE. 80 | 81 | If you want to see the max trick, add in one more option --backward_pass max. 82 | -------------------------------------------------------------------------------- /data/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import import_data_alphadigs as alphadigs 3 | import import_data_mnist as mnist 4 | import numpy as np 5 | import cPickle 6 | import argparse 7 | from scipy.io import loadmat 8 | 9 | def load_data(dataset, path, ratio = 0.9, seed = 0, return_labels = False): 10 | if dataset == 'freyface': 11 | data_train, data_test = load_data_freyface(path, ratio, seed) 12 | if dataset == 'alphadigits': 13 | data_train, data_test, labels_train, labels_test = \ 14 | load_data_alphadigits(path, ratio, seed) 15 | if dataset == 'mnist': 16 | data_train, data_test, labels_train, labels_test = \ 17 | load_data_mnist(path, ratio, seed) 18 | if dataset == 'silhouettes': 19 | data_train, data_test = load_data_silhouettes(path, ratio, seed) 20 | if dataset == 'omniglot': 21 | data_train, data_test, labels_train, labels_test = \ 22 | load_data_omniglot(path, ratio, seed) 23 | if return_labels and dataset not in ['freyface', 'silhouettes']: 24 | return data_train, data_test, labels_train, labels_test 25 | else: 26 | return data_train, data_test 27 | 28 | def load_data_freyface(path, ratio = 0.9, seed = 0): 29 | # load and split data 30 | print "Loading data" 31 | f = open(path + 'freyface/freyfaces.pkl','rb') 32 | data = cPickle.load(f) 33 | data = np.array(data, dtype='f') # float32 34 | f.close() 35 | 36 | np.random.seed(seed) 37 | np.random.shuffle(data) 38 | num_train = int(ratio * data.shape[0]) 39 | data_train = data[:num_train] 40 | data_test = data[num_train:] 41 | 42 | return data_train, data_test 43 | 44 | def load_data_alphadigits(path, ratio = 0.9, seed = 0): 45 | # load and split data 46 | print "Loading data" 47 | data_train, labels_train, data_test, labels_test = \ 48 | alphadigs.read(path, int(39 * ratio), SEED = seed) 49 | # transform to float32 50 | data_train = np.array(data_train.T, dtype='f') # float32 51 | data_test = np.array(data_test.T, dtype='f') # float32 52 | labels_train = np.array(labels_train.T, dtype='f') # float32 53 | labels_test 
= np.array(labels_test.T, dtype='f') # float32 54 | 55 | return data_train, data_test, labels_train, labels_test 56 | 57 | def load_data_omniglot(path, ratio = 0.9, seed = 0): 58 | # load and split data 59 | print "Loading data" 60 | mat = loadmat(path + 'OMNIGLOT/chardata.mat') 61 | data_train = np.array(mat['data'].T, dtype='f') # float32 62 | data_test = np.array(mat['testdata'].T, dtype='f') # float32 63 | labels_train = np.array(mat['target'].T, dtype='f') # float32 64 | labels_test = np.array(mat['testtarget'].T, dtype='f') # float32 65 | 66 | return data_train, data_test, labels_train, labels_test 67 | 68 | def load_data_mnist(path, ratio = 0.9, seed = 0, digits = None): 69 | # load and split data 70 | print "Loading data" 71 | path = path + 'MNIST/' 72 | data_train, labels_train = mnist.read(path, 0, "training", seed, digits) 73 | data_test, labels_test = mnist.read(path, 0, "testing", seed, digits) 74 | #data_train = np.array(data >= 0.5 * np.max(data, 0), dtype = int) # binary 75 | #data_test = np.array(data >= 0.5 * np.max(data, 0), dtype = int) # binary 76 | data_train /= 255.0 # real-value 77 | data_test /= 255.0 # real-value 78 | # transform to float32 79 | data_train = np.array(data_train.T, dtype='f') # float32 80 | data_test = np.array(data_test.T, dtype='f') # float32 81 | labels_train = np.array(labels_train.T, dtype='f') # float32 82 | labels_test = np.array(labels_test.T, dtype='f') # float32 83 | return data_train, data_test, labels_train, labels_test 84 | 85 | def load_data_silhouettes(path, ratio = 0.9, seed = 0): 86 | import scipy.io 87 | imgs_filename = path + 'silhouettes/' \ 88 | + 'caltech101_silhouettes_28_split1.mat' 89 | with open(imgs_filename, 'rb') as f: 90 | images = scipy.io.loadmat(imgs_filename) 91 | 92 | images_train = images['train_data'].astype('float32') 93 | images_test = images['test_data'].astype('float32') 94 | images_val = images['val_data'].astype('float32') 95 | #n_validation = images_val.shape[0] 96 | #images_train = np.vstack((images_train, images_val)) 97 | 98 | # flip digits? 99 | images_train = 1.0 - images_train 100 | images_test = 1.0 - images_test 101 | 102 | return images_train, images_test#, n_validation 103 | 104 | -------------------------------------------------------------------------------- /exp.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | from init_models import * 4 | import numpy as np 5 | import argparse 6 | from data.data_preprocessing import load_data 7 | from load_save import * 8 | from visualization.reconstruction import plot_recon 9 | from visualization.samples import plot_samples 10 | 11 | def main(dataset, dimZ, hidden_layers, n_iters, learning_rate = 0.0005, \ 12 | batch_size = 100, seed = 0, alpha = 1.0, num_samples = 1, \ 13 | save = False, backward_pass = 'full', activation = 'softplus', \ 14 | loss = 'vae', checkpoint = 0): 15 | 16 | # load data 17 | ratio = 0.9 18 | path = 'data/' 19 | supervised = False 20 | data_train, data_test = load_data(dataset, path, ratio, seed, supervised) 21 | if dataset == 'freyface': 22 | data_type = 'real' 23 | else: 24 | data_type = 'bool' 25 | 26 | # initialise the computation 27 | sess = tf.Session() 28 | variables_size = [data_train.shape[1], dimZ] 29 | 30 | # TODO: other training methods coming soon... 
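    # rough map of the options handled below, as dispatched by the current code:
    #   --loss vae           -> models/vae.py   (VR bound computed as a single scalar energy)
    #   --loss iwae          -> models/iwae.py  (per-sample gradients averaged with importance
    #                                            weights; section 4.2 of the paper)
    #   --backward_pass max  -> forces loss = 'vrmax' -> models/vrmax.py (the VR-max trick)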
31 | if backward_pass == 'max': 32 | loss = 'vrmax' 33 | if loss == 'vae': 34 | kwargs = {'alpha': alpha, 'backward_pass': backward_pass} 35 | print 'training model: variational auto-encoder' 36 | print 'back propagating all the samples' 37 | from models.vae import init_optimizer 38 | if loss == 'vrmax': 39 | kwargs = {'alpha': alpha, 'backward_pass': backward_pass} 40 | print 'training model: VAE with alpha = -infty' 41 | print 'back propagating only 1 sample: using the max trick' 42 | from models.vrmax import init_optimizer 43 | if loss == 'iwae': 44 | kwargs = {'alpha': alpha} 45 | print 'training model: IWAE style training with alpha = %.2f' % alpha 46 | print 'gradient is first computed on every samples,', 47 | print 'then averaged with importance weights (smoothed by alpha)', 48 | print 'see the VR bound paper, section 4.2 for details.' 49 | from models.iwae import init_optimizer 50 | 51 | models = init_model(variables_size, hidden_layers, data_type, activation) 52 | prior = init_prior_gaussian(variables_size[-1]) 53 | if checkpoint > 0: 54 | path = path_name(dataset, alpha, num_samples, backward_pass) 55 | load_checkpoint(sess, path, checkpoint) 56 | initialised_var = set(tf.all_variables()) 57 | else: 58 | initialised_var = set([]) 59 | 60 | fit, score = init_optimizer(models, variables_size[0], batch_size, \ 61 | num_samples, **kwargs) 62 | 63 | # now check init 64 | init_var_list = set(tf.all_variables()) - initialised_var 65 | if len(init_var_list) > 0: 66 | # Initializing the tensor flow variables 67 | init = tf.initialize_variables(var_list = init_var_list) 68 | #init = tf.initialize_all_variables() 69 | sess.run(init) 70 | checkpoint += 1 71 | 72 | num_iter_trained = 0 73 | print "Training..." 74 | for n_iter in n_iters: 75 | fit(sess, data_train, n_iter, learning_rate) 76 | num_iter_trained += n_iter 77 | print "Evaluating test data..." 78 | lowerbound_test, time_test = \ 79 | score(sess, data_test, num_samples = 10) 80 | print "test data LL (lowerbound) = %.2f, time = %.2fs, iter %d" \ 81 | % (lowerbound_test, time_test, num_iter_trained) 82 | 83 | # plot reconstructions 84 | if dataset == 'freyface': 85 | shape = (28, 20) 86 | if 'mnist' in dataset: 87 | shape = (28, 28) 88 | print 'ploting reconstructions...' 89 | recon_input = data_test[:100] 90 | plot_recon(sess, recon_input, shape, models[0], models[1]) 91 | 92 | print 'ploting samples from the generative model...' 
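    # draw z from the Gaussian prior and decode it with models[1] (the decoder)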
93 | plot_samples(sess, shape, prior, models[1]) 94 | 95 | # save model 96 | if save: 97 | path = path_name(dataset, alpha, num_samples, backward_pass) 98 | save_checkpoint(sess, path, checkpoint) 99 | 100 | if __name__ == '__main__': 101 | parser = argparse.ArgumentParser(description='Run RVAE experiments.') 102 | parser.add_argument('--data', '-D', type=str, default='freyface') 103 | parser.add_argument('--num_layers', '-l', type=int, choices=[1, 2], default=1) 104 | parser.add_argument('--num_samples', '-k', type=int, default=1) 105 | parser.add_argument('--alpha', '-a', type=float, default=1.0) 106 | parser.add_argument('--dimZ', '-Z', type=int, default=5) 107 | parser.add_argument('--dimH', '-H', type=int, default=200) 108 | parser.add_argument('--iter', '-i', type=int, default=100) 109 | parser.add_argument('--save_model', '-s', action='store_true', default=False) 110 | parser.add_argument('--seed', '-S', type=int, default=0) 111 | parser.add_argument('--backward_pass', '-b', type=str, default='full') 112 | parser.add_argument('--learning_rate', type=float, default=0.0005) 113 | parser.add_argument('--batch_size', type=int, default=100) 114 | parser.add_argument('--activation', type=str, default='softplus') 115 | parser.add_argument('--loss', type=str, default='vae') 116 | parser.add_argument('--checkpoint', type=int, default=0) 117 | 118 | args = parser.parse_args() 119 | if args.dimH > 0: 120 | hidden_layers = [[args.dimH for i in xrange(args.num_layers)]] 121 | else: 122 | hidden_layers = [[]] 123 | if args.backward_pass not in ['full', 'single', 'max', 'min']: 124 | args.backward_pass = 'full' 125 | 126 | print 'settings:' 127 | print 'activation function:', args.activation 128 | print 'dataset:', args.data 129 | print 'alpha:', args.alpha 130 | print 'dimZ:', args.dimZ 131 | print 'hidden layer sizes:', hidden_layers 132 | print 'num. 
samples:', args.num_samples 133 | print 'backward pass method:', args.backward_pass 134 | print 'learning rate:', args.learning_rate 135 | print 'batch_size:', args.batch_size 136 | 137 | iter_each_round = 10 138 | num_rounds = args.iter / iter_each_round 139 | n_iters = list(np.ones(num_rounds, dtype = int) * iter_each_round) 140 | main(args.data, args.dimZ, hidden_layers, n_iters, args.learning_rate, \ 141 | args.batch_size, args.seed, args.alpha, args.num_samples, \ 142 | args.save_model, args.backward_pass, args.activation, 143 | args.loss, args.checkpoint) 144 | 145 | -------------------------------------------------------------------------------- /models/vrmax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from scipy.misc import logsumexp 5 | from network.network import construct_network 6 | from loss_functions import reconstruction_loss 7 | from loss_functions import log_prior 8 | from vae import variational_lowerbound 9 | 10 | def vrmax(x, encoder, decoder, num_samples, batch_size): 11 | """ 12 | Compute the VR-max trick 13 | """ 14 | logpxz = 0.0 15 | logqzx = 0.0 16 | L = len(encoder.S_layers) 17 | x_rep = tf.tile(x, [num_samples, 1]) 18 | input = x_rep 19 | 20 | # do encoding 21 | samples = [] 22 | for l in xrange(L): 23 | output, logq = encoder.S_layers[l].encode_and_log_prob(input) 24 | logqzx = logqzx + logq 25 | samples.append(output) 26 | input = output 27 | 28 | # do decoding 29 | samples = list(reversed(samples)) 30 | samples.append(x_rep) 31 | for l in xrange(L): 32 | _, logp = decoder.S_layers[l].encode_and_log_prob(samples[l], eval_output = samples[l+1]) 33 | logpxz = logpxz + logp 34 | 35 | logpz = log_prior(output, encoder.S_layers[l].get_prob_type()) 36 | logF = tf.reshape(logpxz + logpz - logqzx, [num_samples, batch_size]) 37 | 38 | # now compute the gradients 39 | # first test automatic gradient computation 40 | indices = tf.argmax(logF, 0) * batch_size + tf.constant(np.arange(batch_size)) 41 | samples_max = [] 42 | for l in xrange(len(samples)): 43 | samples_max.append(tf.gather(samples[l], indices)) 44 | 45 | # NOT VERY EFFICIENT! 
RE-COMPUTE THE BOUND 46 | logpxz_max = 0.0 47 | logqzx_max = 0.0 48 | logpz_max = 0.0 49 | for l in xrange(L): 50 | _, logp = decoder.S_layers[l].encode_and_log_prob(samples_max[l], eval_output = samples_max[l+1]) 51 | logpxz_max = logpxz_max + logp 52 | _, logq = encoder.S_layers[L-1-l].encode_and_log_prob(samples_max[l+1], eval_output = samples_max[l]) 53 | logqzx_max = logqzx_max + logq 54 | logpz_max = log_prior(samples_max[0], encoder.S_layers[L-1].get_prob_type()) 55 | lowerbound = tf.reduce_mean(logpxz_max + logpz_max - logqzx_max) 56 | 57 | return lowerbound 58 | 59 | def make_functions_vae(models, input_size, num_samples, batch_size, \ 60 | alpha = 1.0, backward_pass = 'full'): 61 | encoder, decoder = models 62 | 63 | input = tf.placeholder(tf.float32, [batch_size, input_size]) 64 | lowerbound = vrmax(input, encoder, decoder, num_samples, batch_size) 65 | 66 | learning_rate_ph = tf.placeholder(tf.float32, shape = []) 67 | optimizer = \ 68 | tf.train.AdamOptimizer(learning_rate=learning_rate_ph, \ 69 | beta1=0.9, beta2=0.999, epsilon=10e-8 \ 70 | ).minimize(-lowerbound) 71 | 72 | def updateParams(sess, X, learning_rate = 0.0005): 73 | opt, cost = sess.run((optimizer, lowerbound), 74 | feed_dict={input: X, 75 | learning_rate_ph:learning_rate}) 76 | return cost 77 | 78 | return updateParams, lowerbound 79 | 80 | def init_optimizer(models, input_size, batch_size = 100, num_samples = 1, **kwargs): 81 | 82 | encoder = models[0]; decoder = models[1] 83 | # vae 84 | if 'alpha' not in kwargs: 85 | alpha = 1.0 86 | else: 87 | alpha = kwargs['alpha'] 88 | if 'backward_pass' not in kwargs: 89 | backward_pass = 'full' 90 | else: 91 | backward_pass = kwargs['backward_pass'] 92 | updateParams, lowerbound = \ 93 | make_functions_vae(models, input_size, \ 94 | num_samples, batch_size, \ 95 | alpha, backward_pass) 96 | 97 | def fit(sess, X, n_iter = 100, learning_rate = 0.0005, verbose = True): 98 | # first make batches of source data 99 | [N, dimX] = X.shape 100 | N_batch = N / batch_size 101 | if np.mod(N, batch_size) != 0: 102 | N_batch += 1 103 | print "training the model for %d iterations with lr=%f" % \ 104 | (n_iter, learning_rate) 105 | 106 | begin = time.time() 107 | for iteration in xrange(1, n_iter + 1): 108 | iteration_lowerbound = 0 109 | ind_s = np.random.permutation(range(N)) 110 | 111 | for j in xrange(0, N_batch): 112 | indl = j * batch_size 113 | indr = (j+1) * batch_size 114 | ind = ind_s[indl:min(indr, N)] 115 | if indr > N: 116 | ind = np.concatenate((ind, ind_s[:(indr-N)])) 117 | batch = X[ind] 118 | lowerbound = updateParams(sess, batch, learning_rate) 119 | iteration_lowerbound += lowerbound * batch_size 120 | 121 | if verbose: 122 | end = time.time() 123 | print("Iteration %d, lowerbound = %.2f, time = %.2fs" 124 | % (iteration, iteration_lowerbound / N, end - begin)) 125 | begin = end 126 | 127 | 128 | def eval_test_ll(sess, X, num_samples): 129 | lowerbound = sess.run(variational_lowerbound(X, encoder, decoder, num_samples, X.shape[0], 0.0)) 130 | 131 | return lowerbound 132 | 133 | def score(sess, X, num_samples = 100): 134 | """ 135 | Computer lower bound on data, following the IWAE paper. 136 | """ 137 | 138 | begin = time.time() 139 | print 'num. 
samples for eval:', num_samples 140 | 141 | # compute log_q 142 | lowerbound_total = 0 143 | num_data_test = X.shape[0] 144 | if num_data_test % batch_size == 0: 145 | num_batch = num_data_test / batch_size 146 | else: 147 | num_batch = num_data_test / batch_size + 1 148 | 149 | for i in xrange(num_batch): 150 | indl = i*batch_size 151 | indr = min((i+1)*batch_size, num_data_test) 152 | minibatch = X[indl:indr] 153 | lowerbound = eval_test_ll(sess, minibatch, num_samples) 154 | lowerbound_total += lowerbound * (indr - indl) 155 | 156 | end = time.time() 157 | time_test = end - begin 158 | lowerbound_total = lowerbound_total / float(num_data_test) 159 | 160 | return lowerbound_total, time_test 161 | 162 | return fit, score 163 | -------------------------------------------------------------------------------- /models/iwae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from scipy.misc import logsumexp 5 | from network.network import construct_network 6 | from loss_functions import reconstruction_loss 7 | from loss_functions import log_prior 8 | from vae import variational_lowerbound 9 | 10 | def iwae(x, encoder, decoder, num_samples, batch_size, alpha = 0.0): 11 | """ 12 | Compute the loss function of VR lowerbound 13 | """ 14 | #logpxz, logqzx, z_list = reconstruction_loss(x, encoder, decoder, num_samples) 15 | logpxz = 0.0 16 | logqzx = 0.0 17 | L = len(encoder.S_layers) 18 | x_rep = tf.tile(x, [num_samples, 1]) 19 | input = x_rep 20 | 21 | # do encoding 22 | samples = [] 23 | for l in xrange(L): 24 | output, logq = encoder.S_layers[l].encode_and_log_prob(input) 25 | logqzx = logqzx + logq 26 | samples.append(output) 27 | input = output 28 | 29 | # do decoding 30 | samples = list(reversed(samples)) 31 | samples.append(x_rep) 32 | for l in xrange(L): 33 | _, logp = decoder.S_layers[l].encode_and_log_prob(samples[l], eval_output = samples[l+1]) 34 | logpxz = logpxz + logp 35 | 36 | logpz = log_prior(output, encoder.S_layers[l].get_prob_type()) 37 | logF = logpz + logpxz - logqzx 38 | 39 | # first compute lowerbound 40 | K = float(num_samples) 41 | logF_matrix = tf.reshape(logF, [num_samples, batch_size]) * (1 - alpha) 42 | logF_max = tf.reduce_max(logF_matrix, 0) 43 | logF_matrix -= logF_max 44 | logF_normalizer = tf.clip_by_value(tf.reduce_sum(tf.exp(logF_matrix), 0), 1e-9, np.inf) 45 | logF_normalizer = tf.log(logF_normalizer) 46 | # note here we need to substract log K as we use reduce_sum above 47 | if np.abs(alpha - 1.0) > 10e-3: 48 | lowerbound = tf.reduce_mean(logF_normalizer + logF_max - tf.log(K)) / (1 - alpha) 49 | else: 50 | lowerbound = tf.reduce_mean(logF) 51 | 52 | # now compute the importance weighted version of gradients 53 | log_ws = tf.reshape(logF_matrix - logF_normalizer, shape=[-1]) 54 | ws = tf.stop_gradient(tf.exp(log_ws), name = 'importance_weights_no_grad') 55 | params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 56 | gradients = tf.gradients(-logF * ws, params) 57 | grad = zip(gradients, params) 58 | 59 | return lowerbound, grad 60 | 61 | def make_functions_vae(models, input_size, num_samples, batch_size, alpha = 0.0): 62 | encoder, decoder = models 63 | 64 | input = tf.placeholder(tf.float32, [batch_size, input_size]) 65 | lowerbound, grad = iwae(input, encoder, decoder, num_samples, batch_size, \ 66 | alpha) 67 | 68 | learning_rate_ph = tf.placeholder(tf.float32, shape = []) 69 | optimizer = \ 70 | 
tf.train.AdamOptimizer(learning_rate=learning_rate_ph, \ 71 | beta1=0.9, beta2=0.999, epsilon=10e-8 \ 72 | ).apply_gradients(grad) 73 | 74 | def updateParams(sess, X, learning_rate = 0.0005): 75 | opt, cost = sess.run((optimizer, lowerbound), 76 | feed_dict={input: X, 77 | learning_rate_ph:learning_rate}) 78 | return cost 79 | 80 | return updateParams, lowerbound 81 | 82 | def init_optimizer(models, input_size, batch_size = 100, num_samples = 1, **kwargs): 83 | 84 | encoder = models[0]; decoder = models[1] 85 | # vae 86 | if 'alpha' not in kwargs: 87 | alpha = 0.0 88 | else: 89 | alpha = kwargs['alpha'] 90 | updateParams, lowerbound = \ 91 | make_functions_vae(models, input_size, \ 92 | num_samples, batch_size, \ 93 | alpha) 94 | 95 | def fit(sess, X, n_iter = 100, learning_rate = 0.0005, verbose = True): 96 | # first make batches of source data 97 | [N, dimX] = X.shape 98 | N_batch = N / batch_size 99 | if np.mod(N, batch_size) != 0: 100 | N_batch += 1 101 | print "training the model for %d iterations with lr=%f" % \ 102 | (n_iter, learning_rate) 103 | 104 | begin = time.time() 105 | for iteration in xrange(1, n_iter + 1): 106 | iteration_lowerbound = 0 107 | ind_s = np.random.permutation(range(N)) 108 | 109 | for j in xrange(0, N_batch): 110 | indl = j * batch_size 111 | indr = (j+1) * batch_size 112 | ind = ind_s[indl:min(indr, N)] 113 | if indr > N: 114 | ind = np.concatenate((ind, ind_s[:(indr-N)])) 115 | batch = X[ind] 116 | lowerbound = updateParams(sess, batch, learning_rate) 117 | iteration_lowerbound += lowerbound * batch_size 118 | 119 | if verbose: 120 | end = time.time() 121 | print("Iteration %d, lowerbound = %.2f, time = %.2fs" 122 | % (iteration, iteration_lowerbound / N, end - begin)) 123 | begin = end 124 | 125 | 126 | def eval_test_ll(sess, X, num_samples): 127 | lowerbound = sess.run(variational_lowerbound(X, encoder, decoder, num_samples, X.shape[0], 0.0)) 128 | 129 | return lowerbound 130 | 131 | def score(sess, X, num_samples = 100): 132 | """ 133 | Computer lower bound on data, following the IWAE paper. 134 | """ 135 | 136 | begin = time.time() 137 | print 'num. 
samples for eval:', num_samples 138 | 139 | # compute log_q 140 | lowerbound_total = 0 141 | num_data_test = X.shape[0] 142 | if num_data_test % batch_size == 0: 143 | num_batch = num_data_test / batch_size 144 | else: 145 | num_batch = num_data_test / batch_size + 1 146 | 147 | for i in xrange(num_batch): 148 | indl = i*batch_size 149 | indr = min((i+1)*batch_size, num_data_test) 150 | minibatch = X[indl:indr] 151 | lowerbound = eval_test_ll(sess, minibatch, num_samples) 152 | lowerbound_total += lowerbound * (indr - indl) 153 | 154 | end = time.time() 155 | time_test = end - begin 156 | lowerbound_total = lowerbound_total / float(num_data_test) 157 | 158 | return lowerbound_total, time_test 159 | 160 | return fit, score 161 | -------------------------------------------------------------------------------- /models/vae.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from scipy.misc import logsumexp 5 | from network.network import construct_network 6 | from loss_functions import reconstruction_loss 7 | from loss_functions import log_prior 8 | 9 | def variational_lowerbound(x, encoder, decoder, num_samples, batch_size, \ 10 | alpha = 1.0, backward_pass = 'full'): 11 | """ 12 | Compute the loss function of VR lowerbound 13 | """ 14 | #logpxz, logqzx, z_list = reconstruction_loss(x, encoder, decoder, num_samples) 15 | logpxz = 0.0 16 | logqzx = 0.0 17 | L = len(encoder.S_layers) 18 | x_rep = tf.tile(x, [num_samples, 1]) 19 | input = x_rep 20 | 21 | # do encoding 22 | samples = [] 23 | for l in xrange(L): 24 | output, logq = encoder.S_layers[l].encode_and_log_prob(input) 25 | logqzx = logqzx + logq 26 | samples.append(output) 27 | input = output 28 | 29 | # do decoding 30 | samples = list(reversed(samples)) 31 | samples.append(x_rep) 32 | for l in xrange(L): 33 | _, logp = decoder.S_layers[l].encode_and_log_prob(samples[l], eval_output = samples[l+1]) 34 | logpxz = logpxz + logp 35 | 36 | logpz = log_prior(output, encoder.S_layers[l].get_prob_type()) 37 | logF = logpz + logpxz - logqzx 38 | 39 | if backward_pass == 'max': 40 | logF = tf.reshape(logF, [num_samples, batch_size]) 41 | logF = tf.reduce_max(logF, 0) 42 | lowerbound = tf.reduce_mean(logF) 43 | elif backward_pass == 'min': 44 | logF = tf.reshape(logF, [num_samples, batch_size]) 45 | logF = tf.reduce_min(logF, 0) 46 | lowerbound = tf.reduce_mean(logF) 47 | elif np.abs(alpha - 1.0) < 10e-3: 48 | lowerbound = tf.reduce_mean(logF) 49 | else: 50 | logF = tf.reshape(logF, [num_samples, batch_size]) 51 | logF = logF * (1 - alpha) 52 | logF_max = tf.reduce_max(logF, 0) 53 | logF = tf.log(tf.clip_by_value(tf.reduce_mean(tf.exp(logF - logF_max), 0), 1e-9, np.inf)) 54 | logF = (logF + logF_max) / (1 - alpha) 55 | lowerbound = tf.reduce_mean(logF) 56 | return lowerbound#, logpz, logpxz, logqzx 57 | 58 | def make_functions_vae(models, input_size, num_samples, batch_size, \ 59 | alpha = 1.0, backward_pass = 'full'): 60 | encoder, decoder = models 61 | 62 | input = tf.placeholder(tf.float32, [batch_size, input_size]) 63 | lowerbound = variational_lowerbound(input, encoder, decoder, num_samples, batch_size, \ 64 | alpha, backward_pass) 65 | 66 | learning_rate_ph = tf.placeholder(tf.float32, shape = []) 67 | optimizer = \ 68 | tf.train.AdamOptimizer(learning_rate=learning_rate_ph, \ 69 | beta1=0.9, beta2=0.999, epsilon=10e-8 \ 70 | ).minimize(-lowerbound) 71 | 72 | def updateParams(sess, X, learning_rate = 0.0005): 73 | opt, cost = 
sess.run((optimizer, lowerbound), 74 | feed_dict={input: X, 75 | learning_rate_ph:learning_rate}) 76 | return cost 77 | 78 | return updateParams, lowerbound 79 | 80 | def init_optimizer(models, input_size, batch_size = 100, num_samples = 1, **kwargs): 81 | 82 | encoder = models[0]; decoder = models[1] 83 | # vae 84 | if 'alpha' not in kwargs: 85 | alpha = 1.0 86 | else: 87 | alpha = kwargs['alpha'] 88 | if 'backward_pass' not in kwargs: 89 | backward_pass = 'full' 90 | else: 91 | backward_pass = kwargs['backward_pass'] 92 | updateParams, lowerbound = \ 93 | make_functions_vae(models, input_size, \ 94 | num_samples, batch_size, \ 95 | alpha, backward_pass) 96 | 97 | def fit(sess, X, n_iter = 100, learning_rate = 0.0005, verbose = True): 98 | # first make batches of source data 99 | [N, dimX] = X.shape 100 | N_batch = N / batch_size 101 | if np.mod(N, batch_size) != 0: 102 | N_batch += 1 103 | print "training the model for %d iterations with lr=%f" % \ 104 | (n_iter, learning_rate) 105 | 106 | begin = time.time() 107 | for iteration in xrange(1, n_iter + 1): 108 | iteration_lowerbound = 0 109 | ind_s = np.random.permutation(range(N)) 110 | 111 | for j in xrange(0, N_batch): 112 | indl = j * batch_size 113 | indr = (j+1) * batch_size 114 | ind = ind_s[indl:min(indr, N)] 115 | if indr > N: 116 | ind = np.concatenate((ind, ind_s[:(indr-N)])) 117 | batch = X[ind] 118 | lowerbound = updateParams(sess, batch, learning_rate) 119 | iteration_lowerbound += lowerbound * batch_size 120 | 121 | if verbose: 122 | end = time.time() 123 | print("Iteration %d, lowerbound = %.2f, time = %.2fs" 124 | % (iteration, iteration_lowerbound / N, end - begin)) 125 | begin = end 126 | 127 | 128 | def eval_test_ll(sess, X, num_samples): 129 | lowerbound = sess.run(variational_lowerbound(X, encoder, decoder, num_samples, X.shape[0], 0.0)) 130 | 131 | return lowerbound 132 | 133 | def score(sess, X, num_samples = 100): 134 | """ 135 | Computer lower bound on data, following the IWAE paper. 136 | """ 137 | 138 | begin = time.time() 139 | print 'num. samples for eval:', num_samples 140 | 141 | # compute log_q 142 | lowerbound_total = 0 143 | num_data_test = X.shape[0] 144 | if num_data_test % batch_size == 0: 145 | num_batch = num_data_test / batch_size 146 | else: 147 | num_batch = num_data_test / batch_size + 1 148 | 149 | for i in xrange(num_batch): 150 | indl = i*batch_size 151 | indr = min((i+1)*batch_size, num_data_test) 152 | minibatch = X[indl:indr] 153 | lowerbound = eval_test_ll(sess, minibatch, num_samples) 154 | lowerbound_total += lowerbound * (indr - indl) 155 | 156 | end = time.time() 157 | time_test = end - begin 158 | lowerbound_total = lowerbound_total / float(num_data_test) 159 | 160 | return lowerbound_total, time_test 161 | 162 | return fit, score 163 | -------------------------------------------------------------------------------- /network/stochastic_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from deterministic_layer import Deterministic_Layer 4 | 5 | def construct_Stoc_Layer(layer_sizes, prob_type = 'gaussian', activation='softplus', prefix = 'q'): 6 | """ 7 | Construct the stochastic layer. 
8 | """ 9 | D_layers = [] 10 | if len(layer_sizes) > 2: 11 | for l in xrange(len(layer_sizes) - 2): 12 | D_layers.append(Deterministic_Layer(layer_sizes[l], layer_sizes[l+1], activation)) 13 | 14 | if prob_type == 'gaussian': 15 | if prefix == 'p': 16 | activation = 'sigmoid' 17 | if prefix == 'q': 18 | activation = 'linear' 19 | Mu_layer = Deterministic_Layer(layer_sizes[-2], layer_sizes[-1], activation) 20 | Log_Sigma_layer = Deterministic_Layer(layer_sizes[-2], layer_sizes[-1], 'linear') 21 | S_layer = Gaussian_Stoc_Layer(D_layers, Mu_layer, Log_Sigma_layer) 22 | 23 | if prob_type == 'bernoulli': 24 | Mu_layer = Deterministic_Layer(layer_sizes[-2], layer_sizes[-1], 'sigmoid') 25 | S_layer = Bernoulli_Stoc_Layer(D_layers, Mu_layer) 26 | 27 | if prob_type == 'bernoulli_sym': 28 | Mu_layer = Deterministic_Layer(layer_sizes[-2], layer_sizes[-1], 'sigmoid') 29 | S_layer = Bernoulli_sym_Stoc_Layer(D_layers, Mu_layer) 30 | 31 | if prob_type == 'softmax': 32 | Mu_layer = Deterministic_Layer(layer_sizes[-2], layer_sizes[-1], 'softmax') 33 | S_layer = Softmax_Stoc_Layer(D_layers, Mu_layer) 34 | 35 | return S_layer 36 | 37 | class Stoc_Layer(object): 38 | 39 | def encode(self, input, sampling): 40 | raise NotImplementedError() 41 | 42 | def log_prob(self, output, params): 43 | raise NotImplementedError() 44 | 45 | def encode_and_log_prob(self, input, eval_output = None): 46 | # evaluate on eval_output if provided 47 | if eval_output is None: 48 | output, params = self.encode(input, sampling = True) 49 | else: 50 | _, params = self.encode(input, sampling = False) 51 | output = eval_output 52 | logprob = self.log_prob(output, params) 53 | return output, logprob 54 | 55 | def get_name(self): 56 | return 'stochastic_layer' 57 | 58 | class Gaussian_Stoc_Layer(Stoc_Layer): 59 | 60 | def __init__(self, D_layers, Mu_layer, Log_Sigma_layer): 61 | self.D_layers = D_layers 62 | self.params = [] 63 | for layer in self.D_layers: 64 | self.params = self.params + layer.params 65 | self.Mu_layer = Mu_layer 66 | self.Log_Sigma_layer = Log_Sigma_layer 67 | self.params = self.params + self.Mu_layer.params 68 | self.params = self.params + self.Log_Sigma_layer.params 69 | # output size 70 | self.output_size = self.Mu_layer.output_size 71 | 72 | def encode(self, input, sampling): 73 | output = input 74 | for layer in self.D_layers: 75 | output = layer.encode(output) 76 | # now compute mu and sigma 77 | Mu = self.Mu_layer.encode(output) 78 | Log_Sigma = 0.5 * self.Log_Sigma_layer.encode(output) 79 | if sampling: 80 | eps = tf.random_normal(Mu.get_shape()) 81 | output = Mu + tf.exp(Log_Sigma) * eps 82 | else: 83 | output = Mu 84 | return output, [Mu, Log_Sigma] 85 | 86 | def log_prob(self, output, params): 87 | (Mu, Log_Sigma) = params 88 | logprob = -(0.5 * np.log(2 * np.pi) + Log_Sigma) \ 89 | - 0.5 * ((output - Mu) / tf.exp(Log_Sigma)) ** 2 90 | return tf.reduce_sum(logprob, 1) 91 | 92 | def get_prob_type(self): 93 | return 'gaussian' 94 | 95 | class Bernoulli_Stoc_Layer(Stoc_Layer): 96 | 97 | def __init__(self, D_layers, Mu_layer): 98 | self.D_layers = D_layers 99 | self.params = [] 100 | for layer in self.D_layers: 101 | self.params = self.params + layer.params 102 | self.Mu_layer = Mu_layer 103 | self.params = self.params + self.Mu_layer.params 104 | # output size 105 | self.output_size = self.Mu_layer.output_size 106 | 107 | def encode(self, input, sampling): 108 | output = input 109 | for layer in self.D_layers: 110 | output = layer.encode(output) 111 | # now compute mu and sigma 112 | Mu = 
self.Mu_layer.encode(output) 113 | if sampling: 114 | shape = Mu.get_shape() 115 | eps = tf.random_uniform(shape) 116 | output = tf.select(eps - Mu <= 0, tf.ones(shape), tf.zeros(shape)) 117 | else: 118 | output = Mu 119 | return output, Mu 120 | 121 | def log_prob(self, output, params): 122 | Mu = params 123 | logprob = output * tf.log(tf.clip_by_value(Mu, 1e-9, 1.0)) \ 124 | + (1 - output) * tf.log(tf.clip_by_value(1.0 - Mu, 1e-9, 1.0)) 125 | return tf.reduce_sum(logprob, 1) 126 | 127 | def get_prob_type(self): 128 | return 'bernoulli' 129 | 130 | class Bernoulli_sym_Stoc_Layer(Stoc_Layer): 131 | 132 | def __init__(self, D_layers, Mu_layer): 133 | self.D_layers = D_layers 134 | self.params = [] 135 | for layer in self.D_layers: 136 | self.params = self.params + layer.params 137 | self.Mu_layer = Mu_layer 138 | self.params = self.params + self.Mu_layer.params 139 | # output size 140 | self.output_size = self.Mu_layer.output_size 141 | 142 | def encode(self, input, sampling): 143 | output = input 144 | for layer in self.D_layers: 145 | output = layer.encode(output) 146 | # now compute mu 147 | Mu = self.Mu_layer.encode(output) 148 | if sampling: 149 | shape = Mu.get_shape() 150 | eps = tf.random_uniform(shape) 151 | output = tf.select(eps - Mu <= 0, tf.ones(shape), tf.zeros(shape)) 152 | else: 153 | output = Mu 154 | output = output * 2.0 - 1.0 155 | return output, Mu 156 | 157 | def log_prob(self, output, params): 158 | Mu = params 159 | z = (output + 1.0) / 2.0 160 | logprob = z * tf.log(tf.clip_by_value(Mu, 1e-9, 1.0)) \ 161 | + (1 - z) * tf.log(tf.clip_by_value(1.0 - Mu, 1e-9, 1.0)) 162 | return tf.reduce_sum(logprob, 1) 163 | 164 | def get_prob_type(self): 165 | return 'bernoulli_sym' 166 | 167 | class Softmax_Stoc_Layer(Stoc_Layer): 168 | 169 | def __init__(self, D_layers, Logit_layer): 170 | self.D_layers = D_layers 171 | self.params = [] 172 | for layer in self.D_layers: 173 | self.params = self.params + layer.params 174 | self.Logit_layer = Logit_layer 175 | self.params = self.params + self.Logit_layer.params 176 | # output size 177 | self.output_size = self.Logit_layer.output_size 178 | 179 | def encode(self, input, sampling = False): 180 | output = input 181 | for layer in self.D_layers: 182 | output = layer.encode(output) 183 | # now compute mu and sigma 184 | Logit = self.Logit_layer.encode(output) 185 | Logit = tf.log(tf.nn.softmax(Logit)) 186 | output = tf.exp(Logit) # probability vector 187 | #if sampling: 188 | # shape = output.get_shape() 189 | # eps = tf.random_uniform(shape) 190 | # diff = output - eps 191 | # max_out = tf.reduce_max(diff, 1, keep_dims = True) 192 | # output = tf.sign(diff - max_out) + 1.0 193 | return output, Logit 194 | 195 | def log_prob(self, output, params): 196 | Logit = params 197 | logprob = output * Logit 198 | return tf.reduce_sum(logprob, 1) 199 | 200 | def get_prob_type(self): 201 | return 'softmax' 202 | 203 | --------------------------------------------------------------------------------
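The alpha != 1 branch of variational_lowerbound in models/vae.py reshapes the per-sample log-weights logF to [num_samples, batch_size], multiplies them by (1 - alpha), applies a max-stabilised log-mean-exp over the sample axis, divides by (1 - alpha) again, and averages over the batch. The NumPy sketch below (illustrative only; vr_bound_numpy is not a function in this repository) mirrors that computation and can serve as a sanity check:

    import numpy as np

    def vr_bound_numpy(logF, alpha):
        # Illustrative sketch, not part of the repository.
        # logF: per-sample log-weights log p(x, z) - log q(z | x), shape [num_samples, batch_size]
        if abs(alpha - 1.0) < 1e-3:
            return logF.mean()              # alpha -> 1: plain average, the standard VAE bound
        logF = logF * (1.0 - alpha)
        logF_max = logF.max(axis=0)         # subtract the per-column max for numerical stability
        log_mean = np.log(np.clip(np.exp(logF - logF_max).mean(axis=0), 1e-9, np.inf))
        return ((log_mean + logF_max) / (1.0 - alpha)).mean()

    logF = np.random.RandomState(0).randn(5, 3) - 100.0   # 5 samples, batch of 3
    print(vr_bound_numpy(logF, alpha=0.0))                 # alpha = 0: the IWAE-style bound used by score()

init_optimizer in models/vae.py returns the pair (fit, score). A typical call might look like the following, where encoder, decoder, sess, X_train, X_test and the 784-dimensional input size are assumptions for illustration (the encoder/decoder would come from the repository's network construction code, and sess is a TensorFlow session with initialised variables):

    fit, score = init_optimizer((encoder, decoder), input_size=784,
                                batch_size=100, num_samples=5, alpha=0.5)
    fit(sess, X_train, n_iter=100, learning_rate=0.0005)
    test_ll, test_time = score(sess, X_test, num_samples=100)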