├── README.md ├── diagonal ├── README.md ├── Variational_Inference_in_DPGMM_Diagonal_.pdf ├── bound_check.py ├── demos.py └── dpgmm_vi.py └── spherical ├── README.md ├── Variational_Inference_in_DPGMM_Spherical.pdf ├── bound_check.py ├── demos.py └── dpgmm_vi.py /README.md: -------------------------------------------------------------------------------- 1 | # tf_dpgmm 2 | Variational inference in Dirichlet process Gaussian mixture model (tensorflow implementation), for spherical and diagonal covariance models 3 | 4 | There is a folder for each model type, and each contains: 5 | - a pdf containing the equations and derivations for the evidence lower bound and variational updates 6 | - ```dpgmm_vi.py```: a tensorflow implementation of variational inference in the model 7 | - ```bound_check.py```: a comparison of the analytical and Monte Carlo estimates of the ELBO. We used this to check that our derivations and code are correct: the two estimates match. 8 | - ```demos.py```: examples of how to use ```dpgmm_vi.py```, including plotting changes in ELBO with each update and clustering results 9 | 10 | This code has not been optimized for performance. Please let us know if you find any mistakes! 11 | -------------------------------------------------------------------------------- /diagonal/README.md: -------------------------------------------------------------------------------- 1 | # tf_dpgmm (diagonal covariance) 2 | If any datapoints are equal to the zero vector, they will be ignored. See the use of ```zeta_mask``` in ```dpgmm_vi.py```. This enables the use of differently sized datasets (because you can pad the smaller ones with zero vectors), but may not be desirable for you! -------------------------------------------------------------------------------- /diagonal/Variational_Inference_in_DPGMM_Diagonal_.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcusi/tf_dpgmm/32424fa6db31561e8b5a322c5893df5814fc43f7/diagonal/Variational_Inference_in_DPGMM_Diagonal_.pdf -------------------------------------------------------------------------------- /diagonal/bound_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm, gamma, rv_discrete, beta 3 | from scipy.special import digamma 4 | from scipy.special import gamma as gammaf 5 | 6 | lgamma = lambda x: np.log(gammaf(x)) 7 | 8 | """ 9 | DIAGONAL COVARIANCE 10 | 11 | Code to check analytical derivations of the ELBO in Variational_Inference_in_DPGMM_Diagonal_.pdf against Monte Carlo estimates 12 | 13 | python bound_check.py 14 | 15 | lbh@mit.edu, october 2018 16 | """ 17 | 18 | np.random.seed(0) 19 | 20 | D=3 21 | K=2 22 | 23 | nu = np.random.randn(K, D) 24 | omega = np.random.random([K, D]) + 1 25 | zeta = np.array([0.2, 0.8]) 26 | 27 | a = np.ones([K, D]) 28 | b = 1.5*np.ones([K, D]) 29 | 30 | lambda1 = np.ones(K) 31 | lambda2 = 2.*np.ones(K) 32 | 33 | alpha = 3. 34 | p_phi = beta(1, alpha) 35 | p_mu = norm 36 | p_tau = gamma(a=1., scale=1.) 37 | def log_p_z(phi, z): 38 | p = np.concatenate([[1], np.cumprod(1-phi[:-1])]) * phi 39 | return np.log(p[z]) 40 | def p_x(z, mu, tau): return norm(loc=mu[z], scale=np.sqrt(1./tau[z])) 41 | 42 | q_phi = beta(lambda1, lambda2) 43 | q_mu = norm(loc=nu, scale=1./np.sqrt(omega)) 44 | q_tau = gamma(a=a, scale=1./b) #tau is precision! 45 | q_z = rv_discrete(values=(range(K), zeta)) 46 | 47 | x = 3.
48 | 49 | N = 50001 50 | 51 | print('Diagonal Covariance Model') 52 | # ####### phi term in the ELBO ######## 53 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 54 | # Analytical 55 | bound = sum((lgamma(1. + alpha) - lgamma(alpha) 56 | + (alpha - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k)) 57 | - lgamma(l1_k + l2_k) + lgamma(l1_k) + lgamma(l2_k) 58 | - (l1_k - 1.)*(digamma(l1_k) - digamma(l1_k + l2_k)) 59 | - (l2_k - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k))) 60 | for (l1_k, l2_k) in zip(lambda1, lambda2)) 61 | print("Analytical phi term in ELBO:", bound) 62 | 63 | # Monte Carlo 64 | print('Monte Carlo estimate of phi term in ELBO:') 65 | np.random.seed() 66 | bounds = [] 67 | for i in range(N): 68 | phi = q_phi.rvs() 69 | 70 | bounds.append(sum(p_phi.logpdf(phi) - q_phi.logpdf(phi))) #Sum over K for MC estimate 71 | if i%5000 == 0: print(i, np.mean(bounds)) 72 | 73 | 74 | 75 | 76 | # ####### mu term in the ELBO ######## 77 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 78 | # Analytical 79 | # OLD: 80 | # bound = sum(-0.5*nu_k**2 for nu_k in nu) 81 | # NEW: 82 | bound = -0.5 * sum(sum(1./omega + nu**2 + np.log(omega) - 1)) 83 | print("Analytical mu term in ELBO:", bound) 84 | 85 | # Monte Carlo 86 | print('Monte Carlo estimate of mu term in ELBO:') 87 | np.random.seed() 88 | bounds = [] 89 | for i in range(N): 90 | mu = q_mu.rvs() 91 | bounds.append(sum(sum(p_mu.logpdf(mu) - q_mu.logpdf(mu)))) #Sum over K for MC estimate 92 | if i%5000 == 0: print(i, np.mean(bounds)) 93 | 94 | 95 | 96 | 97 | # ####### tau term in the ELBO ######## 98 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 99 | # Analytical 100 | #OLD: 101 | # bound = sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + 1 - np.divide(a_k,b_k) 102 | # for (a_k, b_k) in zip(a, b)) 103 | #NEW: 104 | bound = sum(sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + a_k - np.divide(a_k,b_k) 105 | for (a_k, b_k) in zip(a, b))) 106 | print("Analytical tau term in ELBO:", bound) 107 | 108 | # Monte Carlo 109 | print('Monte Carlo estimate of mu term in ELBO:') 110 | np.random.seed() 111 | bounds = [] 112 | for i in range(N): 113 | tau = q_tau.rvs() 114 | 115 | bounds.append(sum(sum(p_tau.logpdf(tau) - q_tau.logpdf(tau)))) #Sum over K for MC estimate 116 | if i%5000 == 0: print(i, np.mean(bounds)) 117 | 118 | 119 | 120 | 121 | # ####### z term in the ELBO ######## 122 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 123 | # Analytical 124 | bound = sum(zeta_k*( 125 | - np.log(zeta_k) 126 | + digamma(l1_k) - digamma(l1_k+l2_k) 127 | + sum(digamma(lambda2[j]) - digamma(lambda1[j]+lambda2[j]) for j in range(k))) 128 | for (l1_k, l2_k, zeta_k, k) in zip(lambda1, lambda2, zeta, range(K))) 129 | print("Analytical z term in ELBO:", bound) 130 | 131 | # Monte Carlo 132 | print('Monte Carlo estimate of z term in ELBO:') 133 | np.random.seed() 134 | bounds = [] 135 | for i in range(N): 136 | phi = q_phi.rvs() 137 | z = q_z.rvs() 138 | 139 | bounds.append(log_p_z(phi, z) - q_z.logpmf(z)) #There's only a single datapoint, so no need for sum 140 | if i%5000 == 0: print(i, np.mean(bounds)) 141 | 142 | 143 | 144 | 145 | # ####### x term in the ELBO ######## 146 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 147 | # Analytical 148 | bound = sum(sum(zeta_k * ( 149 | -1/2. * (np.log(2 * np.pi) - digamma(ak) + np.log(bk)) 150 | -(ak/(2.*bk)) * (x - nu_k)**2 151 | -(ak/(2.*bk)) * gammaf(3./2.) * (2*np.pi)**(-1/2.) * 2.**(3./2.) 
/ omega_k 152 | ) for (ak, bk, nu_k, zeta_k, omega_k) in zip(a, b, nu, zeta, omega))) 153 | print("Analytical x term in ELBO:", bound) 154 | 155 | # Monte Carlo 156 | print('Monte Carlo estimate of x term in ELBO:') 157 | np.random.seed() 158 | bounds = [] 159 | for i in range(N): 160 | mu = q_mu.rvs() 161 | tau = q_tau.rvs() 162 | z = q_z.rvs() 163 | 164 | bounds.append(sum(p_x(z, mu, tau).logpdf(x))) #sum over d (There's only a single datapoint, so no need for sum over i) 165 | if i%5000 == 0: print(i, np.mean(bounds)) -------------------------------------------------------------------------------- /diagonal/demos.py: -------------------------------------------------------------------------------- 1 | from dpgmm_vi import variational_inference 2 | import numpy as np 3 | 4 | import matplotlib 5 | #matplotlib.use('Agg') 6 | import matplotlib.pyplot as plt 7 | from matplotlib.patches import Ellipse 8 | import colorsys 9 | 10 | """ 11 | DIAGONAL COVARIANCE 12 | 13 | Plots to demonstrate use of dpgmm_vi.py 14 | 15 | python demos.py 16 | 17 | mcusi@mit.edu, july 2018 18 | """ 19 | 20 | 21 | def gen_demo_data(batch_size=1, np_seed=None, D=2, use_zeros=True): 22 | #generates data from multivariate gaussian with diagonal covariance 23 | Nmax = 6*25 24 | np.random.seed(np_seed) 25 | for b in range(batch_size): 26 | K = np.random.randint(2, high=6+1) 27 | in_dataset = 0 28 | for k in range(K): 29 | mean = 4*np.random.randn(D) - 1 30 | cov = np.diag(np.random.rand(2) + 0.1) 31 | n = np.random.randint(10, high=25+1) 32 | in_dataset += n 33 | gaussian_data = np.random.multivariate_normal(mean, cov, n) 34 | _data = gaussian_data if k == 0 else np.vstack((_data, gaussian_data)) 35 | n_zeros = Nmax - in_dataset ##use zeros to pad smaller datasets 36 | _data = np.vstack((np.zeros((n_zeros,D)),_data)) 37 | np.random.shuffle(_data) 38 | _data = np.float32(_data[np.newaxis,:,:]) 39 | data = _data if b == 0 else np.concatenate((data,_data)) 40 | 41 | return data 42 | 43 | def ELBO_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 44 | 45 | #Generate toy data 46 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 47 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 48 | 49 | #Run inference 50 | inferred_latents, ELBO_deltas = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=True) 51 | 52 | #Plot change in ELBO with updates 53 | ##If you use [1:] after each of the plot arguments, 54 | ##You can see that the change is still positive after the first iteration 55 | plt.plot(ELBO_deltas.total); plt.title('Change in ELBO with each set of updates'); plt.show(); 56 | 57 | plt.plot(ELBO_deltas.zeta_z + ELBO_deltas.zeta_x); plt.plot(ELBO_deltas.zeta_z); plt.plot(ELBO_deltas.zeta_x); 58 | plt.legend(['z+x', 'z', 'x'], loc='upper right'); plt.title('Change in z&x ELBO terms due to zeta update'); plt.show() 59 | 60 | plt.plot(ELBO_deltas.lambda_z + ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_phi); 61 | plt.legend(['z+phi', 'z', 'phi'], loc='upper right'); plt.title('Change in z&phi ELBO terms due to lambda update'); plt.show() 62 | 63 | plt.plot(ELBO_deltas.nu_mu + ELBO_deltas.nu_x); plt.plot(ELBO_deltas.nu_mu); plt.plot(ELBO_deltas.nu_x); 64 | plt.legend(['mu+x', 'mu', 'x'], loc='upper right'); plt.title('Change in mu&x ELBO terms due to nu update'); plt.show() 65 | 66 | plt.plot(ELBO_deltas.omega_mu + ELBO_deltas.omega_x); plt.plot(ELBO_deltas.omega_mu); plt.plot(ELBO_deltas.omega_x); 67 | plt.legend(['mu+x', 
'mu', 'x'], loc='upper right'); plt.title('Change in mu&x ELBO terms due to omega update'); plt.show() 68 | 69 | plt.plot(ELBO_deltas.ab_tau + ELBO_deltas.ab_x); plt.plot(ELBO_deltas.ab_tau); plt.plot(ELBO_deltas.ab_x); 70 | plt.legend(['tau+x', 'tau', 'x'], loc='upper right'); plt.title('Change in tau&x ELBO terms due to ab update'); plt.show() 71 | 72 | ##Plot each datapoint with a colour corresponding to the variational cluster to which it is assigned with maximum probability 73 | batch_number = 0 74 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 75 | inferred_zeta = inferred_latents.zeta[batch_number,nonzero_datapoints,:] 76 | assignments = np.argmax(inferred_zeta, axis=1) 77 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 78 | plt.gca().set_xlim([-10,10]) 79 | plt.gca().set_ylim([-10,10]) 80 | plt.gca().set_aspect('equal', adjustable='box') 81 | plt.title('MAP assignments of datapoints to clusters') 82 | plt.show() 83 | 84 | def clusters_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 85 | 86 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 87 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 88 | 89 | for n_iter in [0, 1, 2, 5, 10, max_n_iter]: 90 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=n_iter, tf_seed=tf_seed, get_elbo=False) 91 | 92 | #Plot means and datapoints as points 93 | batch_number = 0 94 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 95 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1], marker='x') 96 | plt.scatter(inferred_latents.nu[batch_number,:,0] + 0.01*np.random.randn(T), inferred_latents.nu[batch_number,:,1],marker='o',s=30,color='r') 97 | 98 | #Plot expected standard deviation as diameter of ellipse 99 | patches = []; 100 | diameter = 2*np.sqrt(1./np.divide(inferred_latents.a, inferred_latents.b)) 101 | 102 | #Plot marginal cluster probabilities as the transparency of circle 103 | l1 = inferred_latents.lambda_1[batch_number,:] 104 | l2 = inferred_latents.lambda_2[batch_number,:] 105 | beta_means = np.divide(l1,l1 + l2) 106 | log_beta_means = np.log(beta_means + 1e-30) 107 | cs = np.concatenate(( [0], np.cumsum( np.log(1-beta_means+1e-30)[:-1]) )) #SBP 108 | beta_expectation = np.exp(log_beta_means + cs) 109 | beta_expectation /= (1.*np.sum(beta_expectation)) 110 | for k in range(T): 111 | circle = Ellipse((inferred_latents.nu[batch_number,k,0], inferred_latents.nu[batch_number,k,1]), diameter[batch_number,k,0], diameter[batch_number,k,1]) 112 | plt.gca().add_artist(circle) 113 | circle.set_alpha(beta_expectation[k]) 114 | plt.gca().set_xlim([-10,10]) 115 | plt.gca().set_ylim([-10,10]) 116 | plt.gca().set_aspect('equal', adjustable='box') 117 | plt.title('Variational distributions at iteration ' + str(n_iter)) 118 | plt.show() 119 | 120 | def batch_demo(batch_size=2, np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 121 | 122 | data = gen_demo_data(batch_size = batch_size, np_seed = np_seed) 123 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 124 | 125 | #Run inference 126 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=False) 127 | 128 | for batch_number in range(batch_size): 129 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] 
== batch_number)] 130 | inferred_zeta = inferred_latents.zeta[batch_number, nonzero_datapoints, :] 131 | 132 | #plot weighted points 133 | #https://stackoverflow.com/questions/41314736/scatterplot-wherein-each-point-color-is-a-different-mixture-of-k-colors 134 | HSV = [(x*1.0/T, 0.8, 0.5) for x in np.random.permutation(T)] 135 | RGB = np.array([colorsys.hsv_to_rgb(*hsv) for hsv in HSV]) #list comprehension (rather than map) so this also works under Python 3 136 | assignments = np.sum(np.multiply(RGB[np.newaxis, :, :], inferred_zeta[:, :, np.newaxis]),axis=1) 137 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 138 | plt.title('Weighted assignments, batch ' + str(batch_number)) 139 | plt.gca().set_xlim([-10,10]) 140 | plt.gca().set_ylim([-10,10]) 141 | plt.gca().set_aspect('equal', adjustable='box') 142 | plt.show() 143 | 144 | if __name__ == "__main__": 145 | print('Diagonal Covariance Model') 146 | np_seed = 23; tf_seed = 100; alpha=1.0; T=100; max_n_iter=20; 147 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 148 | print('Change in ELBO with each iteration of updates:') 149 | ELBO_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 150 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 151 | print('Change in latent parameters with increasing number of updates:') 152 | clusters_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 153 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 154 | print('This script can use batches of different datasets:') 155 | batch_demo(batch_size = 2, np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 156 | 157 | -------------------------------------------------------------------------------- /diagonal/dpgmm_vi.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from scipy.special import gamma as gamma_func 3 | import numpy as np 4 | import collections 5 | 6 | """ 7 | DIAGONAL COVARIANCE 8 | 9 | Unsupervised clustering in R^D 10 | 11 | TF implementation of variational inference in a 12 | Dirichlet process Gaussian mixture model with diagonal covariance 13 | Derivation in Variational_Inference_in_DPGMM_Diagonal_.pdf included in git repo 14 | 15 | Clusters matrix X (batch_size x N x D) of N datapoints with dimensionality D 16 | If a datapoint = zero vector, it is ignored. 17 | This option allows you to use batched datasets of different sizes 18 | 19 | For examples of how to use this code, see demos.py 20 | 21 | mcusi@mit.edu, july 2018 22 | 23 | """ 24 | 25 | class dpgmm(): 26 | 27 | ######### INITIALIZATION ########################################################################################################## 28 | 29 | def __init__(self, alpha, D, n_iter, T, covariance_type='diagonal'): 30 | 31 | self.alpha = alpha; #Dirichlet concentration parameter 32 | self.D = D; self.Dfl = tf.cast(self.D, dtype=tf.float32); #dimensionality of data 33 | self.T = T; #truncation value 34 | 35 | self.gaussian_const = np.divide(gamma_func(1.5)*(2.0**1.5), np.sqrt(2.*np.pi)) 36 | 37 | #Initialization settings 38 | self.mu_std = 5.
39 | 40 | #inference settings 41 | self.n_iter = n_iter; 42 | self.log_constant = 1e-30 43 | 44 | def initialize_latents(self, X, batch_size, shared=True, use_mask=True): 45 | """ 46 | > randomly initializes variational distribution parameters 47 | > if shared == True, batches share the same initialization 48 | """ 49 | 50 | N = tf.shape(X)[1] 51 | shape_T = [self.T] if shared else [batch_size, self.T] 52 | shape_TD = [self.T, self.D] if shared else [batch_size, self.T, self.D] 53 | 54 | a = tf.get_variable("a", shape_TD, dtype=tf.float32, 55 | initializer=tf.ones_initializer()) 56 | b = tf.get_variable("b", shape_TD, dtype=tf.float32, 57 | initializer=tf.ones_initializer()) 58 | lambda_1 = tf.get_variable("lambda_1", shape_T, dtype=tf.float32, 59 | initializer=tf.ones_initializer()) 60 | lambda_2 = tf.get_variable("lambda_2", shape_T, dtype=tf.float32, 61 | initializer=tf.ones_initializer()) 62 | nu = tf.get_variable("nu", shape_TD, dtype=tf.float32, 63 | initializer=tf.random_normal_initializer(stddev=self.mu_std)) 64 | omega = tf.get_variable("omega", shape_TD, dtype=tf.float32, 65 | initializer=tf.ones_initializer()) 66 | 67 | if shared: 68 | 69 | a = tf.tile(a[tf.newaxis, :, :], [batch_size, 1, 1]) 70 | b = tf.tile(b[tf.newaxis, :, :], [batch_size, 1, 1]) 71 | lambda_1 = tf.tile(lambda_1[tf.newaxis, :], [batch_size, 1]) 72 | lambda_2 = tf.tile(lambda_2[tf.newaxis, :], [batch_size, 1]) 73 | nu = tf.tile(nu[tf.newaxis, :, :], [batch_size, 1, 1]) 74 | omega = tf.tile(omega[tf.newaxis, :, :], [batch_size, 1, 1]) 75 | 76 | # zeta will be the first in the update distribution 77 | # so this initialization is only necessary for ELBO calculation 78 | alpha_vec = tf.fill([batch_size, self.T], self.alpha) 79 | zeta_dist = tf.distributions.Dirichlet(alpha_vec) 80 | #zeta: batch_size N T 81 | zeta = tf.transpose(zeta_dist.sample([N]),perm=[1,0,2]) 82 | #mask: batch_size N 83 | if use_mask: 84 | mask = tf.cast(tf.logical_not(tf.reduce_all(tf.equal(X,0),axis=2)), dtype=tf.float32) 85 | else: 86 | mask = tf.ones([batch_size, N]) 87 | 88 | return a, b, lambda_1, lambda_2, nu, omega, zeta, mask 89 | 90 | ######### UPDATE EQUATIONS ########################################################################################################## 91 | 92 | def update_lambda(self, zeta_mask): 93 | ##lambda_1: only sum over datapoints 94 | #nu_z batch N T 95 | #embedding_weights batch N 96 | lambda_1 = 1.0 + tf.reduce_sum(zeta_mask, axis=1) #over N 97 | ##lambda_2: requires sum over classes as well as datapoints 98 | #nu_z: batch N T 99 | l = tf.cumsum(zeta_mask, axis=2, reverse=True, exclusive=True) #over T 100 | lambda_2 = self.alpha + tf.reduce_sum(l, axis=1) #over N 101 | return lambda_1, lambda_2 102 | 103 | def update_nu(self, a, b, zeta_mask, X): 104 | # nu_z batch N T newaxis 105 | # a batch newaxis T D 106 | # b batch newaxis T D 107 | w = tf.divide(tf.multiply(zeta_mask[:, :, :, tf.newaxis], 108 | a[:, tf.newaxis, :, :]), b[:, tf.newaxis,:, :]) 109 | #w : batch N T D 110 | #X : batch N newaxis D 111 | numer = tf.reduce_sum(tf.multiply(w, X[:, :, tf.newaxis, :]), axis=1) #over N 112 | denom = 1.0 + tf.reduce_sum(w, axis=1) #over N 113 | # numer batch T D 114 | # denom batch T D 115 | nu = tf.divide(numer, denom) 116 | return nu 117 | 118 | def update_omega(self, a, b, zeta_mask): 119 | 120 | # a batch newaxis T D 121 | # b batch newaxis T D 122 | ratio = tf.multiply(tf.divide(a, b), self.gaussian_const)[:, tf.newaxis, :, :] 123 | # nu_z batch N T newaxis 124 | #WANT: omega batch T D 125 | omega = 1.0 + 
tf.reduce_sum( tf.multiply(zeta_mask[:, :, :, tf.newaxis], ratio) , axis=1) #over N 126 | 127 | return omega 128 | 129 | def update_ab(self, nu, omega, zeta_mask, X): 130 | #nu_z_masked batch N T 131 | #a batch T 132 | a = 1.0 + tf.multiply(0.5, tf.reduce_sum(zeta_mask, axis=1))#over N 133 | #a batch T D 134 | a = tf.tile(a[:, :, tf.newaxis], [1, 1, self.D]) 135 | 136 | #X batch N newaxis D 137 | #nu batch newaxis T D 138 | #squared_difference batch N T D 139 | squared_difference = tf.square(X[:,:,tf.newaxis,:] - nu[:,tf.newaxis,:,:]) 140 | #omega batch newaxis T D 141 | s = squared_difference + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :, :]) 142 | #zeta_mask batch N T newaxis 143 | #b batch T D 144 | b = 1.0 + 0.5*tf.reduce_sum(tf.multiply(zeta_mask[:, :, :, tf.newaxis], s), axis=1) #over N 145 | 146 | return a, b 147 | 148 | def eta_x(self, a, b, nu, omega, X): 149 | """ 150 | eta_x_{i, k, d} = E_q[log P(x_{i,d} | z_i = k, mu_{k,d}, var_{k,d})] 151 | eta_x: 152 | 153 | """ 154 | 155 | #a batch_size, T, D 156 | #b batch_size, T, D 157 | ab1 = tf.multiply(-0.5, tf.log(2*np.pi) - tf.digamma(a) + tf.log(b)) 158 | ab2 = tf.divide(a, -2.0*b) 159 | 160 | # X: batch N newaxis D 161 | # mu: batch newaxis T D 162 | squared_difference = tf.square(tf.subtract(X[:, :, tf.newaxis, :], nu[:, tf.newaxis, :, :])) 163 | #omega: batch newaxis T D 164 | s = squared_difference + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :, :]) 165 | #s: batch N T D 166 | #ab1 batch_size, newaxis, T, D 167 | #ab2 batch_size, newaxis, T, D 168 | Eq = ab1[:, tf.newaxis, :, :] + tf.multiply(ab2[:,tf.newaxis, :, :], s) 169 | 170 | return Eq 171 | 172 | def eta_z(self, lambda_1, lambda_2): 173 | #lambda_1, lambda_2: batch_size, T 174 | d1 = tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2) 175 | d2 = tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2) 176 | d_cumsum = tf.cumsum(d2, axis=1, exclusive=True) 177 | return d1 + d_cumsum 178 | 179 | def update_zeta(self, a, b, lambda_1, lambda_2, nu, omega, X): 180 | 181 | #self.eta_x: batch N T D --> sum over D 182 | #self.eta_z: batch newaxis T 183 | 184 | prop_log_zeta = self.eta_z(lambda_1, lambda_2)[:, tf.newaxis, :] - 1. + tf.reduce_sum(self.eta_x(a, b, nu, omega, X),axis=3)#over D 185 | #prop_log_nu_z batch N T 186 | log_zeta = tf.subtract(prop_log_zeta, tf.reduce_logsumexp(prop_log_zeta, axis=2, keepdims=True)) #over T 187 | zeta = tf.exp(log_zeta) 188 | 189 | return zeta 190 | 191 | def update_all(self, L, dataset): 192 | 193 | zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, dataset.X) 194 | zeta_mask = tf.multiply(dataset.mask[:,:,tf.newaxis],zeta) 195 | lambda_1, lambda_2 = self.update_lambda(zeta_mask) 196 | nu = self.update_nu(L.a, L.b, zeta_mask, dataset.X) 197 | omega = self.update_omega(L.a, L.b, zeta_mask) 198 | a, b = self.update_ab(nu, omega, zeta_mask, dataset.X) #might have to mess with the order of these 199 | 200 | return a, b, lambda_1, lambda_2, nu, omega, zeta 201 | 202 | ######### VARIATIONAL LOWER BOUND ########################################################################################################## 203 | 204 | def phi_lower_bound_term(self, lambda_1, lambda_2): 205 | """ 206 | lambda_1: [batch_size, T] 207 | lambda_2: [batch_size, T] 208 | """ 209 | term1 = tf.lgamma(1. 
+ self.alpha) - tf.lgamma(self.alpha) 210 | term2 = (self.alpha - 1.)*(tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 211 | term3 = -1*tf.lgamma(lambda_1 + lambda_2) + tf.lgamma(lambda_1) + tf.lgamma(lambda_2) 212 | term4 = tf.multiply(lambda_1 - 1., tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2)) 213 | term5 = tf.multiply(lambda_2 - 1., tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 214 | #sum over clusters 215 | vb = tf.reduce_sum(term1 + term2 + term3 - term4 - term5, axis=1) 216 | return vb 217 | 218 | def mu_lower_bound_term(self, nu, omega): 219 | #nu: [batch_size, T, D] 220 | #omega [batch_size, T, D] 221 | summand = tf.divide(1.0, omega) + tf.square(nu) + tf.log(omega) - 1.0 222 | tot = -0.5 * tf.reduce_sum(summand, axis=2) 223 | vb = tf.reduce_sum(tot, axis=1) #over number of clusters 224 | return vb 225 | 226 | def tau_lower_bound_term(self, a, b): 227 | #a, b: [batch_size, T, D] 228 | tot = tf.lgamma(a) - tf.multiply(a - 1.,tf.digamma(a)) - tf.log(b) + a - tf.divide(a, b) 229 | vb = tf.reduce_sum(tot, axis=[1,2]) #sum over clusters and dimensions 230 | return vb 231 | 232 | def z_lower_bound_term(self, lambda_1, lambda_2, zeta, mask): 233 | #lambda_1: [batch_size, T] 234 | #lambda_2: [batch_size, T] 235 | #zeta: [batch_size, N, T] 236 | c = -tf.log(zeta + self.log_constant) + self.eta_z(lambda_1, lambda_2)[:,tf.newaxis,:] 237 | 238 | # batch_Size N T --> batch N 239 | e = tf.reduce_sum(tf.multiply(zeta, c),axis=2) #over clusters 240 | e_mask = tf.multiply(e, mask) 241 | vb = tf.reduce_sum(e_mask, axis=1) #over I 242 | 243 | return vb 244 | 245 | def x_lower_bound_term(self, a, b, nu, omega, zeta_mask, X): 246 | #X: batch_size, N, D 247 | #self.eta_x: batch N T D 248 | EqLogPxGivenZ = self.eta_x(a, b, nu, omega, X) 249 | #zeta_mask: batch_size, N, T, newaxis 250 | tot = tf.multiply(zeta_mask[:, :, :, tf.newaxis], EqLogPxGivenZ) 251 | #tot batch_size N T D 252 | vb = tf.reduce_sum(tot, axis=[1,2,3]) 253 | return vb 254 | 255 | def evidence_lower_bound(self, L, D): 256 | phi_lb = self.phi_lower_bound_term(L.lambda_1, L.lambda_2) 257 | mu_lb = self.mu_lower_bound_term(L.nu, L.omega) 258 | tau_lb = self.tau_lower_bound_term(L.a, L.b) 259 | z_lb = self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask) 260 | x_lb = self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:,:,tf.newaxis],L.zeta), D.X) 261 | return phi_lb + mu_lb + tau_lb + z_lb + x_lb 262 | 263 | ######### INFERENCE FUNCTIONS ########################################################################################################## 264 | 265 | def infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask): 266 | """ 267 | Performs variational inference in DPGMM for n_iter number of iterations, 268 | then returns inferred latent variables 269 | 270 | _a, _b, _lambda_1, _lambda_2, _nu, _zeta: initial parameters for inference 271 | X: data matrix (batch_size x nDatapoints x dimensions) 272 | mask: 1 if consider as datapoint, 0 if ignore (batch_size x nDatapoints) 273 | """ 274 | 275 | ##Initial input into "while" loop, i.e., inference iterations 276 | i = tf.constant(0) 277 | latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta']) 278 | dataset = collections.namedtuple('dataset', ['X', 'mask']) 279 | init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) 280 | 281 | cond = lambda i, L, D: i < self.n_iter 282 | def body(i, L, D): 283 | a, b, lambda_1, lambda_2, nu, omega, zeta = 
self.update_all(L, D) 284 | return (i + 1, latents(a, b, lambda_1, lambda_2, nu, omega, zeta), D) 285 | 286 | final_iteration = tf.while_loop(cond, body, init_iteration) 287 | 288 | return final_iteration[1] 289 | 290 | def elbo_infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask, batch_size): 291 | """ 292 | Performs variational inference in DPGMM for n_iter number of iterations, 293 | and also calculates the change in ELBO at each update 294 | returns inferred latent variables and changes in ELBO 295 | """ 296 | 297 | i = tf.constant(0) 298 | latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta']) 299 | dataset = collections.namedtuple('dataset', ['X', 'mask']) 300 | #ELBO term names: "updated-variable_term-of-lower-bound" 301 | ELBO_terms = ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total'] 302 | empty_ELBO_terms = tuple([tf.TensorArray(dtype=tf.float32, size=self.n_iter, element_shape=batch_size, name=ELBO_terms[j]) for j in range(11)]) 303 | init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) + empty_ELBO_terms 304 | 305 | cond = lambda i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total: i < self.n_iter 306 | def body(i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total): 307 | 308 | zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, D.X) 309 | zeta_mask = tf.multiply(D.mask[:,:,tf.newaxis], zeta) 310 | zeta_z = zeta_z.write(i, self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask)-self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask)) 311 | zeta_x = zeta_x.write(i, self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:,:,tf.newaxis],L.zeta), D.X)) 312 | 313 | lambda_1, lambda_2 = self.update_lambda(zeta_mask) 314 | lambda_phi = lambda_phi.write(i, self.phi_lower_bound_term(lambda_1, lambda_2) - self.phi_lower_bound_term(L.lambda_1, L.lambda_2)) 315 | lambda_z = lambda_z.write(i,self.z_lower_bound_term(lambda_1, lambda_2, zeta, D.mask)-self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask)) 316 | 317 | nu = self.update_nu(L.a, L.b, zeta_mask, D.X) 318 | nu_mu = nu_mu.write(i, self.mu_lower_bound_term(nu, L.omega)-self.mu_lower_bound_term(L.nu, L.omega)) 319 | nu_x = nu_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X)) 320 | 321 | omega = self.update_omega(L.a, L.b, zeta_mask) 322 | omega_mu = omega_mu.write(i, self.mu_lower_bound_term(nu, omega)-self.mu_lower_bound_term(nu, L.omega)) 323 | omega_x = omega_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X)) 324 | 325 | a, b = self.update_ab(nu, omega, zeta_mask, D.X) 326 | ab_tau = ab_tau.write(i, self.tau_lower_bound_term(a, b) - self.tau_lower_bound_term(L.a, L.b)) 327 | ab_x = ab_x.write(i, self.x_lower_bound_term(a, b, nu, omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X)) 328 | 329 | updated_L = latents(a, b, lambda_1, lambda_2, nu, omega, zeta) 330 | total = total.write(i, self.evidence_lower_bound(updated_L, D) - self.evidence_lower_bound(L, D)) 331 | 332 | return (i+1, updated_L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, 
omega_mu, omega_x, ab_tau, ab_x, total) 333 | 334 | final_iteration = tf.while_loop(cond, body, init_iteration) 335 | 336 | return final_iteration[1], [final_iteration[i].stack() for i in range(3,14)] 337 | 338 | def variational_inference(data, alpha=1.0, T=10, n_iter=10, tf_seed=None, get_elbo=False, tf_device='/cpu:0'): 339 | """ 340 | Tensorflow setup to run variational inference 341 | 342 | data: matrix of datapoints, size should be (batch_size, max_number_of_datapoints, dimensionality_of_data) 343 | batches that have different number of datapoints can be run together by padding the smaller data matrices with zero vectors 344 | alpha: Dirichlet concentration parameter 345 | T: truncation parameter (maximum number of mixture components) 346 | n_iter: number of iterations to run VI for 347 | get_elbo: if True, measure & return the change in ELBO for each update 348 | 349 | """ 350 | 351 | #size of dataset 352 | batch_size = np.shape(data)[0] 353 | N = np.shape(data)[1] 354 | D = np.shape(data)[2] 355 | 356 | with tf.Graph().as_default(): 357 | with tf.device(tf_device): 358 | 359 | tf.set_random_seed(tf_seed) 360 | X = tf.placeholder(tf.float32, shape=[batch_size, N, D]) 361 | 362 | mixture_model = dpgmm(alpha, D, n_iter, T) 363 | init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, mask = mixture_model.initialize_latents(X, batch_size, shared=False) 364 | 365 | if not get_elbo: 366 | inferred_latents = mixture_model.infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask) 367 | else: 368 | inferred_latents, ELBO_deltas = mixture_model.elbo_infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask, batch_size) 369 | 370 | ##Run graph 371 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: 372 | 373 | sess.run(tf.global_variables_initializer()) 374 | if not get_elbo: 375 | inferred_latents_out = sess.run([inferred_latents], feed_dict = {X: data}) 376 | return inferred_latents_out[0] 377 | else: 378 | inferred_latents_out, ELBO_deltas_out = sess.run([inferred_latents, ELBO_deltas], feed_dict = {X: data}) 379 | ELBO_terms = collections.namedtuple('ELBO_terms', ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total']) 380 | return inferred_latents_out, ELBO_terms(*ELBO_deltas_out) 381 | -------------------------------------------------------------------------------- /spherical/README.md: -------------------------------------------------------------------------------- 1 | # tf_dpgmm (spherical covariance) 2 | If any datapoints are equal to the zero vector, they will be ignored. See the use of ```zeta_mask``` in ```dpgmm_vi.py```. This enables the use of differently sized datasets (because you can pad the smaller ones with zero vectors), but may not be what you want!
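A minimal sketch of that padding convention (illustrative only; names like ```x_small``` and ```pad``` are not part of the repo, and ```demos.py``` contains complete examples):

```python
import numpy as np
from dpgmm_vi import variational_inference

# two datasets of different sizes, both in R^2
x_small = np.float32(np.random.randn(25, 2))
x_large = np.float32(np.random.randn(40, 2) + 3.)

# pad the smaller dataset with zero vectors so both have the same N,
# then stack into a single (batch_size, N, D) array
N = max(len(x_small), len(x_large))
pad = lambda x: np.vstack([x, np.zeros((N - len(x), x.shape[1]), dtype=np.float32)])
data = np.stack([pad(x_small), pad(x_large)])

# zero rows are masked out internally (see zeta_mask in dpgmm_vi.py)
latents = variational_inference(data, alpha=1.0, T=20, n_iter=20)

# latents.zeta has shape (batch_size, N, T); take the MAP cluster per real datapoint
assignments = np.argmax(latents.zeta[0, :len(x_small), :], axis=1)
```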
-------------------------------------------------------------------------------- /spherical/Variational_Inference_in_DPGMM_Spherical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcusi/tf_dpgmm/32424fa6db31561e8b5a322c5893df5814fc43f7/spherical/Variational_Inference_in_DPGMM_Spherical.pdf -------------------------------------------------------------------------------- /spherical/bound_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm, gamma, rv_discrete, beta 3 | from scipy.special import digamma 4 | from scipy.special import gamma as gammaf 5 | 6 | lgamma = lambda x: np.log(gammaf(x)) 7 | 8 | """ 9 | SPHERICAL COVARIANCE 10 | 11 | Code to check analytical derivations of the ELBO in Variational_Inference_in_DPGMM_Spherical.pdf against Monte Carlo estimates 12 | 13 | python bound_check.py 14 | 15 | lbh@mit.edu, october 2018 16 | """ 17 | 18 | np.random.seed(0) 19 | 20 | D=3 21 | K=2 22 | 23 | nu = np.random.randn(K, D) 24 | omega = np.random.random([K]) + 1 25 | zeta = np.array([0.2, 0.8]) 26 | 27 | a = np.ones([K],dtype=np.float32) 28 | b = 1.5*np.ones([K],dtype=np.float32) 29 | 30 | lambda1 = np.ones(K,dtype=np.float32) 31 | lambda2 = 2.*np.ones(K,dtype=np.float32) 32 | 33 | alpha = 3. 34 | p_phi = beta(1., alpha) 35 | p_mu = norm 36 | p_tau = gamma(a=1., scale=1.) 37 | def log_p_z(phi, z): 38 | p = np.concatenate([[1], np.cumprod(1-phi[:-1])]) * phi 39 | return np.log(p[z]) 40 | def p_x(z, mu, tau): return norm(loc=mu[z], scale=np.sqrt(1./tau[z])) 41 | 42 | q_phi = beta(lambda1, lambda2) 43 | q_mu = norm(loc=nu, scale=1./np.sqrt(omega[:, None])) 44 | q_tau = gamma(a=a, scale=1./b) #tau is precision! 45 | q_z = rv_discrete(values=(range(K), zeta)) 46 | 47 | x = np.array([1., 2., 3.]) 48 | 49 | N = 200001 50 | 51 | print('Spherical Covariance Model') 52 | 53 | # ####### mu term in the ELBO ######## 54 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 55 | # Analytical 56 | # OLD: 57 | # bound = sum(-0.5*nu_k**2 for nu_k in nu) 58 | # NEW: 59 | bound = sum( #over k 60 | -D/2. * (1./omega + np.log(omega) - 1) 61 | - 0.5 * (nu**2).sum(axis=1) 62 | ) 63 | print("Analytical mu term in ELBO:", bound) 64 | 65 | # Monte Carlo 66 | print('Monte Carlo estimate of mu term in ELBO:') 67 | np.random.seed() 68 | bounds = [] 69 | for i in range(N): 70 | mu = q_mu.rvs() 71 | bounds.append(sum(sum(p_mu.logpdf(mu) - q_mu.logpdf(mu)))) #Sum over K for MC estimate 72 | if i%5000 == 0: print(i, np.mean(bounds)) 73 | 74 | 75 | 76 | # ####### phi term in the ELBO ######## 77 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 78 | # Analytical 79 | bound = sum((lgamma(1.
+ alpha) - lgamma(alpha) 80 | + (alpha - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k)) 81 | - lgamma(l1_k + l2_k) + lgamma(l1_k) + lgamma(l2_k) 82 | - (l1_k - 1.)*(digamma(l1_k) - digamma(l1_k + l2_k)) 83 | - (l2_k - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k))) 84 | for (l1_k, l2_k) in zip(lambda1, lambda2)) 85 | print("Analytical phi term in ELBO:", bound) 86 | 87 | # Monte Carlo 88 | print('Monte Carlo estimate of phi term in ELBO:') 89 | np.random.seed() 90 | bounds = [] 91 | for i in range(N): 92 | phi = q_phi.rvs() 93 | 94 | bounds.append(sum(p_phi.logpdf(phi) - q_phi.logpdf(phi))) #Sum over K for MC estimate 95 | if i%5000 == 0: print(i, np.mean(bounds)) 96 | 97 | 98 | 99 | # ####### tau term in the ELBO ######## 100 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 101 | # Analytical 102 | #OLD: 103 | # bound = sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + 1 - np.divide(a_k,b_k) 104 | # for (a_k, b_k) in zip(a, b)) 105 | #NEW: 106 | bound = sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + a_k - np.divide(a_k,b_k) 107 | for (a_k, b_k) in zip(a, b)) 108 | print("Analytical tau term in ELBO:", bound) 109 | 110 | # Monte Carlo 111 | print('Monte Carlo estimate of mu term in ELBO:') 112 | np.random.seed() 113 | bounds = [] 114 | for i in range(N): 115 | tau = q_tau.rvs() 116 | 117 | bounds.append(sum(p_tau.logpdf(tau) - q_tau.logpdf(tau))) #Sum over K for MC estimate 118 | if i%5000 == 0: print(i, np.mean(bounds)) 119 | 120 | 121 | 122 | 123 | # ####### z term in the ELBO ######## 124 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 125 | # Analytical 126 | bound = sum(zeta_k*( 127 | - np.log(zeta_k) 128 | + digamma(l1_k) - digamma(l1_k+l2_k) 129 | + sum(digamma(lambda2[j]) - digamma(lambda1[j]+lambda2[j]) for j in range(k))) 130 | for (l1_k, l2_k, zeta_k, k) in zip(lambda1, lambda2, zeta, range(K))) 131 | print("Analytical z term in ELBO:", bound) 132 | 133 | # Monte Carlo 134 | print('Monte Carlo estimate of z term in ELBO:') 135 | np.random.seed() 136 | bounds = [] 137 | for i in range(N): 138 | phi = q_phi.rvs() 139 | z = q_z.rvs() 140 | 141 | bounds.append(log_p_z(phi, z) - q_z.logpmf(z)) #There's only a single datapoint, so no need for sum 142 | if i%5000 == 0: print(i, np.mean(bounds)) 143 | 144 | 145 | 146 | 147 | # ####### x term in the ELBO ######## 148 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 149 | # Analytical 150 | bound = sum( #over k 151 | zeta_k * ( 152 | -D/2. * (np.log(2 * np.pi) - digamma(ak) + np.log(bk)) 153 | -(ak/(2.*bk)) * ((x[None, :] - nu_k)**2).sum(axis=1) 154 | -(ak/(2.*bk)) * D * gammaf(3./2.) * (2*np.pi)**(-1/2.) * 2.**(3./2.) 
/ omega_k 155 | ) 156 | for (ak, bk, nu_k, zeta_k, omega_k) in zip(a, b, nu, zeta, omega)) 157 | print("Analytical x term in ELBO:", bound) 158 | 159 | # Monte Carlo 160 | print('Monte Carlo estimate of x term in ELBO:') 161 | np.random.seed() 162 | bounds = [] 163 | for i in range(N): 164 | mu = q_mu.rvs() 165 | tau = q_tau.rvs() 166 | z = q_z.rvs() 167 | 168 | bounds.append(sum(p_x(z, mu, tau).logpdf(x))) #sum over d (There's only a single datapoint, so no need for sum over i) 169 | if i%5000 == 0: print(i, np.mean(bounds)) -------------------------------------------------------------------------------- /spherical/demos.py: -------------------------------------------------------------------------------- 1 | from dpgmm_vi import variational_inference 2 | import numpy as np 3 | 4 | import matplotlib 5 | #matplotlib.use('Agg') 6 | import matplotlib.pyplot as plt 7 | from matplotlib.patches import Circle 8 | import colorsys 9 | 10 | """ 11 | SPHERICAL COVARIANCE, i.e. isotropic 12 | 13 | Plots to demonstrate use of dpgmm_vi.py 14 | 15 | python demos.py 16 | 17 | mcusi@mit.edu, july 2018 18 | """ 19 | 20 | 21 | def gen_demo_data(batch_size=1, np_seed=None, D=2, use_zeros=True): 22 | #generates data from multivariate gaussian with isotropic covariance matrix 23 | Nmax = 6*25 24 | np.random.seed(np_seed) 25 | for b in range(batch_size): 26 | K = np.random.randint(2, high=6+1) 27 | in_dataset = 0 28 | for k in range(K): 29 | mean = 3.5*np.random.randn(D) - 1 30 | cov = 0.1*np.eye(D) + [0.25,1][np.random.randint(2)]*np.eye(D) 31 | n = np.random.randint(10, high=25+1) 32 | in_dataset += n 33 | gaussian_data = np.random.multivariate_normal(mean, cov, n) 34 | _data = gaussian_data if k == 0 else np.vstack((_data, gaussian_data)) 35 | n_zeros = Nmax - in_dataset ##use zeros to pad smaller datasets 36 | _data = np.vstack((np.zeros((n_zeros,D)),_data)) 37 | np.random.shuffle(_data) 38 | _data = np.float32(_data[np.newaxis,:,:]) 39 | data = _data if b == 0 else np.concatenate((data,_data)) 40 | 41 | return data 42 | 43 | def ELBO_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 44 | 45 | #Generate toy data 46 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 47 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 48 | 49 | #Run inference 50 | inferred_latents, ELBO_deltas = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=True) 51 | 52 | #Plot change in ELBO with updates 53 | ##If you use [1:] after each of the plot arguments, 54 | ##You can see that the change is still positive after the first iteration 55 | plt.plot(ELBO_deltas.total); plt.title('Change in ELBO with each set of updates'); plt.show(); 56 | 57 | plt.plot(ELBO_deltas.zeta_z + ELBO_deltas.zeta_x); plt.plot(ELBO_deltas.zeta_z); plt.plot(ELBO_deltas.zeta_x); 58 | plt.legend(['z+x', 'z', 'x'], loc='upper right'); plt.title('Change in z&x ELBO terms due to zeta update'); plt.show() 59 | 60 | plt.plot(ELBO_deltas.lambda_z + ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_phi); 61 | plt.legend(['z+phi', 'z', 'phi'], loc='upper right'); plt.title('Change in z&phi ELBO terms due to lambda update'); plt.show() 62 | 63 | plt.plot(ELBO_deltas.nu_mu + ELBO_deltas.nu_x); plt.plot(ELBO_deltas.nu_mu); plt.plot(ELBO_deltas.nu_x); 64 | plt.legend(['mu+x', 'mu', 'x'], loc='upper right'); plt.title('Change in mu&x ELBO terms due to nu update'); plt.show() 65 | 66 | plt.plot(ELBO_deltas.ab_tau + ELBO_deltas.ab_x); plt.plot(ELBO_deltas.ab_tau); 
plt.plot(ELBO_deltas.ab_x); 67 | plt.legend(['tau+x', 'tau', 'x'], loc='upper right'); plt.title('Change in tau&x ELBO terms due to ab update'); plt.show() 68 | 69 | ##Plot each datapoint with a colour corresponding to the variational cluster to which it is assigned with maximum probability 70 | batch_number = 0 71 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 72 | inferred_zeta = inferred_latents.zeta[batch_number,nonzero_datapoints,:] 73 | assignments = np.argmax(inferred_zeta, axis=1) 74 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 75 | plt.gca().set_xlim([-10,10]) 76 | plt.gca().set_ylim([-10,10]) 77 | plt.gca().set_aspect('equal', adjustable='box') 78 | plt.title('MAP assignments of datapoints to clusters') 79 | plt.show() 80 | 81 | def clusters_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 82 | 83 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 84 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 85 | 86 | for n_iter in [0, 1, 2, 5, 10, max_n_iter]: 87 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=n_iter, tf_seed=tf_seed, get_elbo=False) 88 | 89 | #Plot means and datapoints as points 90 | batch_number = 0 91 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 92 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1], marker='x') 93 | plt.scatter(inferred_latents.nu[batch_number,:,0] + 0.01*np.random.randn(T), inferred_latents.nu[batch_number,:,1],marker='o',s=30,color='r') 94 | 95 | #Plot expected standard deviation as radius of circle 96 | patches = []; 97 | radii = np.sqrt(1./np.divide(inferred_latents.a, inferred_latents.b)) 98 | 99 | #Plot marginal cluster probabilities as the transparency of circle 100 | l1 = inferred_latents.lambda_1[batch_number,:] 101 | l2 = inferred_latents.lambda_2[batch_number,:] 102 | beta_means = np.divide(l1,l1 + l2) 103 | log_beta_means = np.log(beta_means + 1e-30) 104 | cs = np.concatenate(( [0], np.cumsum( np.log(1-beta_means+1e-30)[:-1]) )) #SBP 105 | beta_expectation = np.exp(log_beta_means + cs) 106 | beta_expectation /= (1.*np.sum(beta_expectation)) 107 | for k in range(T): 108 | circle = Circle((inferred_latents.nu[batch_number,k,0], inferred_latents.nu[batch_number,k,1]), radii[batch_number,k]) 109 | plt.gca().add_artist(circle) 110 | circle.set_alpha(beta_expectation[k]) 111 | plt.gca().set_xlim([-10,10]) 112 | plt.gca().set_ylim([-10,10]) 113 | plt.gca().set_aspect('equal', adjustable='box') 114 | plt.title('Variational distributions at iteration ' + str(n_iter)) 115 | plt.show() 116 | 117 | def batch_demo(batch_size=2, np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 118 | 119 | data = gen_demo_data(batch_size = batch_size, np_seed = np_seed) 120 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 121 | 122 | #Run inference 123 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=False) 124 | 125 | for batch_number in range(batch_size): 126 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 127 | inferred_zeta = inferred_latents.zeta[batch_number, nonzero_datapoints, :] 128 | 129 | #plot weighted points 130 | 
#https://stackoverflow.com/questions/41314736/scatterplot-wherein-each-point-color-is-a-different-mixture-of-k-colors 131 | HSV = [(x*1.0/T, 0.8, 0.5) for x in np.random.permutation(T)] 132 | RGB = np.array([colorsys.hsv_to_rgb(*hsv) for hsv in HSV]) #list comprehension (rather than map) so this also works under Python 3 133 | assignments = np.sum(np.multiply(RGB[np.newaxis, :, :], inferred_zeta[:, :, np.newaxis]),axis=1) 134 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 135 | plt.title('Weighted assignments, batch ' + str(batch_number)) 136 | plt.gca().set_xlim([-10,10]) 137 | plt.gca().set_ylim([-10,10]) 138 | plt.gca().set_aspect('equal', adjustable='box') 139 | plt.show() 140 | 141 | if __name__ == "__main__": 142 | print('Spherical Covariance Model') 143 | np_seed = 23; tf_seed = 100; alpha=1.0; T=100; max_n_iter=20; 144 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 145 | print('Change in ELBO with each iteration of updates:') 146 | ELBO_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 147 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 148 | print('Change in latent parameters with increasing number of updates:') 149 | clusters_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 150 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 151 | print('This script can use batches of different datasets:') 152 | batch_demo(batch_size = 2, np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 153 | 154 | -------------------------------------------------------------------------------- /spherical/dpgmm_vi.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from scipy.special import gamma as gamma_func 3 | import numpy as np 4 | import collections 5 | 6 | """ 7 | SPHERICAL COVARIANCE 8 | 9 | Unsupervised clustering in R^D 10 | 11 | TF implementation of variational inference in a 12 | Dirichlet Process isotropic Gaussian mixture model 13 | Derivation in Variational_Inference_in_DPGMM_Spherical.pdf included in git repo 14 | 15 | Clusters matrix X (batch_size x N x D) of N datapoints with dimensionality D 16 | If a datapoint = zero vector, it is ignored. 17 | This option allows you to use batched datasets of different sizes 18 | 19 | For examples of how to use this code, see demos.py 20 | 21 | mcusi@mit.edu, july 2018 22 | 23 | """ 24 | 25 | class dpgmm(): 26 | 27 | ######### INITIALIZATION ########################################################################################################## 28 | 29 | def __init__(self, alpha, D, n_iter, T, covariance_type='spherical'): 30 | 31 | self.alpha = alpha; #Dirichlet concentration parameter 32 | self.D = D; self.Dfl = tf.cast(self.D, dtype=tf.float32); #dimensionality of data 33 | self.T = T; #truncation value 34 | 35 | self.gaussian_const = np.divide(gamma_func(1.5)*(2.0**1.5)*self.D, np.sqrt(2.*np.pi)) 36 | 37 | #Initialization settings 38 | self.mu_std = 5.
39 | 40 | #inference settings 41 | self.n_iter = n_iter; 42 | self.log_constant = 1e-30 43 | 44 | def initialize_latents(self, X, batch_size, shared=True, use_mask=True): 45 | """ 46 | > randomly initializes variational distribution parameters 47 | > if shared == True, batches share the same initialization 48 | """ 49 | 50 | N = tf.shape(X)[1] 51 | shape_T = [self.T] if shared else [batch_size, self.T] 52 | shape_TD = [self.T, self.D] if shared else [batch_size, self.T, self.D] 53 | 54 | a = tf.get_variable("a", shape_T, dtype=tf.float32, 55 | initializer=tf.ones_initializer()) 56 | b = tf.get_variable("b", shape_T, dtype=tf.float32, 57 | initializer=tf.ones_initializer()) 58 | lambda_1 = tf.get_variable("lambda_1", shape_T, dtype=tf.float32, 59 | initializer=tf.ones_initializer()) 60 | lambda_2 = tf.get_variable("lambda_2", shape_T, dtype=tf.float32, 61 | initializer=tf.ones_initializer()) 62 | nu = tf.get_variable("nu", shape_TD, dtype=tf.float32, 63 | initializer=tf.random_normal_initializer(stddev=self.mu_std)) 64 | omega = tf.get_variable("omega", shape_T, dtype=tf.float32, 65 | initializer=tf.ones_initializer()) 66 | 67 | if shared: 68 | 69 | a = tf.tile(a[tf.newaxis, :], [batch_size, 1]) 70 | b = tf.tile(b[tf.newaxis, :], [batch_size, 1]) 71 | lambda_1 = tf.tile(lambda_1[tf.newaxis, :], [batch_size, 1]) 72 | lambda_2 = tf.tile(lambda_2[tf.newaxis, :], [batch_size, 1]) 73 | nu = tf.tile(nu[tf.newaxis, :, :], [batch_size, 1, 1]) 74 | omega = tf.tile(omega[tf.newaxis, :], [batch_size, 1]) 75 | 76 | # zeta will be the first in the update distribution 77 | # so this initialization is only necessary for ELBO calculation 78 | alpha_vec = tf.fill([batch_size, self.T], self.alpha) 79 | zeta_dist = tf.distributions.Dirichlet(alpha_vec) 80 | #zeta: batch_size N T 81 | zeta = tf.transpose(zeta_dist.sample([N]),perm=[1,0,2]) 82 | #mask: batch_size N 83 | if use_mask: 84 | mask = tf.cast(tf.logical_not(tf.reduce_all(tf.equal(X,0),axis=2)), dtype=tf.float32) 85 | else: 86 | mask = tf.ones([batch_size, N]) 87 | 88 | return a, b, lambda_1, lambda_2, nu, omega, zeta, mask 89 | 90 | ######### UPDATE EQUATIONS ########################################################################################################## 91 | 92 | def update_lambda(self, zeta_mask): 93 | ##lambda_1: only sum over datapoints 94 | #nu_z batch N T 95 | #embedding_weights batch N 96 | lambda_1 = 1.0 + tf.reduce_sum(zeta_mask, axis=1) #over N 97 | ##lambda_2: requires sum over classes as well as datapoints 98 | #nu_z: batch N T 99 | l = tf.cumsum(zeta_mask, axis=2, reverse=True, exclusive=True) #over T 100 | lambda_2 = self.alpha + tf.reduce_sum(l, axis=1) #over N 101 | return lambda_1, lambda_2 102 | 103 | def update_nu(self, a, b, zeta_mask, X): 104 | # nu_z batch N T 105 | # a batch newaxis T 106 | # b batch newaxis T 107 | w = tf.divide(tf.multiply(zeta_mask, 108 | a[:, tf.newaxis, :]), b[:, tf.newaxis,:])[:, :, :, tf.newaxis] 109 | #w : batch N T newaxis 110 | #X : batch N newaxis D 111 | numer = tf.reduce_sum(tf.multiply(w, X[:, :, tf.newaxis, :]), axis=1) #over N 112 | denom = 1.0 + tf.reduce_sum(w, axis=1) #over N 113 | # numer batch T D 114 | # denom batch T D 115 | nu = tf.divide(numer, denom) 116 | return nu 117 | 118 | def update_omega(self, a, b, zeta_mask): 119 | 120 | # a batch newaxis T 121 | # b batch newaxis T 122 | ratio = tf.multiply(tf.divide(a, b), self.gaussian_const)[:, tf.newaxis, :] 123 | # nu_z batch N T 124 | #WANT: omega batch T 125 | omega = 1.0 + tf.reduce_sum( tf.multiply(zeta_mask, ratio) , 
axis=1) #over N 126 | return omega 127 | 128 | def update_ab(self, nu, omega, zeta_mask, X): 129 | #nu_z_masked batch N T 130 | #a batch T 131 | a = 1.0 + tf.multiply(self.Dfl/2.0, tf.reduce_sum(zeta_mask, axis=1))#over N 132 | 133 | #X batch N newaxis D 134 | #nu batch newaxis T D 135 | #difference norm batch N T 136 | difference_norm = tf.reduce_sum(tf.square(X[:,:,tf.newaxis,:] - nu[:,tf.newaxis,:,:]),axis=3) 137 | #omega batch newaxis T 138 | s = difference_norm + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :]) 139 | #s batch N T 140 | #zeta_mask batch N T 141 | #b batch T 142 | b = 1.0 + 0.5*tf.reduce_sum(tf.multiply(zeta_mask, s), axis=1) #over N 143 | 144 | return a, b 145 | 146 | def eta_x(self, a, b, nu, omega, X): 147 | """ 148 | eta_x_{i, k} = E_q[log P(x_{i} | z_i = k, mu_{k}, var_{k})] 149 | 150 | """ 151 | 152 | #a batch_size, T 153 | #b batch_size, T 154 | ab1 = tf.multiply(-self.Dfl/2.0, tf.log(2*np.pi) - tf.digamma(a) + tf.log(b))[:,tf.newaxis, :] 155 | ab2 = tf.divide(a, -2.0*b)[:,tf.newaxis, :] 156 | 157 | # X: batch N newaxis D 158 | # mu: batch newaxis T D 159 | norm_difference = tf.reduce_sum(tf.square(tf.subtract(X[:, :, tf.newaxis, :], nu[:, tf.newaxis, :, :])),axis=3) 160 | #omega: batch newaxis T 161 | s = norm_difference + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :]) 162 | #s: batch N T 163 | #ab1 batch_size, newaxis, T, 164 | #ab2 batch_size, newaxis, T, 165 | Eq = ab1 + tf.multiply(ab2, s) 166 | 167 | return Eq 168 | 169 | def eta_z(self, lambda_1, lambda_2): 170 | #lambda_1, lambda_2: batch_size, T 171 | d1 = tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2) 172 | d2 = tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2) 173 | d_cumsum = tf.cumsum(d2, axis=1, exclusive=True) 174 | return d1 + d_cumsum 175 | 176 | def update_zeta(self, a, b, lambda_1, lambda_2, nu, omega, X): 177 | 178 | #self.eta_x: batch N T 179 | #self.eta_z: batch newaxis T 180 | 181 | prop_log_zeta = self.eta_z(lambda_1, lambda_2)[:, tf.newaxis, :] - 1. + self.eta_x(a, b, nu, omega, X) 182 | #prop_log_nu_z batch N T 183 | log_zeta = tf.subtract(prop_log_zeta, tf.reduce_logsumexp(prop_log_zeta, axis=2, keepdims=True)) #over T 184 | zeta = tf.exp(log_zeta) 185 | 186 | return zeta 187 | 188 | def update_all(self, L, dataset): 189 | 190 | 191 | zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, dataset.X) 192 | zeta_mask = tf.multiply(dataset.mask[:,:,tf.newaxis],zeta) 193 | lambda_1, lambda_2 = self.update_lambda(zeta_mask) 194 | nu = self.update_nu(L.a, L.b, zeta_mask, dataset.X) 195 | omega = self.update_omega(L.a, L.b, zeta_mask) 196 | a, b = self.update_ab(nu, omega, zeta_mask, dataset.X) #might have to mess with the order of these 197 | 198 | return a, b, lambda_1, lambda_2, nu, omega, zeta 199 | 200 | ######### VARIATIONAL LOWER BOUND ########################################################################################################## 201 | 202 | def phi_lower_bound_term(self, lambda_1, lambda_2): 203 | """ 204 | lambda_1: [batch_size, T] 205 | lambda_2: [batch_size, T] 206 | """ 207 | term1 = tf.lgamma(1. 
+ self.alpha) - tf.lgamma(self.alpha) 208 | term2 = (self.alpha - 1.)*(tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 209 | term3 = -1*tf.lgamma(lambda_1 + lambda_2) + tf.lgamma(lambda_1) + tf.lgamma(lambda_2) 210 | term4 = tf.multiply(lambda_1 - 1., tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2)) 211 | term5 = tf.multiply(lambda_2 - 1., tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 212 | #sum over clusters 213 | vb = tf.reduce_sum(term1 + term2 + term3 - term4 - term5, axis=1) 214 | return vb 215 | 216 | def mu_lower_bound_term(self, nu, omega): 217 | #nu: [batch_size, T, D] 218 | #omega [batch_size, T] 219 | tot = tf.multiply(-self.Dfl/2.0, tf.divide(1.0, omega) + tf.log(omega) - 1.0) - 0.5*tf.reduce_sum(tf.square(nu),axis=2) 220 | vb = tf.reduce_sum(tot, axis=1) #over number of clusters 221 | return vb 222 | 223 | def tau_lower_bound_term(self, a, b): 224 | #a, b: [batch_size, T] 225 | tot = tf.lgamma(a) - tf.multiply(a - 1.,tf.digamma(a)) - tf.log(b) + a - tf.divide(a, b) 226 | vb = tf.reduce_sum(tot, axis=1) #sum over clusters 227 | return vb 228 | 229 | def z_lower_bound_term(self, lambda_1, lambda_2, zeta, mask): 230 | #lambda_1: [batch_size, T] 231 | #lambda_2: [batch_size, T] 232 | #zeta: [batch_size, N, T] 233 | c = -tf.log(zeta + self.log_constant) + self.eta_z(lambda_1, lambda_2)[:,tf.newaxis,:] 234 | 235 | # batch_Size N T --> batch N 236 | e = tf.reduce_sum(tf.multiply(zeta, c),axis=2) #over clusters 237 | e_mask = tf.multiply(e, mask) 238 | vb = tf.reduce_sum(e_mask, axis=1) #over I 239 | 240 | return vb 241 | 242 | def x_lower_bound_term(self, a, b, nu, omega, zeta_mask, X): 243 | #X: batch_size, N, D 244 | #self.eta_x: batch N T 245 | EqLogPxGivenZ = self.eta_x(a, b, nu, omega, X) 246 | #zeta_mask: batch_size, N, T 247 | tot = tf.multiply(zeta_mask, EqLogPxGivenZ) 248 | #tot batch_size N T 249 | vb = tf.reduce_sum(tot, axis=[1,2]) 250 | return vb 251 | 252 | def evidence_lower_bound(self, L, D): 253 | phi_lb = self.phi_lower_bound_term(L.lambda_1, L.lambda_2) 254 | mu_lb = self.mu_lower_bound_term(L.nu, L.omega) 255 | tau_lb = self.tau_lower_bound_term(L.a, L.b) 256 | z_lb = self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask) 257 | x_lb = self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:,:,tf.newaxis],L.zeta), D.X) 258 | return phi_lb + mu_lb + tau_lb + z_lb + x_lb 259 | 260 | ######### INFERENCE FUNCTIONS ########################################################################################################## 261 | 262 | def infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask): 263 | """ 264 | Performs variational inference in DPGMM for n_iter number of iterations, 265 | then returns inferred latent variables 266 | 267 | _a, _b, _lambda_1, _lambda_2, _nu, _zeta: initial parameters for inference 268 | X: data matrix (batch_size x nDatapoints x dimensions) 269 | mask: 1 if consider as datapoint, 0 if ignore (batch_size x nDatapoints) 270 | """ 271 | 272 | ##Initial input into "while" loop, i.e., inference iterations 273 | i = tf.constant(0) 274 | latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta']) 275 | dataset = collections.namedtuple('dataset', ['X', 'mask']) 276 | init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) 277 | 278 | cond = lambda i, L, D: i < self.n_iter 279 | def body(i, L, D): 280 | a, b, lambda_1, lambda_2, nu, omega, zeta = self.update_all(L, D) 281 | return (i + 1, latents(a, b, 
    def evidence_lower_bound(self, L, D):
        phi_lb = self.phi_lower_bound_term(L.lambda_1, L.lambda_2)
        mu_lb = self.mu_lower_bound_term(L.nu, L.omega)
        tau_lb = self.tau_lower_bound_term(L.a, L.b)
        z_lb = self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask)
        x_lb = self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:, :, tf.newaxis], L.zeta), D.X)
        return phi_lb + mu_lb + tau_lb + z_lb + x_lb

    ######### INFERENCE FUNCTIONS ##########################################################################################################

    def infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask):
        """
        Performs variational inference in the DPGMM for n_iter iterations,
        then returns the inferred latent variables

        _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta: initial parameters for inference
        X: data matrix (batch_size x nDatapoints x dimensions)
        mask: 1 if considered a datapoint, 0 if ignored (batch_size x nDatapoints)
        """

        ##Initial input into "while" loop, i.e., inference iterations
        i = tf.constant(0)
        latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta'])
        dataset = collections.namedtuple('dataset', ['X', 'mask'])
        init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask))

        cond = lambda i, L, D: i < self.n_iter
        def body(i, L, D):
            a, b, lambda_1, lambda_2, nu, omega, zeta = self.update_all(L, D)
            return (i + 1, latents(a, b, lambda_1, lambda_2, nu, omega, zeta), D)

        final_iteration = tf.while_loop(cond, body, init_iteration)

        return final_iteration[1]

    def elbo_infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask, batch_size):
        """
        Performs variational inference in the DPGMM for n_iter iterations,
        and also calculates the change in the ELBO at each update.
        Returns the inferred latent variables and the changes in the ELBO.
        """

        i = tf.constant(0)
        latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta'])
        dataset = collections.namedtuple('dataset', ['X', 'mask'])
        #ELBO term names: "updated-variable_term-of-lower-bound"
        ELBO_terms = ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total']
        empty_ELBO_terms = tuple([tf.TensorArray(dtype=tf.float32, size=self.n_iter, element_shape=batch_size, name=ELBO_terms[j]) for j in range(11)])
        init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) + empty_ELBO_terms

        cond = lambda i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total: i < self.n_iter
        def body(i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total):

            zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, D.X)
            zeta_mask = tf.multiply(D.mask[:, :, tf.newaxis], zeta)
            zeta_z = zeta_z.write(i, self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask) - self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask))
            zeta_x = zeta_x.write(i, self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:, :, tf.newaxis], L.zeta), D.X))

            lambda_1, lambda_2 = self.update_lambda(zeta_mask)
            lambda_phi = lambda_phi.write(i, self.phi_lower_bound_term(lambda_1, lambda_2) - self.phi_lower_bound_term(L.lambda_1, L.lambda_2))
            lambda_z = lambda_z.write(i, self.z_lower_bound_term(lambda_1, lambda_2, zeta, D.mask) - self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask))

            nu = self.update_nu(L.a, L.b, zeta_mask, D.X)
            nu_mu = nu_mu.write(i, self.mu_lower_bound_term(nu, L.omega) - self.mu_lower_bound_term(L.nu, L.omega))
            nu_x = nu_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X))

            omega = self.update_omega(L.a, L.b, zeta_mask)
            omega_mu = omega_mu.write(i, self.mu_lower_bound_term(nu, omega) - self.mu_lower_bound_term(nu, L.omega))
            omega_x = omega_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X))

            a, b = self.update_ab(nu, omega, zeta_mask, D.X)
            ab_tau = ab_tau.write(i, self.tau_lower_bound_term(a, b) - self.tau_lower_bound_term(L.a, L.b))
            ab_x = ab_x.write(i, self.x_lower_bound_term(a, b, nu, omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X))

            updated_L = latents(a, b, lambda_1, lambda_2, nu, omega, zeta)
            total = total.write(i, self.evidence_lower_bound(updated_L, D) - self.evidence_lower_bound(L, D))

            return (i+1, updated_L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total)

        final_iteration = tf.while_loop(cond, body, init_iteration)

        return final_iteration[1], [final_iteration[i].stack() for i in range(3, 14)]

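    # Each stacked delta returned by elbo_infer has shape [n_iter, batch_size]. If the updates are
    # exact coordinate-ascent steps on the ELBO (as the derivations intend), the 'total' deltas
    # recorded above should be non-negative up to numerical error; a persistently negative entry
    # indicates a bug in an update or in one of the lower-bound terms.
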
def variational_inference(data, alpha=1.0, T=10, n_iter=10, tf_seed=None, get_elbo=False, tf_device='/cpu:0'):
    """
    Tensorflow setup to run variational inference

    data: matrix of datapoints, size should be (batch_size, max_number_of_datapoints, dimensionality_of_data)
        batches that have different numbers of datapoints can be run together by padding the smaller data matrices with zero vectors
    alpha: Dirichlet process concentration parameter
    T: truncation level (maximum number of clusters in the variational posterior)
    n_iter: number of iterations to run VI for
    tf_seed: random seed for tensorflow
    get_elbo: if True, measure & return the change in ELBO for each update
    tf_device: tensorflow device on which to build and run the graph
    """

    #size of dataset
    batch_size = np.shape(data)[0]
    N = np.shape(data)[1]
    D = np.shape(data)[2]

    with tf.Graph().as_default():
        with tf.device(tf_device):

            tf.set_random_seed(tf_seed)
            X = tf.placeholder(tf.float32, shape=[batch_size, N, D])

            mixture_model = dpgmm(alpha, D, n_iter, T)
            init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, mask = mixture_model.initialize_latents(X, batch_size, shared=False)

            if not get_elbo:
                inferred_latents = mixture_model.infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask)
            else:
                inferred_latents, ELBO_deltas = mixture_model.elbo_infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask, batch_size)

            ##Run graph
            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:

                sess.run(tf.global_variables_initializer())
                if not get_elbo:
                    inferred_latents_out = sess.run([inferred_latents], feed_dict={X: data})
                    return inferred_latents_out[0]
                else:
                    inferred_latents_out, ELBO_deltas_out = sess.run([inferred_latents, ELBO_deltas], feed_dict={X: data})
                    ELBO_terms = collections.namedtuple('ELBO_terms', ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total'])
                    return inferred_latents_out, ELBO_terms(*ELBO_deltas_out)
--------------------------------------------------------------------------------
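As a quick orientation to ```variational_inference``` above, here is a minimal usage sketch. It is not part of the repository: it assumes the file is importable as ```dpgmm_vi``` and fabricates a small two-cluster dataset, but the call signature and the returned ```latents``` fields (```a, b, lambda_1, lambda_2, nu, omega, zeta```) are as defined in the listing above.

```python
import numpy as np
from dpgmm_vi import variational_inference  # assumes dpgmm_vi.py is on the path

# Illustrative two-cluster dataset: batch of 1, 200 points in 2 dimensions
rng = np.random.RandomState(0)
data = np.concatenate([rng.randn(100, 2) + [4., 0.],
                       rng.randn(100, 2) + [-4., 0.]])[np.newaxis].astype(np.float32)

latents = variational_inference(data, alpha=1.0, T=10, n_iter=20)
assignments = np.argmax(latents.zeta, axis=2)  # most probable cluster per datapoint, shape (1, 200)
print(latents.nu[0])                           # variational means of the T candidate clusters, shape (10, 2)
```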