├── README.md ├── diagonal ├── README.md ├── Variational_Inference_in_DPGMM_Diagonal_.pdf ├── bound_check.py ├── demos.py └── dpgmm_vi.py └── spherical ├── README.md ├── Variational_Inference_in_DPGMM_Spherical.pdf ├── bound_check.py ├── demos.py └── dpgmm_vi.py /README.md: -------------------------------------------------------------------------------- 1 | # tf_dpgmm 2 | Variational inference in Dirichlet process Gaussian mixture model (tensorflow implementation), for spherical and diagonal covariance models 3 | 4 | There is a folder for each model type, and each contains: 5 | - a pdf containing the equations and derivations for the evidence lower bound and variational updates 6 | - ```dpgmm_vi.py```: a tensorflow implementation of variational inference in the model 7 | - ```bound_check.py```: a comparison of the analytical and Monte Carlo estimates of the ELBO. We used this to check that our derivations and code are correct: the two estimates match. 8 | - ```demos.py```: examples of how to use ```dpgmm_vi.py```, including plotting changes in ELBO with each update and clustering results 9 | 10 | This code has not been optimized for performance. Please let us know if you find any mistakes! 11 | -------------------------------------------------------------------------------- /diagonal/README.md: -------------------------------------------------------------------------------- 1 | # tf_dpgmm (diagonal covariance) 2 | If any datapoints are equal to the zero vector, they will be ignored. See the use of ```zeta_mask``` in ```dpgmm_vi.py```. This enables the use of differently sized datasets (because you can pad the smaller ones with zero vectors), but may not be desirable for you! -------------------------------------------------------------------------------- /diagonal/Variational_Inference_in_DPGMM_Diagonal_.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcusi/tf_dpgmm/32424fa6db31561e8b5a322c5893df5814fc43f7/diagonal/Variational_Inference_in_DPGMM_Diagonal_.pdf -------------------------------------------------------------------------------- /diagonal/bound_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm, gamma, rv_discrete, beta 3 | from scipy.special import digamma 4 | from scipy.special import gamma as gammaf 5 | 6 | lgamma = lambda x: np.log(gammaf(x)) 7 | 8 | """ 9 | DIAGONAL COVARIANCE 10 | 11 | Code to check analytical derivations of the ELBO in Variational_Inference_in_DPGMM_Diagonal_.pdf against Monte Carlo estimates 12 | 13 | python bound_check.py 14 | 15 | lbh@mit.edu, october 2018 16 | """ 17 | 18 | np.random.seed(0) 19 | 20 | D=3 21 | K=2 22 | 23 | nu = np.random.randn(K, D) 24 | omega = np.random.random([K, D]) + 1 25 | zeta = np.array([0.2, 0.8]) 26 | 27 | a = np.ones([K, D]) 28 | b = 1.5*np.ones([K, D]) 29 | 30 | lambda1 = np.ones(K) 31 | lambda2 = 2.*np.ones(K) 32 | 33 | alpha = 3. 34 | p_phi = beta(1, alpha) 35 | p_mu = norm 36 | p_tau = gamma(a=1., scale=1.) 37 | def log_p_z(phi, z): 38 | p = np.concatenate([[1], np.cumprod(1-phi[:-1])]) * phi 39 | return np.log(p[z]) 40 | def p_x(z, mu, tau): return norm(loc=mu[z], scale=np.sqrt(1./tau[z])) 41 | 42 | q_phi = beta(lambda1, lambda2) 43 | q_mu = norm(loc=nu, scale=1./np.sqrt(omega)) 44 | q_tau = gamma(a=a, scale=1./b) #tau is precision! 45 | q_z = rv_discrete(values=(range(K), zeta)) 46 | 47 | x = 3.
48 | 49 | N = 50001 50 | 51 | print('Diagonal Covariance Model') 52 | # ####### phi term in the ELBO ######## 53 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 54 | # Analytical 55 | bound = sum((lgamma(1. + alpha) - lgamma(alpha) 56 | + (alpha - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k)) 57 | - lgamma(l1_k + l2_k) + lgamma(l1_k) + lgamma(l2_k) 58 | - (l1_k - 1.)*(digamma(l1_k) - digamma(l1_k + l2_k)) 59 | - (l2_k - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k))) 60 | for (l1_k, l2_k) in zip(lambda1, lambda2)) 61 | print("Analytical phi term in ELBO:", bound) 62 | 63 | # Monte Carlo 64 | print('Monte Carlo estimate of phi term in ELBO:') 65 | np.random.seed() 66 | bounds = [] 67 | for i in range(N): 68 | phi = q_phi.rvs() 69 | 70 | bounds.append(sum(p_phi.logpdf(phi) - q_phi.logpdf(phi))) #Sum over K for MC estimate 71 | if i%5000 == 0: print(i, np.mean(bounds)) 72 | 73 | 74 | 75 | 76 | # ####### mu term in the ELBO ######## 77 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 78 | # Analytical 79 | # OLD: 80 | # bound = sum(-0.5*nu_k**2 for nu_k in nu) 81 | # NEW: 82 | bound = -0.5 * sum(sum(1./omega + nu**2 + np.log(omega) - 1)) 83 | print("Analytical mu term in ELBO:", bound) 84 | 85 | # Monte Carlo 86 | print('Monte Carlo estimate of mu term in ELBO:') 87 | np.random.seed() 88 | bounds = [] 89 | for i in range(N): 90 | mu = q_mu.rvs() 91 | bounds.append(sum(sum(p_mu.logpdf(mu) - q_mu.logpdf(mu)))) #Sum over K for MC estimate 92 | if i%5000 == 0: print(i, np.mean(bounds)) 93 | 94 | 95 | 96 | 97 | # ####### tau term in the ELBO ######## 98 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 99 | # Analytical 100 | #OLD: 101 | # bound = sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + 1 - np.divide(a_k,b_k) 102 | # for (a_k, b_k) in zip(a, b)) 103 | #NEW: 104 | bound = sum(sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + a_k - np.divide(a_k,b_k) 105 | for (a_k, b_k) in zip(a, b))) 106 | print("Analytical tau term in ELBO:", bound) 107 | 108 | # Monte Carlo 109 | print('Monte Carlo estimate of mu term in ELBO:') 110 | np.random.seed() 111 | bounds = [] 112 | for i in range(N): 113 | tau = q_tau.rvs() 114 | 115 | bounds.append(sum(sum(p_tau.logpdf(tau) - q_tau.logpdf(tau)))) #Sum over K for MC estimate 116 | if i%5000 == 0: print(i, np.mean(bounds)) 117 | 118 | 119 | 120 | 121 | # ####### z term in the ELBO ######## 122 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 123 | # Analytical 124 | bound = sum(zeta_k*( 125 | - np.log(zeta_k) 126 | + digamma(l1_k) - digamma(l1_k+l2_k) 127 | + sum(digamma(lambda2[j]) - digamma(lambda1[j]+lambda2[j]) for j in range(k))) 128 | for (l1_k, l2_k, zeta_k, k) in zip(lambda1, lambda2, zeta, range(K))) 129 | print("Analytical z term in ELBO:", bound) 130 | 131 | # Monte Carlo 132 | print('Monte Carlo estimate of z term in ELBO:') 133 | np.random.seed() 134 | bounds = [] 135 | for i in range(N): 136 | phi = q_phi.rvs() 137 | z = q_z.rvs() 138 | 139 | bounds.append(log_p_z(phi, z) - q_z.logpmf(z)) #There's only a single datapoint, so no need for sum 140 | if i%5000 == 0: print(i, np.mean(bounds)) 141 | 142 | 143 | 144 | 145 | # ####### x term in the ELBO ######## 146 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 147 | # Analytical 148 | bound = sum(sum(zeta_k * ( 149 | -1/2. * (np.log(2 * np.pi) - digamma(ak) + np.log(bk)) 150 | -(ak/(2.*bk)) * (x - nu_k)**2 151 | -(ak/(2.*bk)) * gammaf(3./2.) * (2*np.pi)**(-1/2.) * 2.**(3./2.) 
/ omega_k 152 | ) for (ak, bk, nu_k, zeta_k, omega_k) in zip(a, b, nu, zeta, omega))) 153 | print("Analytical x term in ELBO:", bound) 154 | 155 | # Monte Carlo 156 | print('Monte Carlo estimate of x term in ELBO:') 157 | np.random.seed() 158 | bounds = [] 159 | for i in range(N): 160 | mu = q_mu.rvs() 161 | tau = q_tau.rvs() 162 | z = q_z.rvs() 163 | 164 | bounds.append(sum(p_x(z, mu, tau).logpdf(x))) #sum over d (There's only a single datapoint, so no need for sum over i) 165 | if i%5000 == 0: print(i, np.mean(bounds)) -------------------------------------------------------------------------------- /diagonal/demos.py: -------------------------------------------------------------------------------- 1 | from dpgmm_vi import variational_inference 2 | import numpy as np 3 | 4 | import matplotlib 5 | #matplotlib.use('Agg') 6 | import matplotlib.pyplot as plt 7 | from matplotlib.patches import Ellipse 8 | import colorsys 9 | 10 | """ 11 | DIAGONAL COVARIANCE 12 | 13 | Plots to demonstrate use of dpgmm_vi.py 14 | 15 | python demos.py 16 | 17 | mcusi@mit.edu, july 2018 18 | """ 19 | 20 | 21 | def gen_demo_data(batch_size=1, np_seed=None, D=2, use_zeros=True): 22 | #generates data from multivariate gaussian with diagonal covariance 23 | Nmax = 6*25 24 | np.random.seed(np_seed) 25 | for b in range(batch_size): 26 | K = np.random.randint(2, high=6+1) 27 | in_dataset = 0 28 | for k in range(K): 29 | mean = 4*np.random.randn(D) - 1 30 | cov = np.diag(np.random.rand(2) + 0.1) 31 | n = np.random.randint(10, high=25+1) 32 | in_dataset += n 33 | gaussian_data = np.random.multivariate_normal(mean, cov, n) 34 | _data = gaussian_data if k == 0 else np.vstack((_data, gaussian_data)) 35 | n_zeros = Nmax - in_dataset ##use zeros to pad smaller datasets 36 | _data = np.vstack((np.zeros((n_zeros,D)),_data)) 37 | np.random.shuffle(_data) 38 | _data = np.float32(_data[np.newaxis,:,:]) 39 | data = _data if b == 0 else np.concatenate((data,_data)) 40 | 41 | return data 42 | 43 | def ELBO_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 44 | 45 | #Generate toy data 46 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 47 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 48 | 49 | #Run inference 50 | inferred_latents, ELBO_deltas = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=True) 51 | 52 | #Plot change in ELBO with updates 53 | ##If you use [1:] after each of the plot arguments, 54 | ##You can see that the change is still positive after the first iteration 55 | plt.plot(ELBO_deltas.total); plt.title('Change in ELBO with each set of updates'); plt.show(); 56 | 57 | plt.plot(ELBO_deltas.zeta_z + ELBO_deltas.zeta_x); plt.plot(ELBO_deltas.zeta_z); plt.plot(ELBO_deltas.zeta_x); 58 | plt.legend(['z+x', 'z', 'x'], loc='upper right'); plt.title('Change in z&x ELBO terms due to zeta update'); plt.show() 59 | 60 | plt.plot(ELBO_deltas.lambda_z + ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_phi); 61 | plt.legend(['z+phi', 'z', 'phi'], loc='upper right'); plt.title('Change in z&phi ELBO terms due to lambda update'); plt.show() 62 | 63 | plt.plot(ELBO_deltas.nu_mu + ELBO_deltas.nu_x); plt.plot(ELBO_deltas.nu_mu); plt.plot(ELBO_deltas.nu_x); 64 | plt.legend(['mu+x', 'mu', 'x'], loc='upper right'); plt.title('Change in mu&x ELBO terms due to nu update'); plt.show() 65 | 66 | plt.plot(ELBO_deltas.omega_mu + ELBO_deltas.omega_x); plt.plot(ELBO_deltas.omega_mu); plt.plot(ELBO_deltas.omega_x); 67 | plt.legend(['mu+x', 
'mu', 'x'], loc='upper right'); plt.title('Change in mu&x ELBO terms due to omega update'); plt.show() 68 | 69 | plt.plot(ELBO_deltas.ab_tau + ELBO_deltas.ab_x); plt.plot(ELBO_deltas.ab_tau); plt.plot(ELBO_deltas.ab_x); 70 | plt.legend(['tau+x', 'tau', 'x'], loc='upper right'); plt.title('Change in tau&x ELBO terms due to ab update'); plt.show() 71 | 72 | ##Plot each datapoint with a colour corresponding to the variational cluster to which it is assigned with maximum probability 73 | batch_number = 0 74 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 75 | inferred_zeta = inferred_latents.zeta[batch_number,nonzero_datapoints,:] 76 | assignments = np.argmax(inferred_zeta, axis=1) 77 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 78 | plt.gca().set_xlim([-10,10]) 79 | plt.gca().set_ylim([-10,10]) 80 | plt.gca().set_aspect('equal', adjustable='box') 81 | plt.title('MAP assignments of datapoints to clusters') 82 | plt.show() 83 | 84 | def clusters_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 85 | 86 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 87 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 88 | 89 | for n_iter in [0, 1, 2, 5, 10, max_n_iter]: 90 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=n_iter, tf_seed=tf_seed, get_elbo=False) 91 | 92 | #Plot means and datapoints as points 93 | batch_number = 0 94 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 95 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1], marker='x') 96 | plt.scatter(inferred_latents.nu[batch_number,:,0] + 0.01*np.random.randn(T), inferred_latents.nu[batch_number,:,1],marker='o',s=30,color='r') 97 | 98 | #Plot expected standard deviation as diameter of ellipse 99 | patches = []; 100 | diameter = 2*np.sqrt(1./np.divide(inferred_latents.a, inferred_latents.b)) 101 | 102 | #Plot marginal cluster probabilities as the transparency of circle 103 | l1 = inferred_latents.lambda_1[batch_number,:] 104 | l2 = inferred_latents.lambda_2[batch_number,:] 105 | beta_means = np.divide(l1,l1 + l2) 106 | log_beta_means = np.log(beta_means + 1e-30) 107 | cs = np.concatenate(( [0], np.cumsum( np.log(1-beta_means+1e-30)[:-1]) )) #SBP 108 | beta_expectation = np.exp(log_beta_means + cs) 109 | beta_expectation /= (1.*np.sum(beta_expectation)) 110 | for k in range(T): 111 | circle = Ellipse((inferred_latents.nu[batch_number,k,0], inferred_latents.nu[batch_number,k,1]), diameter[batch_number,k,0], diameter[batch_number,k,1]) 112 | plt.gca().add_artist(circle) 113 | circle.set_alpha(beta_expectation[k]) 114 | plt.gca().set_xlim([-10,10]) 115 | plt.gca().set_ylim([-10,10]) 116 | plt.gca().set_aspect('equal', adjustable='box') 117 | plt.title('Variational distributions at iteration ' + str(n_iter)) 118 | plt.show() 119 | 120 | def batch_demo(batch_size=2, np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 121 | 122 | data = gen_demo_data(batch_size = batch_size, np_seed = np_seed) 123 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 124 | 125 | #Run inference 126 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=False) 127 | 128 | for batch_number in range(batch_size): 129 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] 
== batch_number)] 130 | inferred_zeta = inferred_latents.zeta[batch_number, nonzero_datapoints, :] 131 | 132 | #plot weighted points 133 | #https://stackoverflow.com/questions/41314736/scatterplot-wherein-each-point-color-is-a-different-mixture-of-k-colors 134 | HSV = [(x*1.0/T, 0.8, 0.5) for x in np.random.permutation(T)] 135 | RGB = np.array([colorsys.hsv_to_rgb(*hsv) for hsv in HSV]) #list comprehension (rather than map) so this also works under Python 3 136 | assignments = np.sum(np.multiply(RGB[np.newaxis, :, :], inferred_zeta[:, :, np.newaxis]),axis=1) 137 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 138 | plt.title('Weighted assignments, batch ' + str(batch_number)) 139 | plt.gca().set_xlim([-10,10]) 140 | plt.gca().set_ylim([-10,10]) 141 | plt.gca().set_aspect('equal', adjustable='box') 142 | plt.show() 143 | 144 | if __name__ == "__main__": 145 | print('Diagonal Covariance Model') 146 | np_seed = 23; tf_seed = 100; alpha=1.0; T=100; max_n_iter=20; 147 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 148 | print('Change in ELBO with each iteration of updates:') 149 | ELBO_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 150 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 151 | print('Change in latent parameters with increasing number of updates:') 152 | clusters_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 153 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 154 | print('This script can use batches of different datasets:') 155 | batch_demo(batch_size = 2, np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 156 | 157 | -------------------------------------------------------------------------------- /diagonal/dpgmm_vi.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from scipy.special import gamma as gamma_func 3 | import numpy as np 4 | import collections 5 | 6 | """ 7 | DIAGONAL COVARIANCE 8 | 9 | Unsupervised clustering in R^D 10 | 11 | TF implementation of variational inference in a 12 | Dirichlet process Gaussian mixture model with diagonal covariance 13 | Derivation in Variational_Inference_in_DPGMM_Diagonal_.pdf included in git repo 14 | 15 | Clusters matrix X (batch_size x N x D) of N datapoints with dimensionality D 16 | If a datapoint = zero vector, it is ignored. 17 | This option allows you to use batched datasets of different sizes 18 | 19 | For examples of how to use this code, see demos.py 20 | 21 | mcusi@mit.edu, july 2018 22 | 23 | """ 24 | 25 | class dpgmm(): 26 | 27 | ######### INITIALIZATION ########################################################################################################## 28 | 29 | def __init__(self, alpha, D, n_iter, T, covariance_type='diagonal'): 30 | 31 | self.alpha = alpha; #Dirichlet concentration parameter 32 | self.D = D; self.Dfl = tf.cast(self.D, dtype=tf.float32); #dimensionality of data 33 | self.T = T; #truncation value 34 | 35 | self.gaussian_const = np.divide(gamma_func(1.5)*(2.0**1.5), np.sqrt(2.*np.pi)) 36 | 37 | #Initialization settings 38 | self.mu_std = 5.
39 | 40 | #inference settings 41 | self.n_iter = n_iter; 42 | self.log_constant = 1e-30 43 | 44 | def initialize_latents(self, X, batch_size, shared=True, use_mask=True): 45 | """ 46 | > randomly initializes variational distribution parameters 47 | > if shared == True, batches share the same initialization 48 | """ 49 | 50 | N = tf.shape(X)[1] 51 | shape_T = [self.T] if shared else [batch_size, self.T] 52 | shape_TD = [self.T, self.D] if shared else [batch_size, self.T, self.D] 53 | 54 | a = tf.get_variable("a", shape_TD, dtype=tf.float32, 55 | initializer=tf.ones_initializer()) 56 | b = tf.get_variable("b", shape_TD, dtype=tf.float32, 57 | initializer=tf.ones_initializer()) 58 | lambda_1 = tf.get_variable("lambda_1", shape_T, dtype=tf.float32, 59 | initializer=tf.ones_initializer()) 60 | lambda_2 = tf.get_variable("lambda_2", shape_T, dtype=tf.float32, 61 | initializer=tf.ones_initializer()) 62 | nu = tf.get_variable("nu", shape_TD, dtype=tf.float32, 63 | initializer=tf.random_normal_initializer(stddev=self.mu_std)) 64 | omega = tf.get_variable("omega", shape_TD, dtype=tf.float32, 65 | initializer=tf.ones_initializer()) 66 | 67 | if shared: 68 | 69 | a = tf.tile(a[tf.newaxis, :, :], [batch_size, 1, 1]) 70 | b = tf.tile(b[tf.newaxis, :, :], [batch_size, 1, 1]) 71 | lambda_1 = tf.tile(lambda_1[tf.newaxis, :], [batch_size, 1]) 72 | lambda_2 = tf.tile(lambda_2[tf.newaxis, :], [batch_size, 1]) 73 | nu = tf.tile(nu[tf.newaxis, :, :], [batch_size, 1, 1]) 74 | omega = tf.tile(omega[tf.newaxis, :, :], [batch_size, 1, 1]) 75 | 76 | # zeta will be the first in the update distribution 77 | # so this initialization is only necessary for ELBO calculation 78 | alpha_vec = tf.fill([batch_size, self.T], self.alpha) 79 | zeta_dist = tf.distributions.Dirichlet(alpha_vec) 80 | #zeta: batch_size N T 81 | zeta = tf.transpose(zeta_dist.sample([N]),perm=[1,0,2]) 82 | #mask: batch_size N 83 | if use_mask: 84 | mask = tf.cast(tf.logical_not(tf.reduce_all(tf.equal(X,0),axis=2)), dtype=tf.float32) 85 | else: 86 | mask = tf.ones([batch_size, N]) 87 | 88 | return a, b, lambda_1, lambda_2, nu, omega, zeta, mask 89 | 90 | ######### UPDATE EQUATIONS ########################################################################################################## 91 | 92 | def update_lambda(self, zeta_mask): 93 | ##lambda_1: only sum over datapoints 94 | #nu_z batch N T 95 | #embedding_weights batch N 96 | lambda_1 = 1.0 + tf.reduce_sum(zeta_mask, axis=1) #over N 97 | ##lambda_2: requires sum over classes as well as datapoints 98 | #nu_z: batch N T 99 | l = tf.cumsum(zeta_mask, axis=2, reverse=True, exclusive=True) #over T 100 | lambda_2 = self.alpha + tf.reduce_sum(l, axis=1) #over N 101 | return lambda_1, lambda_2 102 | 103 | def update_nu(self, a, b, zeta_mask, X): 104 | # nu_z batch N T newaxis 105 | # a batch newaxis T D 106 | # b batch newaxis T D 107 | w = tf.divide(tf.multiply(zeta_mask[:, :, :, tf.newaxis], 108 | a[:, tf.newaxis, :, :]), b[:, tf.newaxis,:, :]) 109 | #w : batch N T D 110 | #X : batch N newaxis D 111 | numer = tf.reduce_sum(tf.multiply(w, X[:, :, tf.newaxis, :]), axis=1) #over N 112 | denom = 1.0 + tf.reduce_sum(w, axis=1) #over N 113 | # numer batch T D 114 | # denom batch T D 115 | nu = tf.divide(numer, denom) 116 | return nu 117 | 118 | def update_omega(self, a, b, zeta_mask): 119 | 120 | # a batch newaxis T D 121 | # b batch newaxis T D 122 | ratio = tf.multiply(tf.divide(a, b), self.gaussian_const)[:, tf.newaxis, :, :] 123 | # nu_z batch N T newaxis 124 | #WANT: omega batch T D 125 | omega = 1.0 + 
tf.reduce_sum( tf.multiply(zeta_mask[:, :, :, tf.newaxis], ratio) , axis=1) #over N 126 | 127 | return omega 128 | 129 | def update_ab(self, nu, omega, zeta_mask, X): 130 | #nu_z_masked batch N T 131 | #a batch T 132 | a = 1.0 + tf.multiply(0.5, tf.reduce_sum(zeta_mask, axis=1))#over N 133 | #a batch T D 134 | a = tf.tile(a[:, :, tf.newaxis], [1, 1, self.D]) 135 | 136 | #X batch N newaxis D 137 | #nu batch newaxis T D 138 | #squared_difference batch N T D 139 | squared_difference = tf.square(X[:,:,tf.newaxis,:] - nu[:,tf.newaxis,:,:]) 140 | #omega batch newaxis T D 141 | s = squared_difference + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :, :]) 142 | #zeta_mask batch N T newaxis 143 | #b batch T D 144 | b = 1.0 + 0.5*tf.reduce_sum(tf.multiply(zeta_mask[:, :, :, tf.newaxis], s), axis=1) #over N 145 | 146 | return a, b 147 | 148 | def eta_x(self, a, b, nu, omega, X): 149 | """ 150 | eta_x_{i, k, d} = E_q[log P(x_{i,d} | z_i = k, mu_{k,d}, var_{k,d})] 151 | eta_x: 152 | 153 | """ 154 | 155 | #a batch_size, T, D 156 | #b batch_size, T, D 157 | ab1 = tf.multiply(-0.5, tf.log(2*np.pi) - tf.digamma(a) + tf.log(b)) 158 | ab2 = tf.divide(a, -2.0*b) 159 | 160 | # X: batch N newaxis D 161 | # mu: batch newaxis T D 162 | squared_difference = tf.square(tf.subtract(X[:, :, tf.newaxis, :], nu[:, tf.newaxis, :, :])) 163 | #omega: batch newaxis T D 164 | s = squared_difference + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :, :]) 165 | #s: batch N T D 166 | #ab1 batch_size, newaxis, T, D 167 | #ab2 batch_size, newaxis, T, D 168 | Eq = ab1[:, tf.newaxis, :, :] + tf.multiply(ab2[:,tf.newaxis, :, :], s) 169 | 170 | return Eq 171 | 172 | def eta_z(self, lambda_1, lambda_2): 173 | #lambda_1, lambda_2: batch_size, T 174 | d1 = tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2) 175 | d2 = tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2) 176 | d_cumsum = tf.cumsum(d2, axis=1, exclusive=True) 177 | return d1 + d_cumsum 178 | 179 | def update_zeta(self, a, b, lambda_1, lambda_2, nu, omega, X): 180 | 181 | #self.eta_x: batch N T D --> sum over D 182 | #self.eta_z: batch newaxis T 183 | 184 | prop_log_zeta = self.eta_z(lambda_1, lambda_2)[:, tf.newaxis, :] - 1. + tf.reduce_sum(self.eta_x(a, b, nu, omega, X),axis=3)#over D 185 | #prop_log_nu_z batch N T 186 | log_zeta = tf.subtract(prop_log_zeta, tf.reduce_logsumexp(prop_log_zeta, axis=2, keepdims=True)) #over T 187 | zeta = tf.exp(log_zeta) 188 | 189 | return zeta 190 | 191 | def update_all(self, L, dataset): 192 | 193 | zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, dataset.X) 194 | zeta_mask = tf.multiply(dataset.mask[:,:,tf.newaxis],zeta) 195 | lambda_1, lambda_2 = self.update_lambda(zeta_mask) 196 | nu = self.update_nu(L.a, L.b, zeta_mask, dataset.X) 197 | omega = self.update_omega(L.a, L.b, zeta_mask) 198 | a, b = self.update_ab(nu, omega, zeta_mask, dataset.X) #might have to mess with the order of these 199 | 200 | return a, b, lambda_1, lambda_2, nu, omega, zeta 201 | 202 | ######### VARIATIONAL LOWER BOUND ########################################################################################################## 203 | 204 | def phi_lower_bound_term(self, lambda_1, lambda_2): 205 | """ 206 | lambda_1: [batch_size, T] 207 | lambda_2: [batch_size, T] 208 | """ 209 | term1 = tf.lgamma(1. 
+ self.alpha) - tf.lgamma(self.alpha) 210 | term2 = (self.alpha - 1.)*(tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 211 | term3 = -1*tf.lgamma(lambda_1 + lambda_2) + tf.lgamma(lambda_1) + tf.lgamma(lambda_2) 212 | term4 = tf.multiply(lambda_1 - 1., tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2)) 213 | term5 = tf.multiply(lambda_2 - 1., tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 214 | #sum over clusters 215 | vb = tf.reduce_sum(term1 + term2 + term3 - term4 - term5, axis=1) 216 | return vb 217 | 218 | def mu_lower_bound_term(self, nu, omega): 219 | #nu: [batch_size, T, D] 220 | #omega [batch_size, T, D] 221 | summand = tf.divide(1.0, omega) + tf.square(nu) + tf.log(omega) - 1.0 222 | tot = -0.5 * tf.reduce_sum(summand, axis=2) 223 | vb = tf.reduce_sum(tot, axis=1) #over number of clusters 224 | return vb 225 | 226 | def tau_lower_bound_term(self, a, b): 227 | #a, b: [batch_size, T, D] 228 | tot = tf.lgamma(a) - tf.multiply(a - 1.,tf.digamma(a)) - tf.log(b) + a - tf.divide(a, b) 229 | vb = tf.reduce_sum(tot, axis=[1,2]) #sum over clusters and dimensions 230 | return vb 231 | 232 | def z_lower_bound_term(self, lambda_1, lambda_2, zeta, mask): 233 | #lambda_1: [batch_size, T] 234 | #lambda_2: [batch_size, T] 235 | #zeta: [batch_size, N, T] 236 | c = -tf.log(zeta + self.log_constant) + self.eta_z(lambda_1, lambda_2)[:,tf.newaxis,:] 237 | 238 | # batch_Size N T --> batch N 239 | e = tf.reduce_sum(tf.multiply(zeta, c),axis=2) #over clusters 240 | e_mask = tf.multiply(e, mask) 241 | vb = tf.reduce_sum(e_mask, axis=1) #over I 242 | 243 | return vb 244 | 245 | def x_lower_bound_term(self, a, b, nu, omega, zeta_mask, X): 246 | #X: batch_size, N, D 247 | #self.eta_x: batch N T D 248 | EqLogPxGivenZ = self.eta_x(a, b, nu, omega, X) 249 | #zeta_mask: batch_size, N, T, newaxis 250 | tot = tf.multiply(zeta_mask[:, :, :, tf.newaxis], EqLogPxGivenZ) 251 | #tot batch_size N T D 252 | vb = tf.reduce_sum(tot, axis=[1,2,3]) 253 | return vb 254 | 255 | def evidence_lower_bound(self, L, D): 256 | phi_lb = self.phi_lower_bound_term(L.lambda_1, L.lambda_2) 257 | mu_lb = self.mu_lower_bound_term(L.nu, L.omega) 258 | tau_lb = self.tau_lower_bound_term(L.a, L.b) 259 | z_lb = self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask) 260 | x_lb = self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:,:,tf.newaxis],L.zeta), D.X) 261 | return phi_lb + mu_lb + tau_lb + z_lb + x_lb 262 | 263 | ######### INFERENCE FUNCTIONS ########################################################################################################## 264 | 265 | def infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask): 266 | """ 267 | Performs variational inference in DPGMM for n_iter number of iterations, 268 | then returns inferred latent variables 269 | 270 | _a, _b, _lambda_1, _lambda_2, _nu, _zeta: initial parameters for inference 271 | X: data matrix (batch_size x nDatapoints x dimensions) 272 | mask: 1 if consider as datapoint, 0 if ignore (batch_size x nDatapoints) 273 | """ 274 | 275 | ##Initial input into "while" loop, i.e., inference iterations 276 | i = tf.constant(0) 277 | latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta']) 278 | dataset = collections.namedtuple('dataset', ['X', 'mask']) 279 | init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) 280 | 281 | cond = lambda i, L, D: i < self.n_iter 282 | def body(i, L, D): 283 | a, b, lambda_1, lambda_2, nu, omega, zeta = 
self.update_all(L, D) 284 | return (i + 1, latents(a, b, lambda_1, lambda_2, nu, omega, zeta), D) 285 | 286 | final_iteration = tf.while_loop(cond, body, init_iteration) 287 | 288 | return final_iteration[1] 289 | 290 | def elbo_infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask, batch_size): 291 | """ 292 | Performs variational inference in DPGMM for n_iter number of iterations, 293 | and also calculates the change in ELBO at each update 294 | returns inferred latent variables and changes in ELBO 295 | """ 296 | 297 | i = tf.constant(0) 298 | latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta']) 299 | dataset = collections.namedtuple('dataset', ['X', 'mask']) 300 | #ELBO term names: "updated-variable_term-of-lower-bound" 301 | ELBO_terms = ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total'] 302 | empty_ELBO_terms = tuple([tf.TensorArray(dtype=tf.float32, size=self.n_iter, element_shape=batch_size, name=ELBO_terms[j]) for j in range(11)]) 303 | init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) + empty_ELBO_terms 304 | 305 | cond = lambda i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total: i < self.n_iter 306 | def body(i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total): 307 | 308 | zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, D.X) 309 | zeta_mask = tf.multiply(D.mask[:,:,tf.newaxis], zeta) 310 | zeta_z = zeta_z.write(i, self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask)-self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask)) 311 | zeta_x = zeta_x.write(i, self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:,:,tf.newaxis],L.zeta), D.X)) 312 | 313 | lambda_1, lambda_2 = self.update_lambda(zeta_mask) 314 | lambda_phi = lambda_phi.write(i, self.phi_lower_bound_term(lambda_1, lambda_2) - self.phi_lower_bound_term(L.lambda_1, L.lambda_2)) 315 | lambda_z = lambda_z.write(i,self.z_lower_bound_term(lambda_1, lambda_2, zeta, D.mask)-self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask)) 316 | 317 | nu = self.update_nu(L.a, L.b, zeta_mask, D.X) 318 | nu_mu = nu_mu.write(i, self.mu_lower_bound_term(nu, L.omega)-self.mu_lower_bound_term(L.nu, L.omega)) 319 | nu_x = nu_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X)) 320 | 321 | omega = self.update_omega(L.a, L.b, zeta_mask) 322 | omega_mu = omega_mu.write(i, self.mu_lower_bound_term(nu, omega)-self.mu_lower_bound_term(nu, L.omega)) 323 | omega_x = omega_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X)) 324 | 325 | a, b = self.update_ab(nu, omega, zeta_mask, D.X) 326 | ab_tau = ab_tau.write(i, self.tau_lower_bound_term(a, b) - self.tau_lower_bound_term(L.a, L.b)) 327 | ab_x = ab_x.write(i, self.x_lower_bound_term(a, b, nu, omega, zeta_mask, D.X)-self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X)) 328 | 329 | updated_L = latents(a, b, lambda_1, lambda_2, nu, omega, zeta) 330 | total = total.write(i, self.evidence_lower_bound(updated_L, D) - self.evidence_lower_bound(L, D)) 331 | 332 | return (i+1, updated_L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, 
omega_mu, omega_x, ab_tau, ab_x, total) 333 | 334 | final_iteration = tf.while_loop(cond, body, init_iteration) 335 | 336 | return final_iteration[1], [final_iteration[i].stack() for i in range(3,14)] 337 | 338 | def variational_inference(data, alpha=1.0, T=10, n_iter=10, tf_seed=None, get_elbo=False, tf_device='/cpu:0'): 339 | """ 340 | Tensorflow setup to run variational inference 341 | 342 | data: matrix of datapoints, size should be (batch_size, max_number_of_datapoints, dimensionality_of_data) 343 | batches that have different number of datapoints can be run together by padding the smaller data matrices with zero vectors 344 | alpha: Dirichlet concentration parameter 345 | T: truncation parameter (maximum number of mixture components) 346 | n_iter: number of iterations to run VI for 347 | get_elbo: if True, measure & return the change in ELBO for each update 348 | 349 | """ 350 | 351 | #size of dataset 352 | batch_size = np.shape(data)[0] 353 | N = np.shape(data)[1] 354 | D = np.shape(data)[2] 355 | 356 | with tf.Graph().as_default(): 357 | with tf.device(tf_device): 358 | 359 | tf.set_random_seed(tf_seed) 360 | X = tf.placeholder(tf.float32, shape=[batch_size, N, D]) 361 | 362 | mixture_model = dpgmm(alpha, D, n_iter, T) 363 | init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, mask = mixture_model.initialize_latents(X, batch_size, shared=False) 364 | 365 | if not get_elbo: 366 | inferred_latents = mixture_model.infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask) 367 | else: 368 | inferred_latents, ELBO_deltas = mixture_model.elbo_infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask, batch_size) 369 | 370 | ##Run graph 371 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess: 372 | 373 | sess.run(tf.global_variables_initializer()) 374 | if not get_elbo: 375 | inferred_latents_out = sess.run([inferred_latents], feed_dict = {X: data}) 376 | return inferred_latents_out[0] 377 | else: 378 | inferred_latents_out, ELBO_deltas_out = sess.run([inferred_latents, ELBO_deltas], feed_dict = {X: data}) 379 | ELBO_terms = collections.namedtuple('ELBO_terms', ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total']) 380 | return inferred_latents_out, ELBO_terms(*ELBO_deltas_out) 381 | -------------------------------------------------------------------------------- /spherical/README.md: -------------------------------------------------------------------------------- 1 | # tf_dpgmm (spherical covariance) 2 | If any datapoints are equal to the zero vector, they will be ignored. See the use of ```zeta_mask``` in ```dpgmm_vi.py```. This enables the use of differently sized datasets (because you can pad the smaller ones with zero vectors), but may not be what you want!
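A minimal sketch of that padding convention (illustrative only; names like ```x_small``` and ```pad``` are not part of the repo, and ```demos.py``` contains complete examples):

```python
import numpy as np
from dpgmm_vi import variational_inference

# two datasets of different sizes, both in R^2
x_small = np.float32(np.random.randn(25, 2))
x_large = np.float32(np.random.randn(40, 2) + 3.)

# pad the smaller dataset with zero vectors so both have the same N,
# then stack into a single (batch_size, N, D) array
N = max(len(x_small), len(x_large))
pad = lambda x: np.vstack([x, np.zeros((N - len(x), x.shape[1]), dtype=np.float32)])
data = np.stack([pad(x_small), pad(x_large)])

# zero rows are masked out internally (see zeta_mask in dpgmm_vi.py)
latents = variational_inference(data, alpha=1.0, T=20, n_iter=20)

# latents.zeta has shape (batch_size, N, T); take the MAP cluster per real datapoint
assignments = np.argmax(latents.zeta[0, :len(x_small), :], axis=1)
```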
-------------------------------------------------------------------------------- /spherical/Variational_Inference_in_DPGMM_Spherical.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mcusi/tf_dpgmm/32424fa6db31561e8b5a322c5893df5814fc43f7/spherical/Variational_Inference_in_DPGMM_Spherical.pdf -------------------------------------------------------------------------------- /spherical/bound_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm, gamma, rv_discrete, beta 3 | from scipy.special import digamma 4 | from scipy.special import gamma as gammaf 5 | 6 | lgamma = lambda x: np.log(gammaf(x)) 7 | 8 | """ 9 | SPHERICAL COVARIANCE 10 | 11 | Code to check analytical derivations of the ELBO in Variational_Inference_in_DPGMM_Spherical.pdf against Monte Carlo estimates 12 | 13 | python bound_check.py 14 | 15 | lbh@mit.edu, october 2018 16 | """ 17 | 18 | np.random.seed(0) 19 | 20 | D=3 21 | K=2 22 | 23 | nu = np.random.randn(K, D) 24 | omega = np.random.random([K]) + 1 25 | zeta = np.array([0.2, 0.8]) 26 | 27 | a = np.ones([K],dtype=np.float32) 28 | b = 1.5*np.ones([K],dtype=np.float32) 29 | 30 | lambda1 = np.ones(K,dtype=np.float32) 31 | lambda2 = 2.*np.ones(K,dtype=np.float32) 32 | 33 | alpha = 3. 34 | p_phi = beta(1., alpha) 35 | p_mu = norm 36 | p_tau = gamma(a=1., scale=1.) 37 | def log_p_z(phi, z): 38 | p = np.concatenate([[1], np.cumprod(1-phi[:-1])]) * phi 39 | return np.log(p[z]) 40 | def p_x(z, mu, tau): return norm(loc=mu[z], scale=np.sqrt(1./tau[z])) 41 | 42 | q_phi = beta(lambda1, lambda2) 43 | q_mu = norm(loc=nu, scale=1./np.sqrt(omega[:, None])) 44 | q_tau = gamma(a=a, scale=1./b) #tau is precision! 45 | q_z = rv_discrete(values=(range(K), zeta)) 46 | 47 | x = np.array([1., 2., 3.]) 48 | 49 | N = 200001 50 | 51 | print('Spherical Covariance Model') 52 | 53 | # ####### mu term in the ELBO ######## 54 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 55 | # Analytical 56 | # OLD: 57 | # bound = sum(-0.5*nu_k**2 for nu_k in nu) 58 | # NEW: 59 | bound = sum( #over k 60 | -D/2. * (1./omega + np.log(omega) - 1) 61 | - 0.5 * (nu**2).sum(axis=1) 62 | ) 63 | print("Analytical mu term in ELBO:", bound) 64 | 65 | # Monte Carlo 66 | print('Monte Carlo estimate of mu term in ELBO:') 67 | np.random.seed() 68 | bounds = [] 69 | for i in range(N): 70 | mu = q_mu.rvs() 71 | bounds.append(sum(sum(p_mu.logpdf(mu) - q_mu.logpdf(mu)))) #Sum over K for MC estimate 72 | if i%5000 == 0: print(i, np.mean(bounds)) 73 | 74 | 75 | 76 | # ####### phi term in the ELBO ######## 77 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 78 | # Analytical 79 | bound = sum((lgamma(1.
+ alpha) - lgamma(alpha) 80 | + (alpha - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k)) 81 | - lgamma(l1_k + l2_k) + lgamma(l1_k) + lgamma(l2_k) 82 | - (l1_k - 1.)*(digamma(l1_k) - digamma(l1_k + l2_k)) 83 | - (l2_k - 1.)*(digamma(l2_k) - digamma(l1_k + l2_k))) 84 | for (l1_k, l2_k) in zip(lambda1, lambda2)) 85 | print("Analytical phi term in ELBO:", bound) 86 | 87 | # Monte Carlo 88 | print('Monte Carlo estimate of phi term in ELBO:') 89 | np.random.seed() 90 | bounds = [] 91 | for i in range(N): 92 | phi = q_phi.rvs() 93 | 94 | bounds.append(sum(p_phi.logpdf(phi) - q_phi.logpdf(phi))) #Sum over K for MC estimate 95 | if i%5000 == 0: print(i, np.mean(bounds)) 96 | 97 | 98 | 99 | # ####### tau term in the ELBO ######## 100 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 101 | # Analytical 102 | #OLD: 103 | # bound = sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + 1 - np.divide(a_k,b_k) 104 | # for (a_k, b_k) in zip(a, b)) 105 | #NEW: 106 | bound = sum(lgamma(a_k) - (a_k-1.)*digamma(a_k) - np.log(b_k) + a_k - np.divide(a_k,b_k) 107 | for (a_k, b_k) in zip(a, b)) 108 | print("Analytical tau term in ELBO:", bound) 109 | 110 | # Monte Carlo 111 | print('Monte Carlo estimate of mu term in ELBO:') 112 | np.random.seed() 113 | bounds = [] 114 | for i in range(N): 115 | tau = q_tau.rvs() 116 | 117 | bounds.append(sum(p_tau.logpdf(tau) - q_tau.logpdf(tau))) #Sum over K for MC estimate 118 | if i%5000 == 0: print(i, np.mean(bounds)) 119 | 120 | 121 | 122 | 123 | # ####### z term in the ELBO ######## 124 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 125 | # Analytical 126 | bound = sum(zeta_k*( 127 | - np.log(zeta_k) 128 | + digamma(l1_k) - digamma(l1_k+l2_k) 129 | + sum(digamma(lambda2[j]) - digamma(lambda1[j]+lambda2[j]) for j in range(k))) 130 | for (l1_k, l2_k, zeta_k, k) in zip(lambda1, lambda2, zeta, range(K))) 131 | print("Analytical z term in ELBO:", bound) 132 | 133 | # Monte Carlo 134 | print('Monte Carlo estimate of z term in ELBO:') 135 | np.random.seed() 136 | bounds = [] 137 | for i in range(N): 138 | phi = q_phi.rvs() 139 | z = q_z.rvs() 140 | 141 | bounds.append(log_p_z(phi, z) - q_z.logpmf(z)) #There's only a single datapoint, so no need for sum 142 | if i%5000 == 0: print(i, np.mean(bounds)) 143 | 144 | 145 | 146 | 147 | # ####### x term in the ELBO ######## 148 | print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~") 149 | # Analytical 150 | bound = sum( #over k 151 | zeta_k * ( 152 | -D/2. * (np.log(2 * np.pi) - digamma(ak) + np.log(bk)) 153 | -(ak/(2.*bk)) * ((x[None, :] - nu_k)**2).sum(axis=1) 154 | -(ak/(2.*bk)) * D * gammaf(3./2.) * (2*np.pi)**(-1/2.) * 2.**(3./2.) 
/ omega_k 155 | ) 156 | for (ak, bk, nu_k, zeta_k, omega_k) in zip(a, b, nu, zeta, omega)) 157 | print("Analytical x term in ELBO:", bound) 158 | 159 | # Monte Carlo 160 | print('Monte Carlo estimate of x term in ELBO:') 161 | np.random.seed() 162 | bounds = [] 163 | for i in range(N): 164 | mu = q_mu.rvs() 165 | tau = q_tau.rvs() 166 | z = q_z.rvs() 167 | 168 | bounds.append(sum(p_x(z, mu, tau).logpdf(x))) #sum over d (There's only a single datapoint, so no need for sum over i) 169 | if i%5000 == 0: print(i, np.mean(bounds)) -------------------------------------------------------------------------------- /spherical/demos.py: -------------------------------------------------------------------------------- 1 | from dpgmm_vi import variational_inference 2 | import numpy as np 3 | 4 | import matplotlib 5 | #matplotlib.use('Agg') 6 | import matplotlib.pyplot as plt 7 | from matplotlib.patches import Circle 8 | import colorsys 9 | 10 | """ 11 | SPHERICAL COVARIANCE, i.e. isotropic 12 | 13 | Plots to demonstrate use of dpgmm_vi.py 14 | 15 | python demos.py 16 | 17 | mcusi@mit.edu, july 2018 18 | """ 19 | 20 | 21 | def gen_demo_data(batch_size=1, np_seed=None, D=2, use_zeros=True): 22 | #generates data from multivariate gaussian with isotropic covariance matrix 23 | Nmax = 6*25 24 | np.random.seed(np_seed) 25 | for b in range(batch_size): 26 | K = np.random.randint(2, high=6+1) 27 | in_dataset = 0 28 | for k in range(K): 29 | mean = 3.5*np.random.randn(D) - 1 30 | cov = 0.1*np.eye(D) + [0.25,1][np.random.randint(2)]*np.eye(D) 31 | n = np.random.randint(10, high=25+1) 32 | in_dataset += n 33 | gaussian_data = np.random.multivariate_normal(mean, cov, n) 34 | _data = gaussian_data if k == 0 else np.vstack((_data, gaussian_data)) 35 | n_zeros = Nmax - in_dataset ##use zeros to pad smaller datasets 36 | _data = np.vstack((np.zeros((n_zeros,D)),_data)) 37 | np.random.shuffle(_data) 38 | _data = np.float32(_data[np.newaxis,:,:]) 39 | data = _data if b == 0 else np.concatenate((data,_data)) 40 | 41 | return data 42 | 43 | def ELBO_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 44 | 45 | #Generate toy data 46 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 47 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 48 | 49 | #Run inference 50 | inferred_latents, ELBO_deltas = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=True) 51 | 52 | #Plot change in ELBO with updates 53 | ##If you use [1:] after each of the plot arguments, 54 | ##You can see that the change is still positive after the first iteration 55 | plt.plot(ELBO_deltas.total); plt.title('Change in ELBO with each set of updates'); plt.show(); 56 | 57 | plt.plot(ELBO_deltas.zeta_z + ELBO_deltas.zeta_x); plt.plot(ELBO_deltas.zeta_z); plt.plot(ELBO_deltas.zeta_x); 58 | plt.legend(['z+x', 'z', 'x'], loc='upper right'); plt.title('Change in z&x ELBO terms due to zeta update'); plt.show() 59 | 60 | plt.plot(ELBO_deltas.lambda_z + ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_z); plt.plot(ELBO_deltas.lambda_phi); 61 | plt.legend(['z+phi', 'z', 'phi'], loc='upper right'); plt.title('Change in z&phi ELBO terms due to lambda update'); plt.show() 62 | 63 | plt.plot(ELBO_deltas.nu_mu + ELBO_deltas.nu_x); plt.plot(ELBO_deltas.nu_mu); plt.plot(ELBO_deltas.nu_x); 64 | plt.legend(['mu+x', 'mu', 'x'], loc='upper right'); plt.title('Change in mu&x ELBO terms due to nu update'); plt.show() 65 | 66 | plt.plot(ELBO_deltas.ab_tau + ELBO_deltas.ab_x); plt.plot(ELBO_deltas.ab_tau); 
plt.plot(ELBO_deltas.ab_x); 67 | plt.legend(['tau+x', 'tau', 'x'], loc='upper right'); plt.title('Change in tau&x ELBO terms due to ab update'); plt.show() 68 | 69 | ##Plot each datapoint with a colour corresponding to the variational cluster to which it is assigned with maximum probability 70 | batch_number = 0 71 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 72 | inferred_zeta = inferred_latents.zeta[batch_number,nonzero_datapoints,:] 73 | assignments = np.argmax(inferred_zeta, axis=1) 74 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 75 | plt.gca().set_xlim([-10,10]) 76 | plt.gca().set_ylim([-10,10]) 77 | plt.gca().set_aspect('equal', adjustable='box') 78 | plt.title('MAP assignments of datapoints to clusters') 79 | plt.show() 80 | 81 | def clusters_demo(np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 82 | 83 | data = gen_demo_data(batch_size = 1, np_seed = np_seed) 84 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 85 | 86 | for n_iter in [0, 1, 2, 5, 10, max_n_iter]: 87 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=n_iter, tf_seed=tf_seed, get_elbo=False) 88 | 89 | #Plot means and datapoints as points 90 | batch_number = 0 91 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 92 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1], marker='x') 93 | plt.scatter(inferred_latents.nu[batch_number,:,0] + 0.01*np.random.randn(T), inferred_latents.nu[batch_number,:,1],marker='o',s=30,color='r') 94 | 95 | #Plot expected standard deviation as radius of circle 96 | patches = []; 97 | radii = np.sqrt(1./np.divide(inferred_latents.a, inferred_latents.b)) 98 | 99 | #Plot marginal cluster probabilities as the transparency of circle 100 | l1 = inferred_latents.lambda_1[batch_number,:] 101 | l2 = inferred_latents.lambda_2[batch_number,:] 102 | beta_means = np.divide(l1,l1 + l2) 103 | log_beta_means = np.log(beta_means + 1e-30) 104 | cs = np.concatenate(( [0], np.cumsum( np.log(1-beta_means+1e-30)[:-1]) )) #SBP 105 | beta_expectation = np.exp(log_beta_means + cs) 106 | beta_expectation /= (1.*np.sum(beta_expectation)) 107 | for k in range(T): 108 | circle = Circle((inferred_latents.nu[batch_number,k,0], inferred_latents.nu[batch_number,k,1]), radii[batch_number,k]) 109 | plt.gca().add_artist(circle) 110 | circle.set_alpha(beta_expectation[k]) 111 | plt.gca().set_xlim([-10,10]) 112 | plt.gca().set_ylim([-10,10]) 113 | plt.gca().set_aspect('equal', adjustable='box') 114 | plt.title('Variational distributions at iteration ' + str(n_iter)) 115 | plt.show() 116 | 117 | def batch_demo(batch_size=2, np_seed=0, tf_seed=0, alpha=1.0, T=100, max_n_iter=20): 118 | 119 | data = gen_demo_data(batch_size = batch_size, np_seed = np_seed) 120 | nonzero_datapoints_batches = np.where(~np.all(data==0, axis=2)) 121 | 122 | #Run inference 123 | inferred_latents = variational_inference(data, alpha=alpha, T=T, n_iter=max_n_iter, tf_seed=tf_seed, get_elbo=False) 124 | 125 | for batch_number in range(batch_size): 126 | nonzero_datapoints = nonzero_datapoints_batches[1][np.where(nonzero_datapoints_batches[0] == batch_number)] 127 | inferred_zeta = inferred_latents.zeta[batch_number, nonzero_datapoints, :] 128 | 129 | #plot weighted points 130 | 
#https://stackoverflow.com/questions/41314736/scatterplot-wherein-each-point-color-is-a-different-mixture-of-k-colors 131 | HSV = [(x*1.0/T, 0.8, 0.5) for x in np.random.permutation(T)] 132 | RGB = np.array([colorsys.hsv_to_rgb(*hsv) for hsv in HSV]) #list comprehension (rather than map) so this also works under Python 3 133 | assignments = np.sum(np.multiply(RGB[np.newaxis, :, :], inferred_zeta[:, :, np.newaxis]),axis=1) 134 | plt.scatter(data[batch_number,nonzero_datapoints,0],data[batch_number,nonzero_datapoints,1],c=assignments,marker='x') 135 | plt.title('Weighted assignments, batch ' + str(batch_number)) 136 | plt.gca().set_xlim([-10,10]) 137 | plt.gca().set_ylim([-10,10]) 138 | plt.gca().set_aspect('equal', adjustable='box') 139 | plt.show() 140 | 141 | if __name__ == "__main__": 142 | print('Spherical Covariance Model') 143 | np_seed = 23; tf_seed = 100; alpha=1.0; T=100; max_n_iter=20; 144 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 145 | print('Change in ELBO with each iteration of updates:') 146 | ELBO_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 147 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 148 | print('Change in latent parameters with increasing number of updates:') 149 | clusters_demo(np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 150 | print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') 151 | print('This script can use batches of different datasets:') 152 | batch_demo(batch_size = 2, np_seed = np_seed, tf_seed = tf_seed, alpha=alpha, T=T, max_n_iter=max_n_iter) 153 | 154 | -------------------------------------------------------------------------------- /spherical/dpgmm_vi.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from scipy.special import gamma as gamma_func 3 | import numpy as np 4 | import collections 5 | 6 | """ 7 | SPHERICAL COVARIANCE 8 | 9 | Unsupervised clustering in R^D 10 | 11 | TF implementation of variational inference in a 12 | Dirichlet Process isotropic Gaussian mixture model 13 | Derivation in Variational_Inference_in_DPGMM_Spherical.pdf included in git repo 14 | 15 | Clusters matrix X (batch_size x N x D) of N datapoints with dimensionality D 16 | If a datapoint = zero vector, it is ignored. 17 | This option allows you to use batched datasets of different sizes 18 | 19 | For examples of how to use this code, see demos.py 20 | 21 | mcusi@mit.edu, july 2018 22 | 23 | """ 24 | 25 | class dpgmm(): 26 | 27 | ######### INITIALIZATION ########################################################################################################## 28 | 29 | def __init__(self, alpha, D, n_iter, T, covariance_type='spherical'): 30 | 31 | self.alpha = alpha; #Dirichlet concentration parameter 32 | self.D = D; self.Dfl = tf.cast(self.D, dtype=tf.float32); #dimensionality of data 33 | self.T = T; #truncation value 34 | 35 | self.gaussian_const = np.divide(gamma_func(1.5)*(2.0**1.5)*self.D, np.sqrt(2.*np.pi)) 36 | 37 | #Initialization settings 38 | self.mu_std = 5.
39 | 40 | #inference settings 41 | self.n_iter = n_iter; 42 | self.log_constant = 1e-30 43 | 44 | def initialize_latents(self, X, batch_size, shared=True, use_mask=True): 45 | """ 46 | > randomly initializes variational distribution parameters 47 | > if shared == True, batches share the same initialization 48 | """ 49 | 50 | N = tf.shape(X)[1] 51 | shape_T = [self.T] if shared else [batch_size, self.T] 52 | shape_TD = [self.T, self.D] if shared else [batch_size, self.T, self.D] 53 | 54 | a = tf.get_variable("a", shape_T, dtype=tf.float32, 55 | initializer=tf.ones_initializer()) 56 | b = tf.get_variable("b", shape_T, dtype=tf.float32, 57 | initializer=tf.ones_initializer()) 58 | lambda_1 = tf.get_variable("lambda_1", shape_T, dtype=tf.float32, 59 | initializer=tf.ones_initializer()) 60 | lambda_2 = tf.get_variable("lambda_2", shape_T, dtype=tf.float32, 61 | initializer=tf.ones_initializer()) 62 | nu = tf.get_variable("nu", shape_TD, dtype=tf.float32, 63 | initializer=tf.random_normal_initializer(stddev=self.mu_std)) 64 | omega = tf.get_variable("omega", shape_T, dtype=tf.float32, 65 | initializer=tf.ones_initializer()) 66 | 67 | if shared: 68 | 69 | a = tf.tile(a[tf.newaxis, :], [batch_size, 1]) 70 | b = tf.tile(b[tf.newaxis, :], [batch_size, 1]) 71 | lambda_1 = tf.tile(lambda_1[tf.newaxis, :], [batch_size, 1]) 72 | lambda_2 = tf.tile(lambda_2[tf.newaxis, :], [batch_size, 1]) 73 | nu = tf.tile(nu[tf.newaxis, :, :], [batch_size, 1, 1]) 74 | omega = tf.tile(omega[tf.newaxis, :], [batch_size, 1]) 75 | 76 | # zeta will be the first in the update distribution 77 | # so this initialization is only necessary for ELBO calculation 78 | alpha_vec = tf.fill([batch_size, self.T], self.alpha) 79 | zeta_dist = tf.distributions.Dirichlet(alpha_vec) 80 | #zeta: batch_size N T 81 | zeta = tf.transpose(zeta_dist.sample([N]),perm=[1,0,2]) 82 | #mask: batch_size N 83 | if use_mask: 84 | mask = tf.cast(tf.logical_not(tf.reduce_all(tf.equal(X,0),axis=2)), dtype=tf.float32) 85 | else: 86 | mask = tf.ones([batch_size, N]) 87 | 88 | return a, b, lambda_1, lambda_2, nu, omega, zeta, mask 89 | 90 | ######### UPDATE EQUATIONS ########################################################################################################## 91 | 92 | def update_lambda(self, zeta_mask): 93 | ##lambda_1: only sum over datapoints 94 | #nu_z batch N T 95 | #embedding_weights batch N 96 | lambda_1 = 1.0 + tf.reduce_sum(zeta_mask, axis=1) #over N 97 | ##lambda_2: requires sum over classes as well as datapoints 98 | #nu_z: batch N T 99 | l = tf.cumsum(zeta_mask, axis=2, reverse=True, exclusive=True) #over T 100 | lambda_2 = self.alpha + tf.reduce_sum(l, axis=1) #over N 101 | return lambda_1, lambda_2 102 | 103 | def update_nu(self, a, b, zeta_mask, X): 104 | # nu_z batch N T 105 | # a batch newaxis T 106 | # b batch newaxis T 107 | w = tf.divide(tf.multiply(zeta_mask, 108 | a[:, tf.newaxis, :]), b[:, tf.newaxis,:])[:, :, :, tf.newaxis] 109 | #w : batch N T newaxis 110 | #X : batch N newaxis D 111 | numer = tf.reduce_sum(tf.multiply(w, X[:, :, tf.newaxis, :]), axis=1) #over N 112 | denom = 1.0 + tf.reduce_sum(w, axis=1) #over N 113 | # numer batch T D 114 | # denom batch T D 115 | nu = tf.divide(numer, denom) 116 | return nu 117 | 118 | def update_omega(self, a, b, zeta_mask): 119 | 120 | # a batch newaxis T 121 | # b batch newaxis T 122 | ratio = tf.multiply(tf.divide(a, b), self.gaussian_const)[:, tf.newaxis, :] 123 | # nu_z batch N T 124 | #WANT: omega batch T 125 | omega = 1.0 + tf.reduce_sum( tf.multiply(zeta_mask, ratio) , 
axis=1) #over N 126 | return omega 127 | 128 | def update_ab(self, nu, omega, zeta_mask, X): 129 | #nu_z_masked batch N T 130 | #a batch T 131 | a = 1.0 + tf.multiply(self.Dfl/2.0, tf.reduce_sum(zeta_mask, axis=1))#over N 132 | 133 | #X batch N newaxis D 134 | #nu batch newaxis T D 135 | #difference norm batch N T 136 | difference_norm = tf.reduce_sum(tf.square(X[:,:,tf.newaxis,:] - nu[:,tf.newaxis,:,:]),axis=3) 137 | #omega batch newaxis T 138 | s = difference_norm + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :]) 139 | #s batch N T 140 | #zeta_mask batch N T 141 | #b batch T 142 | b = 1.0 + 0.5*tf.reduce_sum(tf.multiply(zeta_mask, s), axis=1) #over N 143 | 144 | return a, b 145 | 146 | def eta_x(self, a, b, nu, omega, X): 147 | """ 148 | eta_x_{i, k} = E_q[log P(x_{i} | z_i = k, mu_{k}, var_{k})] 149 | 150 | """ 151 | 152 | #a batch_size, T 153 | #b batch_size, T 154 | ab1 = tf.multiply(-self.Dfl/2.0, tf.log(2*np.pi) - tf.digamma(a) + tf.log(b))[:,tf.newaxis, :] 155 | ab2 = tf.divide(a, -2.0*b)[:,tf.newaxis, :] 156 | 157 | # X: batch N newaxis D 158 | # mu: batch newaxis T D 159 | norm_difference = tf.reduce_sum(tf.square(tf.subtract(X[:, :, tf.newaxis, :], nu[:, tf.newaxis, :, :])),axis=3) 160 | #omega: batch newaxis T 161 | s = norm_difference + tf.divide(self.gaussian_const, omega[:, tf.newaxis, :]) 162 | #s: batch N T 163 | #ab1 batch_size, newaxis, T, 164 | #ab2 batch_size, newaxis, T, 165 | Eq = ab1 + tf.multiply(ab2, s) 166 | 167 | return Eq 168 | 169 | def eta_z(self, lambda_1, lambda_2): 170 | #lambda_1, lambda_2: batch_size, T 171 | d1 = tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2) 172 | d2 = tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2) 173 | d_cumsum = tf.cumsum(d2, axis=1, exclusive=True) 174 | return d1 + d_cumsum 175 | 176 | def update_zeta(self, a, b, lambda_1, lambda_2, nu, omega, X): 177 | 178 | #self.eta_x: batch N T 179 | #self.eta_z: batch newaxis T 180 | 181 | prop_log_zeta = self.eta_z(lambda_1, lambda_2)[:, tf.newaxis, :] - 1. + self.eta_x(a, b, nu, omega, X) 182 | #prop_log_nu_z batch N T 183 | log_zeta = tf.subtract(prop_log_zeta, tf.reduce_logsumexp(prop_log_zeta, axis=2, keepdims=True)) #over T 184 | zeta = tf.exp(log_zeta) 185 | 186 | return zeta 187 | 188 | def update_all(self, L, dataset): 189 | 190 | 191 | zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, dataset.X) 192 | zeta_mask = tf.multiply(dataset.mask[:,:,tf.newaxis],zeta) 193 | lambda_1, lambda_2 = self.update_lambda(zeta_mask) 194 | nu = self.update_nu(L.a, L.b, zeta_mask, dataset.X) 195 | omega = self.update_omega(L.a, L.b, zeta_mask) 196 | a, b = self.update_ab(nu, omega, zeta_mask, dataset.X) #might have to mess with the order of these 197 | 198 | return a, b, lambda_1, lambda_2, nu, omega, zeta 199 | 200 | ######### VARIATIONAL LOWER BOUND ########################################################################################################## 201 | 202 | def phi_lower_bound_term(self, lambda_1, lambda_2): 203 | """ 204 | lambda_1: [batch_size, T] 205 | lambda_2: [batch_size, T] 206 | """ 207 | term1 = tf.lgamma(1. 
+ self.alpha) - tf.lgamma(self.alpha) 208 | term2 = (self.alpha - 1.)*(tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 209 | term3 = -1*tf.lgamma(lambda_1 + lambda_2) + tf.lgamma(lambda_1) + tf.lgamma(lambda_2) 210 | term4 = tf.multiply(lambda_1 - 1., tf.digamma(lambda_1) - tf.digamma(lambda_1 + lambda_2)) 211 | term5 = tf.multiply(lambda_2 - 1., tf.digamma(lambda_2) - tf.digamma(lambda_1 + lambda_2)) 212 | #sum over clusters 213 | vb = tf.reduce_sum(term1 + term2 + term3 - term4 - term5, axis=1) 214 | return vb 215 | 216 | def mu_lower_bound_term(self, nu, omega): 217 | #nu: [batch_size, T, D] 218 | #omega [batch_size, T] 219 | tot = tf.multiply(-self.Dfl/2.0, tf.divide(1.0, omega) + tf.log(omega) - 1.0) - 0.5*tf.reduce_sum(tf.square(nu),axis=2) 220 | vb = tf.reduce_sum(tot, axis=1) #over number of clusters 221 | return vb 222 | 223 | def tau_lower_bound_term(self, a, b): 224 | #a, b: [batch_size, T] 225 | tot = tf.lgamma(a) - tf.multiply(a - 1.,tf.digamma(a)) - tf.log(b) + a - tf.divide(a, b) 226 | vb = tf.reduce_sum(tot, axis=1) #sum over clusters 227 | return vb 228 | 229 | def z_lower_bound_term(self, lambda_1, lambda_2, zeta, mask): 230 | #lambda_1: [batch_size, T] 231 | #lambda_2: [batch_size, T] 232 | #zeta: [batch_size, N, T] 233 | c = -tf.log(zeta + self.log_constant) + self.eta_z(lambda_1, lambda_2)[:,tf.newaxis,:] 234 | 235 | # batch_Size N T --> batch N 236 | e = tf.reduce_sum(tf.multiply(zeta, c),axis=2) #over clusters 237 | e_mask = tf.multiply(e, mask) 238 | vb = tf.reduce_sum(e_mask, axis=1) #over I 239 | 240 | return vb 241 | 242 | def x_lower_bound_term(self, a, b, nu, omega, zeta_mask, X): 243 | #X: batch_size, N, D 244 | #self.eta_x: batch N T 245 | EqLogPxGivenZ = self.eta_x(a, b, nu, omega, X) 246 | #zeta_mask: batch_size, N, T 247 | tot = tf.multiply(zeta_mask, EqLogPxGivenZ) 248 | #tot batch_size N T 249 | vb = tf.reduce_sum(tot, axis=[1,2]) 250 | return vb 251 | 252 | def evidence_lower_bound(self, L, D): 253 | phi_lb = self.phi_lower_bound_term(L.lambda_1, L.lambda_2) 254 | mu_lb = self.mu_lower_bound_term(L.nu, L.omega) 255 | tau_lb = self.tau_lower_bound_term(L.a, L.b) 256 | z_lb = self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask) 257 | x_lb = self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:,:,tf.newaxis],L.zeta), D.X) 258 | return phi_lb + mu_lb + tau_lb + z_lb + x_lb 259 | 260 | ######### INFERENCE FUNCTIONS ########################################################################################################## 261 | 262 | def infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask): 263 | """ 264 | Performs variational inference in DPGMM for n_iter number of iterations, 265 | then returns inferred latent variables 266 | 267 | _a, _b, _lambda_1, _lambda_2, _nu, _zeta: initial parameters for inference 268 | X: data matrix (batch_size x nDatapoints x dimensions) 269 | mask: 1 if consider as datapoint, 0 if ignore (batch_size x nDatapoints) 270 | """ 271 | 272 | ##Initial input into "while" loop, i.e., inference iterations 273 | i = tf.constant(0) 274 | latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta']) 275 | dataset = collections.namedtuple('dataset', ['X', 'mask']) 276 | init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) 277 | 278 | cond = lambda i, L, D: i < self.n_iter 279 | def body(i, L, D): 280 | a, b, lambda_1, lambda_2, nu, omega, zeta = self.update_all(L, D) 281 | return (i + 1, latents(a, b, 
    def evidence_lower_bound(self, L, D):
        phi_lb = self.phi_lower_bound_term(L.lambda_1, L.lambda_2)
        mu_lb = self.mu_lower_bound_term(L.nu, L.omega)
        tau_lb = self.tau_lower_bound_term(L.a, L.b)
        z_lb = self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask)
        x_lb = self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:, :, tf.newaxis], L.zeta), D.X)
        return phi_lb + mu_lb + tau_lb + z_lb + x_lb

    ######### INFERENCE FUNCTIONS ##########################################################################################################

    def infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask):
        """
        Performs variational inference in the DPGMM for n_iter iterations,
        then returns the inferred latent variables

        _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta: initial parameters for inference
        X: data matrix (batch_size x nDatapoints x dimensions)
        mask: 1 if considered a datapoint, 0 if ignored (batch_size x nDatapoints)
        """

        ##Initial input into "while" loop, i.e., inference iterations
        i = tf.constant(0)
        latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta'])
        dataset = collections.namedtuple('dataset', ['X', 'mask'])
        init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask))

        cond = lambda i, L, D: i < self.n_iter
        def body(i, L, D):
            a, b, lambda_1, lambda_2, nu, omega, zeta = self.update_all(L, D)
            return (i + 1, latents(a, b, lambda_1, lambda_2, nu, omega, zeta), D)

        final_iteration = tf.while_loop(cond, body, init_iteration)

        return final_iteration[1]

    def elbo_infer(self, _a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta, X, mask, batch_size):
        """
        Performs variational inference in the DPGMM for n_iter iterations,
        and also calculates the change in the ELBO at each update.
        Returns the inferred latent variables and the changes in the ELBO.
        """

        i = tf.constant(0)
        latents = collections.namedtuple('latents', ['a', 'b', 'lambda_1', 'lambda_2', 'nu', 'omega', 'zeta'])
        dataset = collections.namedtuple('dataset', ['X', 'mask'])
        #ELBO term names: "updated-variable_term-of-lower-bound"
        ELBO_terms = ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total']
        empty_ELBO_terms = tuple([tf.TensorArray(dtype=tf.float32, size=self.n_iter, element_shape=batch_size, name=ELBO_terms[j]) for j in range(11)])
        init_iteration = (i, latents(_a, _b, _lambda_1, _lambda_2, _nu, _omega, _zeta), dataset(X, mask)) + empty_ELBO_terms

        cond = lambda i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total: i < self.n_iter
        def body(i, L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total):

            zeta = self.update_zeta(L.a, L.b, L.lambda_1, L.lambda_2, L.nu, L.omega, D.X)
            zeta_mask = tf.multiply(D.mask[:, :, tf.newaxis], zeta)
            zeta_z = zeta_z.write(i, self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask) - self.z_lower_bound_term(L.lambda_1, L.lambda_2, L.zeta, D.mask))
            zeta_x = zeta_x.write(i, self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, tf.multiply(D.mask[:, :, tf.newaxis], L.zeta), D.X))

            lambda_1, lambda_2 = self.update_lambda(zeta_mask)
            lambda_phi = lambda_phi.write(i, self.phi_lower_bound_term(lambda_1, lambda_2) - self.phi_lower_bound_term(L.lambda_1, L.lambda_2))
            lambda_z = lambda_z.write(i, self.z_lower_bound_term(lambda_1, lambda_2, zeta, D.mask) - self.z_lower_bound_term(L.lambda_1, L.lambda_2, zeta, D.mask))

            nu = self.update_nu(L.a, L.b, zeta_mask, D.X)
            nu_mu = nu_mu.write(i, self.mu_lower_bound_term(nu, L.omega) - self.mu_lower_bound_term(L.nu, L.omega))
            nu_x = nu_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, L.nu, L.omega, zeta_mask, D.X))

            omega = self.update_omega(L.a, L.b, zeta_mask)
            omega_mu = omega_mu.write(i, self.mu_lower_bound_term(nu, omega) - self.mu_lower_bound_term(nu, L.omega))
            omega_x = omega_x.write(i, self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, nu, L.omega, zeta_mask, D.X))

            a, b = self.update_ab(nu, omega, zeta_mask, D.X)
            ab_tau = ab_tau.write(i, self.tau_lower_bound_term(a, b) - self.tau_lower_bound_term(L.a, L.b))
            ab_x = ab_x.write(i, self.x_lower_bound_term(a, b, nu, omega, zeta_mask, D.X) - self.x_lower_bound_term(L.a, L.b, nu, omega, zeta_mask, D.X))

            updated_L = latents(a, b, lambda_1, lambda_2, nu, omega, zeta)
            total = total.write(i, self.evidence_lower_bound(updated_L, D) - self.evidence_lower_bound(L, D))

            return (i+1, updated_L, D, zeta_z, zeta_x, lambda_phi, lambda_z, nu_mu, nu_x, omega_mu, omega_x, ab_tau, ab_x, total)

        final_iteration = tf.while_loop(cond, body, init_iteration)

        return final_iteration[1], [final_iteration[i].stack() for i in range(3, 14)]

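    # Each stacked delta returned by elbo_infer has shape [n_iter, batch_size]. If the updates are
    # exact coordinate-ascent steps on the ELBO (as the derivations intend), the 'total' deltas
    # recorded above should be non-negative up to numerical error; a persistently negative entry
    # indicates a bug in an update or in one of the lower-bound terms.
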
def variational_inference(data, alpha=1.0, T=10, n_iter=10, tf_seed=None, get_elbo=False, tf_device='/cpu:0'):
    """
    Tensorflow setup to run variational inference

    data: matrix of datapoints, size should be (batch_size, max_number_of_datapoints, dimensionality_of_data)
        batches that have different numbers of datapoints can be run together by padding the smaller data matrices with zero vectors
    alpha: Dirichlet process concentration parameter
    T: truncation level (maximum number of clusters in the variational posterior)
    n_iter: number of iterations to run VI for
    tf_seed: random seed for tensorflow
    get_elbo: if True, measure & return the change in ELBO for each update
    tf_device: tensorflow device on which to build and run the graph
    """

    #size of dataset
    batch_size = np.shape(data)[0]
    N = np.shape(data)[1]
    D = np.shape(data)[2]

    with tf.Graph().as_default():
        with tf.device(tf_device):

            tf.set_random_seed(tf_seed)
            X = tf.placeholder(tf.float32, shape=[batch_size, N, D])

            mixture_model = dpgmm(alpha, D, n_iter, T)
            init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, mask = mixture_model.initialize_latents(X, batch_size, shared=False)

            if not get_elbo:
                inferred_latents = mixture_model.infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask)
            else:
                inferred_latents, ELBO_deltas = mixture_model.elbo_infer(init_a, init_b, init_lambda_1, init_lambda_2, init_nu, init_omega, init_zeta, X, mask, batch_size)

            ##Run graph
            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:

                sess.run(tf.global_variables_initializer())
                if not get_elbo:
                    inferred_latents_out = sess.run([inferred_latents], feed_dict={X: data})
                    return inferred_latents_out[0]
                else:
                    inferred_latents_out, ELBO_deltas_out = sess.run([inferred_latents, ELBO_deltas], feed_dict={X: data})
                    ELBO_terms = collections.namedtuple('ELBO_terms', ['zeta_z', 'zeta_x', 'lambda_phi', 'lambda_z', 'nu_mu', 'nu_x', 'omega_mu', 'omega_x', 'ab_tau', 'ab_x', 'total'])
                    return inferred_latents_out, ELBO_terms(*ELBO_deltas_out)
--------------------------------------------------------------------------------
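As a quick orientation to ```variational_inference``` above, here is a minimal usage sketch. It is not part of the repository: it assumes the file is importable as ```dpgmm_vi``` and fabricates a small two-cluster dataset, but the call signature and the returned ```latents``` fields (```a, b, lambda_1, lambda_2, nu, omega, zeta```) are as defined in the listing above.

```python
import numpy as np
from dpgmm_vi import variational_inference  # assumes dpgmm_vi.py is on the path

# Illustrative two-cluster dataset: batch of 1, 200 points in 2 dimensions
rng = np.random.RandomState(0)
data = np.concatenate([rng.randn(100, 2) + [4., 0.],
                       rng.randn(100, 2) + [-4., 0.]])[np.newaxis].astype(np.float32)

latents = variational_inference(data, alpha=1.0, T=10, n_iter=20)
assignments = np.argmax(latents.zeta, axis=2)  # most probable cluster per datapoint, shape (1, 200)
print(latents.nu[0])                           # variational means of the T candidate clusters, shape (10, 2)
```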