├── README.md
├── VSSGP_example.py
├── VSSGP_model.py
└── VSSGP_opt.py

/README.md:
--------------------------------------------------------------------------------
Variational Sparse Spectrum Gaussian Process toolkit (VSSGP)
====

This is the implementation of the inference developed in "[Improving the Gaussian Process Sparse Spectrum Approximation by Representing Uncertainty in Frequency Inputs](http://mlg.eng.cam.ac.uk/yarin/publications.html#Gal2015Improving)". Please see the paper for further details.

This code assumes that the Theano repository was cloned into the parent folder. To clone the repo, run:
```
git clone https://github.com/Theano/Theano.git
```

To run the example file, execute:
```
python VSSGP_example.py
```

Note that Theano will compile the code the first time the example is run, which might take some time.

--------------------------------------------------------------------------------
/VSSGP_example.py:
--------------------------------------------------------------------------------
from VSSGP_opt import VSSGP_opt
from scipy.optimize import minimize
import numpy as np
from numpy.random import randn, rand
np.set_printoptions(precision=2, suppress=True)
import pylab; pylab.ion()  # turn interactive mode on

N, Q, D, K = 250, 1, 1, 25
components, init_period, init_lengthscales, sf2s, tau = 2, 1e32, 1, np.array([1, 5]), 1

# Some synthetic data to play with
X = rand(N,Q) * 5*np.pi
X = np.sort(X, axis=0)
Z = rand(Q,K,components) * 5*np.pi
#a, b, c, d, e, f = randn(), randn(), randn(), randn(), randn(), randn()
#a, b, c, d, e, f = 0.6, 0.7, -0.6, 0.5, -0.1, -0.8
#a, b, c, d, e, f = -0.6, -0.3, -0.6, 0.6, 0.7, 0.6
#a, b, c, d, e, f = -0.5, -0.3, -0.6, 0.1, 1.1, 0.1
a, b, c, d, e, f = 0.6, -1.8, -0.5, -0.5, 1.7, 0
Y = a*np.sin(b*X+c) + d*np.sin(e*X+f)

# Initialise near the posterior:
mu = randn(Q,K,components)
# TODO: currently tuned by hand to the smallest value that doesn't diverge; we break symmetry
# to allow some features to become very small while others become very large
feature_lengthscale = 5  # features are non-diminishing up to feature_lengthscale / lengthscale from z / lengthscale
lSigma = np.log(randn(Q,K,components)**2 / feature_lengthscale**2)  # feature weights are np.exp(-0.5 * (x-z)**2 * Sigma / lengthscale**2)
lalpha = np.log(rand(K,components)*2*np.pi)
lalpha_delta = np.log(rand(K,components) * (2*np.pi - lalpha))
m = randn(components*K,D)
ls = np.zeros((components*K,D)) - 4
lhyp = np.log(1 + 1e-2*randn(2*Q+1, components))  # break symmetry
lhyp[0,:] += np.log(sf2s)  # sf2
lhyp[1:Q+1,:] += np.log(init_lengthscales)  # length-scales
lhyp[Q+1:,:] += np.log(init_period)  # period
ltau = np.log(tau)  # precision
lstsq = np.linalg.lstsq(np.hstack([X, np.ones((N,1))]), Y)[0]
a = 0*np.atleast_2d(lstsq[0])  # mean function slope
b = 0*lstsq[1]  # mean function intercept

opt_params = {'Z': Z, 'm': m, 'ls': ls, 'mu': mu, 'lSigma': lSigma, 'lhyp': lhyp, 'ltau': ltau}
fixed_params = {'lalpha': lalpha, 'lalpha_delta': lalpha_delta, 'a': a, 'b': b}
inputs = {'X': X, 'Y': Y}
vssgp_opt = VSSGP_opt(N, Q, D, K, inputs, opt_params, fixed_params, use_exact_A=True, print_interval=1)

# LBFGS
x0 = np.concatenate([np.atleast_2d(opt_params[n]).flatten() for n in vssgp_opt.opt_param_names])
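# x0 packs the optimised parameters into one flat vector, in the order given by
# vssgp_opt.opt_param_names; VSSGP_opt.unpack later splits it back into the
# individual arrays using their stored shapes.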
pylab.figure(num=None, figsize=(12, 9), dpi=80, facecolor='w', edgecolor='w')
vssgp_opt.callback(x0)
res = minimize(vssgp_opt.func, x0, method='L-BFGS-B', jac=vssgp_opt.fprime,
    options={'ftol': 0, 'disp': False, 'maxiter': 500}, tol=0, callback=vssgp_opt.callback)

raw_input("PRESS ENTER TO CONTINUE.")
--------------------------------------------------------------------------------
/VSSGP_model.py:
--------------------------------------------------------------------------------
# To speed Theano up, create a RAM disk: mount -t tmpfs -o size=512m tmpfs /mnt/ramdisk
# Then use the flag THEANO_FLAGS='base_compiledir=/mnt/ramdisk' python script.py
import sys; sys.path.insert(0, "../Theano"); sys.path.insert(0, "../../Theano")
import theano; import theano.tensor as T; import theano.sandbox.linalg as sT
import numpy as np
import cPickle

print 'Theano version: ' + theano.__version__ + ', base compile dir: ' + theano.config.base_compiledir
theano.config.mode = 'FAST_RUN'
theano.config.optimizer = 'fast_run'
theano.config.reoptimize_unpickled_function = False

class VSSGP:
    def __init__(self, use_exact_A = False):
        try:
            print 'Trying to load model...'
            with open('model_exact_A.save' if use_exact_A else 'model.save', 'rb') as file_handle:
                self.f, self.g = cPickle.load(file_handle)
                print 'Loaded!'
                return
        except:
            print 'Failed. Creating a new model...'

        print 'Setting up variables...'
        Z, mu, lSigma = T.dtensor3s('Z', 'mu', 'lSigma')
        X, Y, m, ls, lhyp, lalpha, lalpha_delta, a = T.dmatrices('X', 'Y', 'm', 'ls', 'lhyp', 'lalpha', 'lalpha_delta', 'a')
        b = T.dvector('b')
        ltau = T.dscalar('ltau')
        Sigma, alpha, alpha_delta, tau = T.exp(lSigma), T.exp(lalpha), T.exp(lalpha_delta), T.exp(ltau)
        alpha = alpha % (2*np.pi)  # wrap phases into [0, 2*pi)
        beta = T.minimum(alpha + alpha_delta, 2*np.pi)
        (N, Q), D, K = X.shape, Y.shape[1], mu.shape[1]
        sf2s, lss, ps = T.exp(lhyp[0]), T.exp(lhyp[1:1+Q]), T.exp(lhyp[1+Q:])  # signal variances, length-scales and periods

        print 'Setting up model...'
        if not use_exact_A:
            LL, KL, Y_pred_mean, Y_pred_var, EPhi, EPhiTPhi, opt_A_mean, opt_A_cov = self.get_model(Y, X, Z, alpha, beta,
                mu, Sigma, m, ls, sf2s, lss, ps, tau, a, b, N, Q, D, K)
        else:
            LL, KL, Y_pred_mean, Y_pred_var, EPhi, EPhiTPhi, opt_A_mean, opt_A_cov = self.get_model_exact_A(Y, X, Z, alpha, beta,
                mu, Sigma, m, ls, sf2s, lss, ps, tau, a, b, N, Q, D, K)

        print 'Compiling model...'
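        # self.f maps output names ('LL', 'KL', 'Y_pred_mean', 'Y_pred_var', 'EPhi',
        # 'EPhiTPhi', 'opt_A_mean', 'opt_A_cov') to compiled Theano functions, and
        # self.g maps each parameter name to gradient functions of 'LL' and 'KL',
        # so callers can evaluate e.g. self.f['LL'](**inputs) or self.g['lhyp']['LL'](**inputs).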
        inputs = {'X': X, 'Y': Y, 'Z': Z, 'mu': mu, 'lSigma': lSigma, 'm': m, 'ls': ls, 'lalpha': lalpha,
            'lalpha_delta': lalpha_delta, 'lhyp': lhyp, 'ltau': ltau, 'a': a, 'b': b}
        z = 0.0*sum([T.sum(v) for v in inputs.values()])  # work around a bug with derivatives wrt inputs not in the graph
        f = zip(['opt_A_mean', 'opt_A_cov', 'EPhi', 'EPhiTPhi', 'Y_pred_mean', 'Y_pred_var', 'LL', 'KL'],
            [opt_A_mean, opt_A_cov, EPhi, EPhiTPhi, Y_pred_mean, Y_pred_var, LL, KL])
        self.f = {n: theano.function(inputs.values(), f+z, name=n, on_unused_input='ignore') for n,f in f}
        g = zip(['LL', 'KL'], [LL, KL])
        wrt = {'Z': Z, 'mu': mu, 'lSigma': lSigma, 'm': m, 'ls': ls, 'lalpha': lalpha,
            'lalpha_delta': lalpha_delta, 'lhyp': lhyp, 'ltau': ltau, 'a': a, 'b': b}
        self.g = {vn: {gn: theano.function(inputs.values(), T.grad(gv+z, vv), name='d'+gn+'_d'+vn,
            on_unused_input='ignore') for gn,gv in g} for vn, vv in wrt.iteritems()}

        with open('model_exact_A.save' if use_exact_A else 'model.save', 'wb') as file_handle:
            print 'Saving model...'
            sys.setrecursionlimit(2000)
            cPickle.dump([self.f, self.g], file_handle, protocol=cPickle.HIGHEST_PROTOCOL)

    def get_EPhi(self, X, Z, alpha, beta, mu, Sigma, sf2s, lss, ps, K):
        two_over_K = 2.*sf2s[None, None, :]/K  # N x K x comp
        mean_p, std_p = ps**-1, (2*np.pi*lss)**-1  # Q x comp
        Ew = std_p[:, None, :] * mu + mean_p[:, None, :]  # Q x K x comp
        XBAR = 2 * np.pi * (X[:, :, None, None] - Z[None, :, :, :])  # N x Q x K x comp
        decay = T.exp(-0.5 * ((std_p[None, :, None, :] * XBAR)**2 * Sigma[None, :, :, :]).sum(1))  # N x K x comp

        cos_w = T.cos(alpha + (XBAR * Ew[None, :, :, :]).sum(1))  # N x K x comp
        EPhi = two_over_K**0.5 * decay * cos_w
        EPhi = EPhi.flatten(2)  # N x K*comp

        cos_2w = T.cos(2 * alpha + 2 * (XBAR * Ew[None, :, :, :]).sum(1))  # N x K x comp
        E_cos_sq = two_over_K * (0.5 + 0.5*decay**4 * cos_2w)  # N x K x comp
        EPhiTPhi = (EPhi.T).dot(EPhi)
        EPhiTPhi = EPhiTPhi - T.diag(T.diag(EPhiTPhi)) + T.diag(E_cos_sq.sum(0).flatten(1))
        return EPhi, EPhiTPhi, E_cos_sq

    def get_opt_A(self, tau, EPhiTPhi, YT_EPhi):
        SigInv = EPhiTPhi + (tau**-1 + 1e-4) * T.identity_like(EPhiTPhi)
        cholTauSigInv = tau**0.5 * sT.cholesky(SigInv)
        invCholTauSigInv = sT.matrix_inverse(cholTauSigInv)
        tauInvSig = invCholTauSigInv.T.dot(invCholTauSigInv)
        Sig_EPhiT_Y = tau * tauInvSig.dot(YT_EPhi.T)
        return Sig_EPhiT_Y, tauInvSig, cholTauSigInv

    def get_model(self, Y, X, Z, alpha, beta, mu, Sigma, m, ls, sf2s, lss, ps, tau, a, b, N, Q, D, K):
        s = T.exp(ls)
        Y = Y - (X.dot(a) + b[None,:])
        EPhi, EPhiTPhi, E_cos_sq = self.get_EPhi(X, Z, alpha, beta, mu, Sigma, sf2s, lss, ps, K)
        YT_EPhi = Y.T.dot(EPhi)

        LL = (-0.5*N*D * np.log(2 * np.pi) + 0.5*N*D * T.log(tau) - 0.5*tau*T.sum(Y**2)
              - 0.5*tau * T.sum(EPhiTPhi * (T.diag(s.sum(1)) + T.sum(m[:,None,:]*m[None,:,:], axis=2)))
              + tau * T.sum((Y.T.dot(EPhi)) * m.T))

        KL_A = 0.5 * (s + m**2 - ls - 1).sum()
        KL_w = 0.5 * (Sigma + mu**2 - T.log(Sigma) - 1).sum()
        KL = KL_A + KL_w

        Y_pred_mean = EPhi.dot(m) + (X.dot(a) + b[None,:])
        Psi = T.sum(E_cos_sq.flatten(2)[:, :, None] * s[None, :, :], 1)  # N x D
        flat_diag_n = E_cos_sq.flatten(2) - EPhi**2  # N x K*comp
        Y_pred_var = tau**-1 * T.eye(D) + np.transpose(m.T.dot(flat_diag_n[:, :, None] * m),(1,0,2)) \
            + T.eye(D)[None, :, :] * Psi[:, :, None]

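        # opt_A_mean and opt_A_cov are the analytically optimal mean and covariance of
        # q(A) given the current expected features; the optimiser's callback uses them
        # to overwrite m and ls when use_exact_A is switched on.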
        opt_A_mean, opt_A_cov, _ = self.get_opt_A(tau, EPhiTPhi, YT_EPhi)
        return LL, KL, Y_pred_mean, Y_pred_var, EPhi, EPhiTPhi, opt_A_mean, opt_A_cov

    def get_model_exact_A(self, Y, X, Z, alpha, beta, mu, Sigma, m, ls, sf2s, lss, ps, tau, a, b, N, Q, D, K):
        Y = Y - (X.dot(a) + b[None,:])
        EPhi, EPhiTPhi, E_cos_sq = self.get_EPhi(X, Z, alpha, beta, mu, Sigma, sf2s, lss, ps, K)
        YT_EPhi = Y.T.dot(EPhi)

        opt_A_mean, opt_A_cov, cholSigInv = self.get_opt_A(tau, EPhiTPhi, YT_EPhi)
        LL = (-0.5*N*D * np.log(2 * np.pi) + 0.5*N*D * T.log(tau) - 0.5*tau*T.sum(Y**2)
              - 0.5*D * T.sum(2*T.log(T.diag(cholSigInv)))
              + 0.5*tau * T.sum(opt_A_mean.T * YT_EPhi))

        KL_w = 0.5 * (Sigma + mu**2 - T.log(Sigma) - 1).sum()

        ''' For prediction, m is assumed to be [m_1, ..., m_d] with m_i = opt_a_i, and ls = opt_A_cov '''
        Y_pred_mean = EPhi.dot(m) + (X.dot(a) + b[None,:])
        EphiTphi = EPhi[:, :, None] * EPhi[:, None, :]  # N x K*comp x K*comp
        comp = sf2s.shape[0]
        EphiTphi = EphiTphi - T.eye(K*comp)[None, :, :] * EphiTphi + T.eye(K*comp)[None, :, :] * E_cos_sq.flatten(2)[:, :, None]
        Psi = T.sum(T.sum(EphiTphi * ls[None, :, :], 2), 1)  # N
        flat_diag_n = E_cos_sq.flatten(2) - EPhi**2  # N x K*comp
        Y_pred_var = tau**-1 * T.eye(D) + np.transpose(m.T.dot(flat_diag_n[:, :, None] * m),(1,0,2)) \
            + T.eye(D)[None, :, :] * Psi[:, None, None]

        return LL, KL_w, Y_pred_mean, Y_pred_var, EPhi, EPhiTPhi, opt_A_mean, opt_A_cov
--------------------------------------------------------------------------------
/VSSGP_opt.py:
--------------------------------------------------------------------------------
import numpy as np
from VSSGP_model import VSSGP
import pylab
import multiprocessing
def extend(x, y, z = {}):
    return dict(x.items() + y.items() + z.items())
pool, global_f, global_g = None, None, None
def eval_f_LL((X, Y, params)):
    return global_f['LL'](**extend({'X': X, 'Y': Y}, params))
def eval_g_LL((name, X, Y, params)):
    return global_g[name]['LL'](**extend({'X': X, 'Y': Y}, params))

class VSSGP_opt():
    def __init__(self, N, Q, D, K, inputs, opt_params, fixed_params, use_exact_A = False, test_set = {},
            parallel = False, batch_size = None, components = None, print_interval = None):
        self.vssgp, self.N, self.Q, self.K, self.fixed_params = VSSGP(use_exact_A), N, Q, K, fixed_params
        self.use_exact_A, self.parallel, self.batch_size = use_exact_A, parallel, batch_size
        self.inputs, self.test_set = inputs, test_set
        self.print_interval = 10 if print_interval is None else print_interval
        self.opt_param_names = [n for n,_ in opt_params.iteritems()]
        opt_param_values = [np.atleast_2d(opt_params[n]) for n in self.opt_param_names]
        self.shapes = [v.shape for v in opt_param_values]
        self.sizes = [sum([np.prod(x) for x in self.shapes[:i]]) for i in xrange(len(self.shapes)+1)]
        self.components = opt_params['lSigma'].shape[2] if components is None else components
        self.colours = [np.random.rand(3,1) for c in xrange(self.components)]
        self.callback_counter = [0]
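        # Default to treating every input point as a training point (and none as test
        # points) when no explicit train/test split is supplied.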
        if 'train_ind' not in test_set:
            print 'train_ind not found!'
            self.test_set['train_ind'] = np.arange(inputs['X'].shape[0]).astype(int)
            self.test_set['test_ind'] = np.arange(0).astype(int)
        if batch_size is not None:
            if parallel:
                global pool, global_f, global_g
                global_f, global_g = self.vssgp.f, self.vssgp.g
                pool = multiprocessing.Pool(int(self.N / self.batch_size))
            else:
                self.params = np.concatenate([v.flatten() for v in opt_param_values])
                self.param_updates = np.zeros_like(self.params)
                self.moving_mean_squared = np.zeros_like(self.params)
                self.learning_rates = 1e-2*np.ones_like(self.params)

    def unpack(self, x):
        x_param_values = [x[self.sizes[i-1]:self.sizes[i]].reshape(self.shapes[i-1]) for i in xrange(1,len(self.shapes)+1)]
        params = {n:v for (n,v) in zip(self.opt_param_names, x_param_values)}
        if 'ltau' in params:
            params['ltau'] = params['ltau'].squeeze()
        return params

    def func(self, x):
        params = extend(self.fixed_params, self.unpack(x))
        if self.batch_size is not None:
            X, Y, splits = self.inputs['X'], self.inputs['Y'], int(self.N / self.batch_size)
            if self.parallel:
                arguments = [(X[i::splits], Y[i::splits], params) for i in xrange(splits)]
                LL = sum(pool.map_async(eval_f_LL, arguments).get(9999999))
                KL = self.vssgp.f['KL'](**extend({'X': [[0]], 'Y': [[0]]}, params))
            else:
                split = np.random.randint(splits)
                LL = self.N / self.batch_size * self.vssgp.f['LL'](**extend({'X': X[split::splits], 'Y': Y[split::splits]}, params))
                print LL
                KL = self.vssgp.f['KL'](**extend({'X': [[0]], 'Y': [[0]]}, params))
        else:
            params = extend(self.inputs, params)
            LL, KL = self.vssgp.f['LL'](**params), self.vssgp.f['KL'](**params)
        return -(LL - KL)

    def fprime(self, x):
        grads, params = [], extend(self.fixed_params, self.unpack(x))
        for n in self.opt_param_names:
            if self.batch_size is not None:
                X, Y, splits = self.inputs['X'], self.inputs['Y'], int(self.N / self.batch_size)
                if self.parallel:
                    arguments = [(n, X[i::splits], Y[i::splits], params) for i in xrange(splits)]
                    dLL = sum(pool.map_async(eval_g_LL, arguments).get(9999999))
                    dKL = self.vssgp.g[n]['KL'](**extend({'X': [[0]], 'Y': [[0]]}, params))
                else:
                    split = np.random.randint(splits)
                    dLL = self.N / self.batch_size * self.vssgp.g[n]['LL'](**extend({'X': X[split::splits], 'Y': Y[split::splits]}, params))
                    dKL = self.vssgp.g[n]['KL'](**extend({'X': [[0]], 'Y': [[0]]}, params))
            else:
                params = extend(self.inputs, params)
                dLL, dKL = self.vssgp.g[n]['LL'](**params), self.vssgp.g[n]['KL'](**params)
            grads += [-(dLL - dKL)]
        return np.concatenate([grad.flatten() for grad in grads])

    def plot_func(self, X, Y, plot_test):
        vis_ind = self.test_set['test_ind'] if plot_test else self.test_set['train_ind']
        N = self.test_set['train_ind'].shape[0] + self.test_set['test_ind'].shape[0]
        invis_ind = np.setdiff1d(np.arange(N), vis_ind)
        x, y = np.empty(N), np.empty(N)
        x[invis_ind], y[invis_ind] = np.nan, np.nan
        x[vis_ind], y[vis_ind] = X[:,0], Y[:,0]
        pylab.plot(x, y, c="#a40000" if not plot_test else "#4e9a06")

    def plot_predict(self, X, params, plot_test):
        inputs = {'X': X, 'Y': [[0]]}
        params = extend(inputs, self.fixed_params, params)
        mean = self.vssgp.f['Y_pred_mean'](**params)[:,0]
        std = self.vssgp.f['Y_pred_var'](**params)[:,0,0]**0.5
        lower_bound, upper_bound = mean - 2*std, mean + 2*std
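        # Entries outside the visible index set are filled with NaN below, so pylab
        # leaves gaps instead of drawing lines across them.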
        vis_ind = self.test_set['test_ind'] if plot_test else self.test_set['train_ind']
        N = self.test_set['train_ind'].shape[0] + self.test_set['test_ind'].shape[0]
        invis_ind = np.setdiff1d(np.arange(N), vis_ind)
        x, y, y1, y2 = np.empty(N), np.empty(N), np.empty(N), np.empty(N)
        x[invis_ind], y[invis_ind], y1[invis_ind], y2[invis_ind] = np.nan, np.nan, np.nan, np.nan
        x[vis_ind], y[vis_ind], y1[vis_ind], y2[vis_ind] = X[:,0], mean, lower_bound, upper_bound
        pylab.plot(x, y, c="#204a87")
        pylab.fill_between(x, y1, y2, facecolor="#3465a4", color='w', alpha=0.25)

    def callback(self, x):
        if self.callback_counter[0]%self.print_interval == 0:
            opt_params = self.unpack(x)
            params = extend(self.inputs, self.fixed_params, opt_params)

            if self.use_exact_A:
                opt_A_mean = self.vssgp.f['opt_A_mean'](**params)
                opt_A_cov = self.vssgp.f['opt_A_cov'](**params)
                if 'm' in self.fixed_params:
                    self.fixed_params['m'] = opt_A_mean
                    self.fixed_params['ls'] = opt_A_cov
                else:
                    opt_params['m'] = opt_A_mean
                    opt_params['ls'] = opt_A_cov

            pylab.clf()
            pylab.subplot(3, 1, 1)
            self.plot_func(params['X'], params['Y'], False)
            self.plot_predict(self.inputs['X'], opt_params, False)
            if 'X' in self.test_set:
                self.plot_func(self.test_set['X'], self.test_set['Y'], True)
                self.plot_predict(self.test_set['X'], opt_params, True)
            for c in xrange(self.components):
                pylab.scatter(params['Z'][0,:,c], 0*params['Z'][0,:,c], c=self.colours[c], zorder=3, edgecolors='none')

            hyp = np.exp(params['lhyp'].copy())
            sf2s = hyp[0]
            lss = hyp[1:1+self.Q]
            ps = hyp[1+self.Q:]
            mean_p, std_p = ps**-1, (2*np.pi*lss)**-1  # Q x comp
            mu, Sigma = params['mu'].copy(), np.exp(params['lSigma'].copy())
            min_mean = (std_p[None, :] * mu[0, :, :] + mean_p[None, :]).min()
            max_mean = (std_p[None, :] * mu[0, :, :] + mean_p[None, :]).max()
            min_std = (std_p[None, :] * Sigma[0, :, :]).max()**0.5
            max_std = (std_p[None, :] * Sigma[0, :, :]).max()**0.5
            linspace = np.linspace(min_mean-2*min_std, max_mean+2*max_std, 1000)

            pylab.subplot(3, 1, 2)
            for c in xrange(self.components):
                pdf = pylab.normpdf(linspace,mean_p[:,c],np.min(std_p[:,c],1e-5))
                pylab.plot(linspace,pdf,c=self.colours[c], linewidth=1.0)
            pylab.ylim(0,100)

            pylab.subplot(3, 1, 3)
            for c in xrange(self.components):
                for (mean, std) in zip(mu[0,:,c], Sigma[0,:,c]**0.5):
                    pdf = pylab.normpdf(linspace,std_p[:,c]*mean+mean_p[:,c],np.min(std_p[:,c]*std,1e-5))
                    pylab.plot(linspace,pdf,c=self.colours[c], linewidth=1.0)
            pylab.ylim(0,100)
            pylab.draw()

            print 'sf2 = ' + str(sf2s.squeeze())
            print 'l = ' + str(lss.squeeze())
            print 'p = ' + str(ps.squeeze())
            print 'tau = ' + str(np.exp(params['ltau']))
            print 'mu = '
            print params['mu'][:,:5,:]
            print 'Sigma = '
            print np.exp(params['lSigma'][:,:5,:])
            print 'm = '
            print params['m'][:5,:].T
            print 's = '
            print np.exp(params['ls'][:5,:].T)
            print 'a = ' + str(params['a']) + ', b = ' + str(params['b'])
            print 'EPhi = '
            EPhi = self.vssgp.f['EPhi'](**params)
            print EPhi[:5,:5]
            LL = self.vssgp.f['LL'](**params)
            KL = self.vssgp.f['KL'](**params)
            print LL - KL
        self.callback_counter[0] += 1

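    # Alternative to L-BFGS: one RMSPROP update over the entries of self.params selected
    # by mask. The optimiser state (self.params, self.param_updates,
    # self.moving_mean_squared, self.learning_rates) is initialised in __init__ when a
    # batch_size is given and parallel is False.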
    def rmsprop_one_step(self, mask, decay = 0.9, momentum = 0, learning_rate_adapt = 0.05,
            learning_rate_min = 1e-6, learning_rate_max = 10):
        # RMSPROP: Tieleman, T. and Hinton, G. (2012), Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning
        # Implementation based on https://github.com/BRML/climin/blob/master/climin/rmsprop.py

        # We use Nesterov momentum: first, we make a step according to the momentum and then we calculate the gradient.
        step1 = self.param_updates * momentum
        self.params[mask] += step1[mask]
        grad = -self.fprime(self.params)

        self.moving_mean_squared[mask] = (decay * self.moving_mean_squared[mask] + (1 - decay) * grad[mask] ** 2)
        step2 = self.learning_rates * grad / (self.moving_mean_squared + 1e-8)**0.5
        self.params[mask] += step2[mask]

        step = step1 + step2

        # Step rate adaptation. If the current step and the momentum agree, we slightly increase the step rate for that dimension.
        if learning_rate_adapt:
            # This code might look weird, but it makes it work with both numpy and gnumpy.
            step_non_negative = step > 0
            step_before_non_negative = self.param_updates > 0
            agree = (step_non_negative == step_before_non_negative) * 1.
            adapt = 1 + agree * learning_rate_adapt * 2 - learning_rate_adapt
            self.learning_rates[mask] *= adapt[mask]
            self.learning_rates[mask] = np.clip(self.learning_rates[mask], learning_rate_min, learning_rate_max)

        self.param_updates[mask] = step[mask]
--------------------------------------------------------------------------------