├── .gitignore ├── data └── mnist │ ├── mnistSmall.mat │ └── mnistTiny.mat ├── readme.md ├── DLGM ├── launch_all.sh ├── test_sanity.py ├── utils.py ├── test_dlgm.py ├── color.py ├── test_mnist.py └── dlgm.py ├── va ├── test_sanity.py ├── utils.py ├── test_va.py ├── color.py ├── pegasos.py ├── test_mnist.py └── va.py └── RBM ├── utils.py ├── logistic_sgd.py └── rbm.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | *.pyc 3 | -------------------------------------------------------------------------------- /data/mnist/mnistSmall.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strin/DeepBayes/HEAD/data/mnist/mnistSmall.mat -------------------------------------------------------------------------------- /data/mnist/mnistTiny.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strin/DeepBayes/HEAD/data/mnist/mnistTiny.mat -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | DeepBayes 2 | 3 | * [DLGM - code that reproduces deep latent generative models] (DLGM/) 4 | -------------------------------------------------------------------------------- /DLGM/launch_all.sh: -------------------------------------------------------------------------------- 1 | for num_node in 50 200 2 | do 3 | for kappa in 0 0.1 4 | do 5 | for sigma in 0 0.001 0.01 0.1 6 | do 7 | nohup python test_mnist.py $num_node $kappa $sigma & 8 | done 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /DLGM/test_sanity.py: -------------------------------------------------------------------------------- 1 | from dlgm import * 2 | import numpy as np 3 | import scipy.io as sio 4 | 5 | def test_h1_v3(): 6 | v1 = [1,0,1] 7 | v2 = [0,0,0] 8 | train_data = np.array([v1 for i in range(500)]+[v2 for i in range(500)]) 9 | test_data = np.array([v1 for i in range(50)] + [v2 for i in range(50)]) 10 | print 'training data', train_data 11 | model = DeepLatentGM([3, 4], batchsize=1, rec_hidden=1, kappa=0, stepsize=1) 12 | model.train(train_data, 10, test_data = train_data) 13 | print 'Generative Model', model.gmodel.pack() 14 | print 'Recognition Model', model.rmodel.pack() 15 | print 'Sample', model.sample(test_data) 16 | 17 | test_h1_v3() 18 | 19 | -------------------------------------------------------------------------------- /va/test_sanity.py: -------------------------------------------------------------------------------- 1 | from dlgm import * 2 | import numpy as np 3 | import scipy.io as sio 4 | 5 | def test_h1_v3(): 6 | v1 = [1,0,1] 7 | v2 = [0,0,0] 8 | train_data = np.array([v1 for i in range(500)]+[v2 for i in range(500)]) 9 | test_data = np.array([v1 for i in range(50)] + [v2 for i in range(50)]) 10 | print 'training data', train_data 11 | model = DeepLatentGM([3, 4], batchsize=1, rec_hidden=1, kappa=0, stepsize=1) 12 | model.train(train_data, 10, test_data = train_data) 13 | print 'Generative Model', model.gmodel.pack() 14 | print 'Recognition Model', model.rmodel.pack() 15 | print 'Sample', model.sample(test_data) 16 | 17 | test_h1_v3() 18 | 19 | -------------------------------------------------------------------------------- /va/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 
import numpy.linalg as npla 4 | import numpy.random as npr 5 | 6 | def param_op2(param, grad, op): 7 | for i in range(len(param)): 8 | param[i][:] = op(param[i], param[i])[:] 9 | 10 | def param_op(param, op): 11 | for i in range(len(param)): 12 | param[i][:] = op(param[i])[:] 13 | 14 | def param_add(param, grad): 15 | res = grad 16 | if param != []: 17 | for i in range(len(param)): 18 | res[i] += param[i] 19 | return res 20 | 21 | def param_mul_scalar(param, scalar): 22 | res = param 23 | for i in range(len(param)): 24 | res[i] = param[i] * scalar 25 | return res 26 | 27 | def param_neg(param): 28 | res = param 29 | for i in range(len(param)): 30 | res[i] = -param[i] 31 | return res 32 | 33 | def randn01(*shape): 34 | """ 35 | generate random vector/matrix from i.i.d. Gaussian. 36 | renormalize it to unit vector/matrix. 37 | """ 38 | M = npr.randn(*shape) 39 | return M/npla.norm(M) 40 | -------------------------------------------------------------------------------- /DLGM/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import numpy.linalg as npla 4 | import numpy.random as npr 5 | from theano import config 6 | 7 | def param_op2(param, grad, op): 8 | for i in range(len(param)): 9 | param[i][:] = op(param[i], param[i])[:] 10 | 11 | def param_op(param, op): 12 | for i in range(len(param)): 13 | param[i][:] = op(param[i])[:] 14 | 15 | def param_add(param, grad): 16 | res = grad 17 | if param != []: 18 | for i in range(len(param)): 19 | res[i] += param[i] 20 | return res 21 | 22 | def param_mul_scalar(param, scalar): 23 | res = param 24 | for i in range(len(param)): 25 | res[i] = param[i] * np.asarray(scalar, config.floatX) 26 | return res 27 | 28 | def param_neg(param): 29 | res = param 30 | for i in range(len(param)): 31 | res[i] = np.asarray(np.zeros(param[i].shape), config.floatX)-param[i] 32 | return res 33 | 34 | def randn01(*shape): 35 | """ 36 | generate random vector/matrix from i.i.d. Gaussian. 37 | renormalize it to unit vector/matrix. 
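    the whole array is divided by its norm (Frobenius norm for matrices),
    so the returned array always has unit norm.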
38 | """ 39 | M = npr.randn(*shape) 40 | return M/npla.norm(M) 41 | -------------------------------------------------------------------------------- /va/test_va.py: -------------------------------------------------------------------------------- 1 | import numpy.random as npr 2 | from va import * 3 | import unittest 4 | 5 | class TestDecoder(unittest.TestCase): 6 | 7 | def test_gen(self): 8 | model = Decoder([2, 4]) 9 | xi = npr.randn(4,2) 10 | # print xi 11 | v = np.array([[1, 0], [0, 1]]).T 12 | param = model.pack() 13 | # print v 14 | # print model.sample(xi) 15 | # print model.get_lhood(v, xi) 16 | resp = np.dot(param['W1'], np.dot(param['G'], xi)) + param['b1'] 17 | lhood = (v) * np.log(np.logistic(resp)) + (1-v) * np.log(1-np.logistic(resp)) 18 | # print lhood.sum() 19 | assert(np.abs(lhood.sum() - model.get_lhood(v, xi)) < 1e-4) 20 | 21 | def test_gen_grad(self): 22 | model = Decoder([2, 4]) 23 | xi = npr.randn(4,1) 24 | # print xi 25 | v = np.array([[1, 0]]).T 26 | gradient = model.get_grad(v, xi) 27 | # print gradient 28 | 29 | def test_gen_grad_xi(self): 30 | model = Decoder([2, 4]) 31 | xi = npr.randn(4,1) 32 | v = np.array([[1, 0]]).T 33 | grad_xi = model.get_grad_xi(v, xi) 34 | # print grad_xi 35 | 36 | class TestEncoder(unittest.TestCase): 37 | 38 | def test_reco(self): 39 | model = Encoder([2, 4], sigma=0.1) 40 | v = np.array([[1, 0]]).T 41 | model.sample_eps(v) 42 | 43 | 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /DLGM/test_dlgm.py: -------------------------------------------------------------------------------- 1 | """ 2 | test module for dlgm.py 3 | """ 4 | from dlgm import * 5 | import numpy as np 6 | import numpy.random as npr 7 | import unittest 8 | 9 | class TestGenerativeModel(unittest.TestCase): 10 | 11 | def setUp(me): 12 | pass 13 | 14 | def test_nonlinear(me): 15 | arr = [1, -1, 3, 3, 5, 6] 16 | arch = [1,2] 17 | nn = GenerativeModel(arch) 18 | me.assertEqual(list(nn.nonlinear(arr)), [1,0,3,3,5,6]) 19 | 20 | def test_generate(me): 21 | arch = [1,2] 22 | nn = GenerativeModel(arch) 23 | xi = [npr.randn(i) for i in arch[1:]] 24 | h1 = np.dot(nn.G[1].get_value(), xi[0]) 25 | h0 = np.dot(nn.W[0].get_value(), nn.nonlinear(h1)) + nn.b[0].get_value() 26 | res = nn.generate(*xi) 27 | me.assertEqual(res[0], h0) 28 | assert((res[1] == h1).all()) 29 | 30 | def test_recognition(me): 31 | arch = [5, 10] 32 | nn = RecognitionModel(arch) 33 | v = [1, 0, 0, 1, 1] 34 | z = nn.nonlinear(np.dot(nn.Wv[1].get_value(), v) + nn.bv[1].get_value()) 35 | me.assertEqual(list(nn.get_z(v)[0]), list(z)) 36 | mu = np.dot(nn.Wmu[1].get_value(), z) + nn.bmu[1].get_value() 37 | me.assertEqual(list(nn.get_mu(v)[0]), list(mu)) 38 | d = np.exp(np.dot(nn.Wd[1].get_value(), z) + nn.bd[1].get_value()) 39 | me.assertEqual(list(nn.get_d(v)[0]), list(d)) 40 | u = np.dot(nn.Wu[1].get_value(), z) + nn.bu[1].get_value() 41 | me.assertEqual(list(nn.get_u(v)[0]), list(u)) 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /DLGM/color.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class pcolors: 4 | HEADER = '\033[95m' 5 | OKBLUE = '\033[94m' 6 | OKGREEN = '\033[92m' 7 | WARNING = '\033[93m' 8 | FAIL = '\033[91m' 9 | GRAY = '\033[1;30m' 10 | ENDC = '\033[0m' 11 | Red = '\033[91m' 12 | Green = '\033[92m' 13 | Blue = '\033[94m' 14 | Cyan = '\033[96m' 15 
| White = '\033[97m' 16 | Yellow = '\033[93m' 17 | Magenta = '\033[95m' 18 | Grey = '\033[90m' 19 | Black = '\033[90m' 20 | 21 | def printRed(*args): 22 | beginRed() 23 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 24 | end() 25 | 26 | def printBlue(*args): 27 | beginBlue() 28 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 29 | end() 30 | 31 | def printComment(*args): 32 | beginComment() 33 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 34 | end() 35 | 36 | def beginTitle(): 37 | sys.stdout.write(pcolors.OKGREEN) 38 | sys.stdout.flush() 39 | 40 | def beginComment(): 41 | sys.stdout.write(pcolors.GRAY) 42 | sys.stdout.flush() 43 | 44 | def beginError(): 45 | sys.stdout.write(pcolors.FAIL) 46 | sys.stdout.flush() 47 | 48 | def beginRed(): 49 | sys.stdout.write(pcolors.Red) 50 | sys.stdout.flush() 51 | 52 | def beginBlue() : 53 | sys.stdout.write(pcolors.OKBLUE) 54 | sys.stdout.flush() 55 | 56 | def end() : 57 | sys.stdout.write(pcolors.ENDC) 58 | sys.stdout.flush() 59 | 60 | 61 | if __name__ == "__main__": 62 | printComment("hi", 1) 63 | -------------------------------------------------------------------------------- /va/color.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class pcolors: 4 | HEADER = '\033[95m' 5 | OKBLUE = '\033[94m' 6 | OKGREEN = '\033[92m' 7 | WARNING = '\033[93m' 8 | FAIL = '\033[91m' 9 | GRAY = '\033[1;30m' 10 | ENDC = '\033[0m' 11 | Red = '\033[91m' 12 | Green = '\033[92m' 13 | Blue = '\033[94m' 14 | Cyan = '\033[96m' 15 | White = '\033[97m' 16 | Yellow = '\033[93m' 17 | Magenta = '\033[95m' 18 | Grey = '\033[90m' 19 | Black = '\033[90m' 20 | 21 | def printRed(*args): 22 | beginRed() 23 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 24 | end() 25 | 26 | def printBlue(*args): 27 | beginBlue() 28 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 29 | end() 30 | 31 | def printComment(*args): 32 | beginComment() 33 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 34 | end() 35 | 36 | def beginTitle(): 37 | sys.stdout.write(pcolors.OKGREEN) 38 | sys.stdout.flush() 39 | 40 | def beginComment(): 41 | sys.stdout.write(pcolors.GRAY) 42 | sys.stdout.flush() 43 | 44 | def beginError(): 45 | sys.stdout.write(pcolors.FAIL) 46 | sys.stdout.flush() 47 | 48 | def beginRed(): 49 | sys.stdout.write(pcolors.Red) 50 | sys.stdout.flush() 51 | 52 | def beginBlue() : 53 | sys.stdout.write(pcolors.OKBLUE) 54 | sys.stdout.flush() 55 | 56 | def end() : 57 | sys.stdout.write(pcolors.ENDC) 58 | sys.stdout.flush() 59 | 60 | 61 | if __name__ == "__main__": 62 | printComment("hi", 1) 63 | -------------------------------------------------------------------------------- /va/pegasos.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import pickle, gzip 3 | import numpy.random as npr 4 | import numpy as np 5 | import pdb 6 | import matplotlib.pyplot as plt 7 | 8 | toFloat = np.vectorize(float) 9 | 10 | mat = sio.loadmat('../data/mnist/mnist.mat') 11 | result = sio.loadmat('result.mat') 12 | # train_data = np.array(mat['trainData']) 13 | train_data = result['xi_train'].T 14 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 15 | # test_data = np.array(mat['testData']) 16 | test_data = result['xi'].T 17 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 18 | 19 | batchsize = 32 20 | num_iter = 10000 21 | D = train_data.shape[1] 22 | eta = 0.01 23 | 24 | W = npr.randn(D, 10) 25 | G2 = 
np.zeros_like(W) 26 | data_mean = np.mean(train_data, axis=0) 27 | train_data -= data_mean 28 | 29 | 30 | def test_acc(): 31 | resp = np.dot(test_data - data_mean, W) 32 | predict = np.argmax(resp, 1) 33 | return np.sum(predict == test_label) / float(len(resp)) 34 | 35 | acc = [] 36 | for it in range(num_iter): 37 | ind = npr.choice(range(len(train_data)), batchsize, replace=False) 38 | g = np.zeros_like(W) 39 | for (x, y) in zip(train_data[ind], train_label[ind]): 40 | resp = 100 + np.dot(x, W) - np.dot(x, W[:,y]) 41 | resp[y] = 0 42 | yp = np.argmax(resp) 43 | g[:,yp] -= x 44 | g[:,y] += x 45 | g /= float(batchsize) 46 | G2 += g * g 47 | W += eta * g / (1e-4 + np.sqrt(G2)) 48 | acc += [test_acc()] 49 | print 'iter = ', it, ' , acc = ', acc[-1] 50 | 51 | # sio.savemat('result.mat', {'W':W, 'acc':acc}) 52 | plt.plot(acc) 53 | plt.xlabel('iteration') 54 | plt.ylabel('accuracy') 55 | plt.savefig('mnist.png') 56 | plt.show() 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /va/test_mnist.py: -------------------------------------------------------------------------------- 1 | from va import * 2 | import numpy as np 3 | import sys, os 4 | import scipy.io as sio 5 | from multiprocessing import Pool 6 | import itertools 7 | import pickle, gzip 8 | 9 | toFloat = np.vectorize(float) 10 | 11 | def universal_worker(input_pair): 12 | function, args = input_pair 13 | return function(*args) 14 | 15 | def pool_args(function, *args): 16 | return zip(itertools.repeat(function), zip(*args)) 17 | 18 | def run(hidden, kappa, sigma, stepsize): 19 | mat = sio.loadmat('../data/mnist/mnistSmall.mat') 20 | train_data = np.array(mat['trainData']) 21 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 22 | test_data = np.array(mat['testData']) 23 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 24 | 25 | output_path = '../result/hidden_%d_kappa_%f_sigma_%f' % (hidden, kappa, sigma) 26 | os.system('mkdir -p ../result/%s' % output_path) 27 | model = DeepLatentGM([784, hidden, hidden], batchsize=128, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 28 | num_label=10) 29 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label, output_path=output_path) 30 | 31 | def run_full(hidden = 50, kappa = 0, sigma = 0, c = 0, stepsize = 0.01): 32 | mat = pickle.load(gzip.open('../data/mnist/mnist.pkl.gz', 'rb')) 33 | train_data = np.array(list(mat[0][0]) + list(mat[1][0])) 34 | train_label = np.array(list(mat[0][1]) + list(mat[1][1])) 35 | test_data = mat[2][0] 36 | test_label = mat[2][1] 37 | 38 | model = AutoEncoder([784, hidden], num_sample=1, batchsize=512, kappa=kappa, sigma=sigma, stepsize=stepsize,\ 39 | num_label=10, c = c, ell=10) 40 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 41 | 42 | def run_tiny(hidden = 50, kappa = 0, sigma = 0, c = 0, stepsize = 0.01): 43 | mat = sio.loadmat('../data/mnist/mnistTiny.mat') 44 | train_data = np.array(toFloat(mat['trainData'])) # binarize. 45 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 46 | test_data = np.array(toFloat(mat['testData'])) # binarize. 
47 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 48 | 49 | model = AutoEncoder([784, hidden], num_sample=1, batchsize=32, kappa=kappa, sigma=sigma, stepsize=stepsize,\ 50 | num_label=10, c = c, ell=10) 51 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 52 | 53 | # run_tiny() 54 | run_full() 55 | 56 | -------------------------------------------------------------------------------- /DLGM/test_mnist.py: -------------------------------------------------------------------------------- 1 | from dlgm import * 2 | import numpy as np 3 | import sys, os 4 | import scipy.io as sio 5 | from multiprocessing import Pool 6 | import itertools 7 | import pickle, gzip 8 | 9 | toFloat = np.vectorize(float) 10 | 11 | def universal_worker(input_pair): 12 | function, args = input_pair 13 | return function(*args) 14 | 15 | def pool_args(function, *args): 16 | return zip(itertools.repeat(function), zip(*args)) 17 | 18 | def run(hidden, kappa, sigma, stepsize): 19 | mat = sio.loadmat('../data/mnist/mnistSmall.mat') 20 | train_data = np.array(mat['trainData']) 21 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 22 | test_data = np.array(mat['testData']) 23 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 24 | 25 | output_path = '../result/hidden_%d_kappa_%f_sigma_%f' % (hidden, kappa, sigma) 26 | os.system('mkdir -p ../result/%s' % output_path) 27 | model = DeepLatentGM([784, hidden, hidden], batchsize=128, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 28 | num_label=10) 29 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label, output_path=output_path) 30 | 31 | def run_full(): 32 | mat = pickle.load(gzip.open('../data/mnist/mnist.pkl.gz', 'rb')) 33 | train_data = np.array(list(mat[0][0]) + list(mat[1][0])) 34 | train_label = np.array(list(mat[0][1]) + list(mat[1][1])) 35 | test_data = mat[2][0] 36 | test_label = mat[2][1] 37 | hidden = 100 38 | sigma = 0.001 39 | kappa = 0.1 40 | stepsize = 0.01 41 | 42 | model = DeepLatentGM([784, hidden, hidden], batchsize=64, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 43 | num_label=10) 44 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 45 | 46 | def run_tiny(): 47 | mat = sio.loadmat('../data/mnist/mnistTiny.mat') 48 | train_data = np.array(toFloat(mat['trainData'] > 0.5)) # binarize. 49 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 50 | test_data = np.array(toFloat(mat['testData'] > 0.5)) # binarize. 51 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 52 | 53 | hidden = 100 54 | sigma = 0.001 55 | kappa = 0.1 56 | stepsize = 0.01 57 | 58 | model = DeepLatentGM([784, hidden, hidden], batchsize=64, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 59 | num_label=10) 60 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 61 | 62 | # run_full() 63 | run_tiny() 64 | 65 | -------------------------------------------------------------------------------- /RBM/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 
7 | """ 8 | 9 | 10 | import numpy 11 | 12 | 13 | def scale_to_unit_interval(ndar, eps=1e-8): 14 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 15 | ndar = ndar.copy() 16 | ndar -= ndar.min() 17 | ndar *= 1.0 / (ndar.max() + eps) 18 | return ndar 19 | 20 | 21 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 22 | scale_rows_to_unit_interval=True, 23 | output_pixel_vals=True): 24 | """ 25 | Transform an array with one flattened image per row, into an array in 26 | which images are reshaped and layed out like tiles on a floor. 27 | 28 | This function is useful for visualizing datasets whose rows are images, 29 | and also columns of matrices for transforming those rows 30 | (such as the first layer of a neural net). 31 | 32 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 33 | be 2-D ndarrays or None; 34 | :param X: a 2-D array in which every row is a flattened image. 35 | 36 | :type img_shape: tuple; (height, width) 37 | :param img_shape: the original shape of each image 38 | 39 | :type tile_shape: tuple; (rows, cols) 40 | :param tile_shape: the number of images to tile (rows, cols) 41 | 42 | :param output_pixel_vals: if output should be pixel values (i.e. int8 43 | values) or floats 44 | 45 | :param scale_rows_to_unit_interval: if the values need to be scaled before 46 | being plotted to [0,1] or not 47 | 48 | 49 | :returns: array suitable for viewing as an image. 50 | (See:`Image.fromarray`.) 51 | :rtype: a 2-d array with same dtype as X. 52 | 53 | """ 54 | 55 | assert len(img_shape) == 2 56 | assert len(tile_shape) == 2 57 | assert len(tile_spacing) == 2 58 | 59 | # The expression below can be re-written in a more C style as 60 | # follows : 61 | # 62 | # out_shape = [0,0] 63 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 64 | # tile_spacing[0] 65 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 66 | # tile_spacing[1] 67 | out_shape = [ 68 | (ishp + tsp) * tshp - tsp 69 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing) 70 | ] 71 | 72 | if isinstance(X, tuple): 73 | assert len(X) == 4 74 | # Create an output numpy ndarray to store the image 75 | if output_pixel_vals: 76 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 77 | dtype='uint8') 78 | else: 79 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 80 | dtype=X.dtype) 81 | 82 | #colors default to 0, alpha defaults to 1 (opaque) 83 | if output_pixel_vals: 84 | channel_defaults = [0, 0, 0, 255] 85 | else: 86 | channel_defaults = [0., 0., 0., 1.] 
87 | 88 | for i in xrange(4): 89 | if X[i] is None: 90 | # if channel is None, fill it with zeros of the correct 91 | # dtype 92 | dt = out_array.dtype 93 | if output_pixel_vals: 94 | dt = 'uint8' 95 | out_array[:, :, i] = numpy.zeros( 96 | out_shape, 97 | dtype=dt 98 | ) + channel_defaults[i] 99 | else: 100 | # use a recurrent call to compute the channel and store it 101 | # in the output 102 | out_array[:, :, i] = tile_raster_images( 103 | X[i], img_shape, tile_shape, tile_spacing, 104 | scale_rows_to_unit_interval, output_pixel_vals) 105 | return out_array 106 | 107 | else: 108 | # if we are dealing with only one channel 109 | H, W = img_shape 110 | Hs, Ws = tile_spacing 111 | 112 | # generate a matrix to store the output 113 | dt = X.dtype 114 | if output_pixel_vals: 115 | dt = 'uint8' 116 | out_array = numpy.zeros(out_shape, dtype=dt) 117 | 118 | for tile_row in xrange(tile_shape[0]): 119 | for tile_col in xrange(tile_shape[1]): 120 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 121 | this_x = X[tile_row * tile_shape[1] + tile_col] 122 | if scale_rows_to_unit_interval: 123 | # if we should scale values to be between 0 and 1 124 | # do this by calling the `scale_to_unit_interval` 125 | # function 126 | this_img = scale_to_unit_interval( 127 | this_x.reshape(img_shape)) 128 | else: 129 | this_img = this_x.reshape(img_shape) 130 | # add the slice to the corresponding position in the 131 | # output array 132 | c = 1 133 | if output_pixel_vals: 134 | c = 255 135 | out_array[ 136 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 137 | tile_col * (W + Ws): tile_col * (W + Ws) + W 138 | ] = this_img * c 139 | return out_array 140 | -------------------------------------------------------------------------------- /RBM/logistic_sgd.py: -------------------------------------------------------------------------------- 1 | """ 2 | This tutorial introduces logistic regression using Theano and stochastic 3 | gradient descent. 4 | 5 | Logistic regression is a probabilistic, linear classifier. It is parametrized 6 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is 7 | done by projecting data points onto a set of hyperplanes, the distance to 8 | which is used to determine a class membership probability. 9 | 10 | Mathematically, this can be written as: 11 | 12 | .. math:: 13 | P(Y=i|x, W,b) &= softmax_i(W x + b) \\ 14 | &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} 15 | 16 | 17 | The output of the model or prediction is then done by taking the argmax of 18 | the vector whose i'th element is P(Y=i|x). 19 | 20 | .. math:: 21 | 22 | y_{pred} = argmax_i P(Y=i|x,W,b) 23 | 24 | 25 | This tutorial presents a stochastic gradient descent optimization method 26 | suitable for large datasets. 27 | 28 | 29 | References: 30 | 31 | - textbooks: "Pattern Recognition and Machine Learning" - 32 | Christopher M. Bishop, section 4.3.2 33 | 34 | """ 35 | __docformat__ = 'restructedtext en' 36 | 37 | import cPickle 38 | import gzip 39 | import os 40 | import sys 41 | import time 42 | 43 | import numpy 44 | 45 | import theano 46 | import theano.tensor as T 47 | 48 | 49 | class LogisticRegression(object): 50 | """Multi-class Logistic Regression Class 51 | 52 | The logistic regression is fully described by a weight matrix :math:`W` 53 | and bias vector :math:`b`. Classification is done by projecting data 54 | points onto a set of hyperplanes, the distance to which is used to 55 | determine a class membership probability. 
56 | """ 57 | 58 | def __init__(self, input, n_in, n_out): 59 | """ Initialize the parameters of the logistic regression 60 | 61 | :type input: theano.tensor.TensorType 62 | :param input: symbolic variable that describes the input of the 63 | architecture (one minibatch) 64 | 65 | :type n_in: int 66 | :param n_in: number of input units, the dimension of the space in 67 | which the datapoints lie 68 | 69 | :type n_out: int 70 | :param n_out: number of output units, the dimension of the space in 71 | which the labels lie 72 | 73 | """ 74 | # start-snippet-1 75 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 76 | self.W = theano.shared( 77 | value=numpy.zeros( 78 | (n_in, n_out), 79 | dtype=theano.config.floatX 80 | ), 81 | name='W', 82 | borrow=True 83 | ) 84 | # initialize the baises b as a vector of n_out 0s 85 | self.b = theano.shared( 86 | value=numpy.zeros( 87 | (n_out,), 88 | dtype=theano.config.floatX 89 | ), 90 | name='b', 91 | borrow=True 92 | ) 93 | 94 | # symbolic expression for computing the matrix of class-membership 95 | # probabilities 96 | # Where: 97 | # W is a matrix where column-k represent the separation hyper plain for 98 | # class-k 99 | # x is a matrix where row-j represents input training sample-j 100 | # b is a vector where element-k represent the free parameter of hyper 101 | # plain-k 102 | self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) 103 | 104 | # symbolic description of how to compute prediction as class whose 105 | # probability is maximal 106 | self.y_pred = T.argmax(self.p_y_given_x, axis=1) 107 | # end-snippet-1 108 | 109 | # parameters of the model 110 | self.params = [self.W, self.b] 111 | 112 | def negative_log_likelihood(self, y): 113 | """Return the mean of the negative log-likelihood of the prediction 114 | of this model under a given target distribution. 115 | 116 | .. math:: 117 | 118 | \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = 119 | \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} 120 | \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ 121 | \ell (\theta=\{W,b\}, \mathcal{D}) 122 | 123 | :type y: theano.tensor.TensorType 124 | :param y: corresponds to a vector that gives for each example the 125 | correct label 126 | 127 | Note: we use the mean instead of the sum so that 128 | the learning rate is less dependent on the batch size 129 | """ 130 | # start-snippet-2 131 | # y.shape[0] is (symbolically) the number of rows in y, i.e., 132 | # number of examples (call it n) in the minibatch 133 | # T.arange(y.shape[0]) is a symbolic vector which will contain 134 | # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of 135 | # Log-Probabilities (call it LP) with one row per example and 136 | # one column per class LP[T.arange(y.shape[0]),y] is a vector 137 | # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., 138 | # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is 139 | # the mean (across minibatch examples) of the elements in v, 140 | # i.e., the mean log-likelihood across the minibatch. 
141 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 142 | # end-snippet-2 143 | 144 | def errors(self, y): 145 | """Return a float representing the number of errors in the minibatch 146 | over the total number of examples of the minibatch ; zero one 147 | loss over the size of the minibatch 148 | 149 | :type y: theano.tensor.TensorType 150 | :param y: corresponds to a vector that gives for each example the 151 | correct label 152 | """ 153 | 154 | # check if y has same dimension of y_pred 155 | if y.ndim != self.y_pred.ndim: 156 | raise TypeError( 157 | 'y should have the same shape as self.y_pred', 158 | ('y', y.type, 'y_pred', self.y_pred.type) 159 | ) 160 | # check if y is of the correct datatype 161 | if y.dtype.startswith('int'): 162 | # the T.neq operator returns a vector of 0s and 1s, where 1 163 | # represents a mistake in prediction 164 | return T.mean(T.neq(self.y_pred, y)) 165 | else: 166 | raise NotImplementedError() 167 | 168 | 169 | def load_data(dataset, size=-1): 170 | ''' Loads the dataset 171 | 172 | :type dataset: string 173 | :param dataset: the path to the dataset (here MNIST) 174 | ''' 175 | 176 | ############# 177 | # LOAD DATA # 178 | ############# 179 | 180 | # Download the MNIST dataset if it is not present 181 | data_dir, data_file = os.path.split(dataset) 182 | if data_dir == "" and not os.path.isfile(dataset): 183 | # Check if dataset is in the data directory. 184 | new_path = os.path.join( 185 | os.path.split(__file__)[0], 186 | "..", 187 | "data", 188 | dataset 189 | ) 190 | if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': 191 | dataset = new_path 192 | 193 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 194 | import urllib 195 | origin = ( 196 | 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 197 | ) 198 | print 'Downloading data from %s' % origin 199 | urllib.urlretrieve(origin, dataset) 200 | 201 | print '... loading data' 202 | 203 | # Load the dataset 204 | f = gzip.open(dataset, 'rb') 205 | train_set, valid_set, test_set = cPickle.load(f) 206 | def truncate(dt, size): 207 | return (dt[0][:size], dt[1][:size]) 208 | train_set = truncate(train_set, size) 209 | valid_set = truncate(valid_set, size) 210 | test_set = truncate(test_set, size) 211 | 212 | f.close() 213 | #train_set, valid_set, test_set format: tuple(input, target) 214 | #input is an numpy.ndarray of 2 dimensions (a matrix) 215 | #witch row's correspond to an example. target is a 216 | #numpy.ndarray of 1 dimensions (vector)) that have the same length as 217 | #the number of rows in the input. It should give the target 218 | #target to the example with the same index in the input. 219 | 220 | def shared_dataset(data_xy, borrow=True): 221 | """ Function that loads the dataset into shared variables 222 | 223 | The reason we store our dataset in shared variables is to allow 224 | Theano to copy it into the GPU memory (when code is run on GPU). 225 | Since copying data into the GPU is slow, copying a minibatch everytime 226 | is needed (the default behaviour if the data is not in a shared 227 | variable) would lead to a large decrease in performance. 
228 | """ 229 | data_x, data_y = data_xy 230 | shared_x = theano.shared(numpy.asarray(data_x, 231 | dtype=theano.config.floatX), 232 | borrow=borrow) 233 | shared_y = theano.shared(numpy.asarray(data_y, 234 | dtype=theano.config.floatX), 235 | borrow=borrow) 236 | # When storing data on the GPU it has to be stored as floats 237 | # therefore we will store the labels as ``floatX`` as well 238 | # (``shared_y`` does exactly that). But during our computations 239 | # we need them as ints (we use labels as index, and if they are 240 | # floats it doesn't make sense) therefore instead of returning 241 | # ``shared_y`` we will have to cast it to int. This little hack 242 | # lets ous get around this issue 243 | return shared_x, shared_y 244 | 245 | test_set_x, test_set_y = shared_dataset(test_set) 246 | valid_set_x, valid_set_y = shared_dataset(valid_set) 247 | train_set_x, train_set_y = shared_dataset(train_set) 248 | 249 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 250 | (test_set_x, test_set_y)] 251 | return rval 252 | 253 | 254 | def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, 255 | dataset='mnist.pkl.gz', 256 | batch_size=600): 257 | """ 258 | Demonstrate stochastic gradient descent optimization of a log-linear 259 | model 260 | 261 | This is demonstrated on MNIST. 262 | 263 | :type learning_rate: float 264 | :param learning_rate: learning rate used (factor for the stochastic 265 | gradient) 266 | 267 | :type n_epochs: int 268 | :param n_epochs: maximal number of epochs to run the optimizer 269 | 270 | :type dataset: string 271 | :param dataset: the path of the MNIST dataset file from 272 | http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz 273 | 274 | """ 275 | datasets = load_data(dataset) 276 | 277 | train_set_x, train_set_y = datasets[0] 278 | valid_set_x, valid_set_y = datasets[1] 279 | test_set_x, test_set_y = datasets[2] 280 | 281 | 282 | # compute number of minibatches for training, validation and testing 283 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size 284 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size 285 | n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size 286 | 287 | ###################### 288 | # BUILD ACTUAL MODEL # 289 | ###################### 290 | print '... 
building the model' 291 | 292 | # allocate symbolic variables for the data 293 | index = T.lscalar() # index to a [mini]batch 294 | 295 | # generate symbolic variables for input (x and y represent a 296 | # minibatch) 297 | x = T.matrix('x') # data, presented as rasterized images 298 | y = T.ivector('y') # labels, presented as 1D vector of [int] labels 299 | 300 | # construct the logistic regression class 301 | # Each MNIST image has size 28*28 302 | classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) 303 | 304 | # the cost we minimize during training is the negative log likelihood of 305 | # the model in symbolic format 306 | cost = classifier.negative_log_likelihood(y) 307 | 308 | # compiling a Theano function that computes the mistakes that are made by 309 | # the model on a minibatch 310 | test_model = theano.function( 311 | inputs=[index], 312 | outputs=classifier.errors(y), 313 | givens={ 314 | x: test_set_x[index * batch_size: (index + 1) * batch_size], 315 | y: test_set_y[index * batch_size: (index + 1) * batch_size] 316 | } 317 | ) 318 | 319 | validate_model = theano.function( 320 | inputs=[index], 321 | outputs=classifier.errors(y), 322 | givens={ 323 | x: valid_set_x[index * batch_size: (index + 1) * batch_size], 324 | y: valid_set_y[index * batch_size: (index + 1) * batch_size] 325 | } 326 | ) 327 | 328 | # compute the gradient of cost with respect to theta = (W,b) 329 | g_W = T.grad(cost=cost, wrt=classifier.W) 330 | g_b = T.grad(cost=cost, wrt=classifier.b) 331 | 332 | # start-snippet-3 333 | # specify how to update the parameters of the model as a list of 334 | # (variable, update expression) pairs. 335 | updates = [(classifier.W, classifier.W - learning_rate * g_W), 336 | (classifier.b, classifier.b - learning_rate * g_b)] 337 | 338 | # compiling a Theano function `train_model` that returns the cost, but in 339 | # the same time updates the parameter of the model based on the rules 340 | # defined in `updates` 341 | train_model = theano.function( 342 | inputs=[index], 343 | outputs=cost, 344 | updates=updates, 345 | givens={ 346 | x: train_set_x[index * batch_size: (index + 1) * batch_size], 347 | y: train_set_y[index * batch_size: (index + 1) * batch_size] 348 | } 349 | ) 350 | # end-snippet-3 351 | 352 | ############### 353 | # TRAIN MODEL # 354 | ############### 355 | print '... training the model' 356 | # early-stopping parameters 357 | patience = 5000 # look as this many examples regardless 358 | patience_increase = 2 # wait this much longer when a new best is 359 | # found 360 | improvement_threshold = 0.995 # a relative improvement of this much is 361 | # considered significant 362 | validation_frequency = min(n_train_batches, patience / 2) 363 | # go through this many 364 | # minibatche before checking the network 365 | # on the validation set; in this case we 366 | # check every epoch 367 | 368 | best_validation_loss = numpy.inf 369 | test_score = 0. 
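    # The loop below implements the early stopping set up above: after every
    # `validation_frequency` minibatches the zero-one loss on the validation
    # set is computed; when it beats the best loss so far by more than the
    # relative `improvement_threshold`, `patience` is extended
    # (iter * patience_increase), and training stops once the iteration count
    # reaches `patience` or after `n_epochs` passes.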
370 | start_time = time.clock() 371 | 372 | done_looping = False 373 | epoch = 0 374 | while (epoch < n_epochs) and (not done_looping): 375 | epoch = epoch + 1 376 | for minibatch_index in xrange(n_train_batches): 377 | 378 | minibatch_avg_cost = train_model(minibatch_index) 379 | # iteration number 380 | iter = (epoch - 1) * n_train_batches + minibatch_index 381 | 382 | if (iter + 1) % validation_frequency == 0: 383 | # compute zero-one loss on validation set 384 | validation_losses = [validate_model(i) 385 | for i in xrange(n_valid_batches)] 386 | this_validation_loss = numpy.mean(validation_losses) 387 | 388 | print( 389 | 'epoch %i, minibatch %i/%i, validation error %f %%' % 390 | ( 391 | epoch, 392 | minibatch_index + 1, 393 | n_train_batches, 394 | this_validation_loss * 100. 395 | ) 396 | ) 397 | 398 | # if we got the best validation score until now 399 | if this_validation_loss < best_validation_loss: 400 | #improve patience if loss improvement is good enough 401 | if this_validation_loss < best_validation_loss * \ 402 | improvement_threshold: 403 | patience = max(patience, iter * patience_increase) 404 | 405 | best_validation_loss = this_validation_loss 406 | # test it on the test set 407 | 408 | test_losses = [test_model(i) 409 | for i in xrange(n_test_batches)] 410 | test_score = numpy.mean(test_losses) 411 | 412 | print( 413 | ( 414 | ' epoch %i, minibatch %i/%i, test error of' 415 | ' best model %f %%' 416 | ) % 417 | ( 418 | epoch, 419 | minibatch_index + 1, 420 | n_train_batches, 421 | test_score * 100. 422 | ) 423 | ) 424 | 425 | if patience <= iter: 426 | done_looping = True 427 | break 428 | 429 | end_time = time.clock() 430 | print( 431 | ( 432 | 'Optimization complete with best validation score of %f %%,' 433 | 'with test performance %f %%' 434 | ) 435 | % (best_validation_loss * 100., test_score * 100.) 436 | ) 437 | print 'The code run for %d epochs, with %f epochs/sec' % ( 438 | epoch, 1. * epoch / (end_time - start_time)) 439 | print >> sys.stderr, ('The code for file ' + 440 | os.path.split(__file__)[1] + 441 | ' ran for %.1fs' % ((end_time - start_time))) 442 | 443 | if __name__ == '__main__': 444 | sgd_optimization_mnist() 445 | -------------------------------------------------------------------------------- /va/va.py: -------------------------------------------------------------------------------- 1 | """ 2 | implements the models in paper "stochastic backpropagation in DLGMs" 3 | including 4 | * generative model. 5 | * recognition model. 6 | """ 7 | "let client and server have the same imports." 8 | imports = ['import numpy as np', 9 | 'import numpy.random as npr', 10 | 'import theano', 11 | 'import sys, os', 12 | 'import scipy.io as sio', 13 | 'import theano.sandbox.linalg as ta', 14 | 'from theano.tensor.shared_randomstreams import RandomStreams as trng', 15 | 'import theano.tensor as ts', 16 | 'from color import *', 17 | 'from utils import *'] 18 | for _import in imports: 19 | exec _import 20 | 21 | import pdb 22 | import time 23 | from IPython.parallel import Client 24 | 25 | theano.config.exception_verbosity = 'low' 26 | 27 | ts.logistic = lambda z: 1 / (1 + ts.exp(-z)) 28 | np.logistic = lambda z: 1 / (1 + np.exp(-z)) 29 | toInt = np.vectorize(int) 30 | toStr = np.vectorize(str) 31 | 32 | get_value = lambda x: x.get_value() if x != None else None 33 | get_value_all = lambda xs: [get_value(x) for x in xs] 34 | 35 | nonlinear_f = lambda x : ts.log(1+ts.exp(x)) # smooth ReLU. 
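# log(1 + exp(x)) is the softplus, a smooth approximation of the ReLU whose
# derivative is the logistic function defined above; the 'nonlinear'
# environment variable is checked below for a ReLU or tanh variant.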
36 | nonlinear_s = "smooth ReLU" 37 | if os.environ.has_key('nonlinear'): 38 | nonlinear_s = os.environ['nonlinear'] 39 | if nonlinear_s == "ReLU": 40 | f = lambda x : ts.maximum(0, x) # ReLU. 41 | if nonlinear_s == "tanh": 42 | f = lambda x : ts.tanh(x) 43 | 44 | def AdaGRAD(param, grad, G2, stepsize): 45 | """ 46 | adaptive sub-gradient algorithm for tensor-shared objects. 47 | > input: 48 | param: parameters, tensor-shared objects. 49 | grad: gradient, list of numpy arrays. 50 | G2: variance of gradient, list of numpy arrays 51 | """ 52 | for (p, g, g2) in zip(param, grad, G2): 53 | g2[:] = (g2 + g * g)[:] 54 | if type(p) == theano.tensor.sharedvar.TensorSharedVariable: 55 | p.set_value(p.get_value() - stepsize * g / (1e-8 + np.sqrt(g2))) 56 | elif type(p) == np.ndarray: 57 | p[:] = p[:] - stepsize * g[:] / (1e-8 + np.sqrt(g2[:])) 58 | 59 | class Decoder: 60 | """ generative model 61 | """ 62 | def __init__(me, arch, kappa=0.1): 63 | """ 64 | create the variational decoder. 65 | arch: architecture, [vis, hidden] 66 | """ 67 | "set options." 68 | me.f = ts.maximum # nonlinear transformation. 69 | 70 | "set properties." 71 | me.arch = arch 72 | me.num_layers = len(arch) 73 | me.kappa = kappa 74 | assert(me.num_layers > 1) 75 | 76 | "init layers." 77 | me.G = theano.shared(np.eye(arch[1]), name="G") 78 | me.xi = ts.matrix("xi") 79 | me.h = ts.dot(me.G, me.xi) 80 | 81 | def one_layer_logistic(h): 82 | """ 83 | one layer network for decoding 84 | """ 85 | W1 = theano.shared(randn01(arch[0], arch[1]), name="W1") 86 | b1 = theano.shared(np.zeros((arch[0], 1)), name="b1", broadcastable=(False,True)) 87 | resp = ts.dot(W1, h) + b1 88 | return ( resp, 89 | [W1, b1], 90 | lambda v : (v * ts.log(ts.logistic(resp)) + (1-v) * ts.log(1-ts.logistic(resp))).sum() 91 | ) 92 | 93 | def two_layer_logistic(h): 94 | """ 95 | two-layer network for decoding 96 | """ 97 | hidden = 100 98 | W1 = theano.shared(randn01(hidden, arch[1]), name="W1") 99 | b1 = theano.shared(np.zeros((hidden, 1)), name="b1", broadcastable=(False,True)) 100 | W2 = theano.shared(randn01(arch[0], hidden), name="W2") 101 | b2 = theano.shared(np.zeros((arch[0], 1)), name="b2", broadcastable=(False,True)) 102 | u = nonlinear_f(ts.dot(W1, h) + b1) 103 | resp = ts.dot(W2, u) + b2 104 | return ( resp, 105 | [W1, b1, W2, b2], 106 | lambda v : (v * ts.log(ts.logistic(resp)) + (1-v) * ts.log(1-ts.logistic(resp))).sum(), 107 | ) 108 | 109 | (me.resp, me.param, me.lhoodFunc) = one_layer_logistic(me.h) 110 | #(me.resp, me.param, me.lhoodFunc) = two_layer_logistic(me.h) 111 | 112 | me.param += [me.G] 113 | me.G2 = [np.zeros(x.get_value().shape) for x in me.param] # variance of gradient. 114 | 115 | "define objective." 116 | me.v = ts.matrix("v") 117 | me.lhood = me.lhoodFunc(me.v) 118 | me.get_lhood = theano.function([me.v, me.xi], me.lhood) 119 | me.reg = me.kappa * sum([ts.sum(p * p) for p in me.param]) 120 | me.get_reg = theano.function([], me.reg) 121 | 122 | "define gradient." 123 | me.gradient = ts.grad(me.lhood, me.param) 124 | me.gradient_xi = ts.grad(me.lhood, me.xi) 125 | # me.hessian_xi = ts.hessian(me.lhood, me.xi[1:]) 126 | me.get_grad = theano.function([me.v, me.xi], me.gradient) 127 | me.get_grad_xi = theano.function([me.v, me.xi], me.gradient_xi) 128 | # me.get_hess_xi = theano.function([me.v] + me.xi[1:], me.hessian_xi) 129 | me.gradient_reg = ts.grad(me.reg, me.param) 130 | me.get_grad_reg = theano.function([], me.gradient_reg) 131 | 132 | "define utils." 
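        # me.generate maps a latent batch xi (arch[1] x N) to the pre-sigmoid
        # response of the visible units; activate() pushes it through the
        # logistic, sample() draws Bernoulli pixels from that activation, and
        # reconstruct() thresholds it at 0.5.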
133 | me.generate = theano.function([me.xi], me.resp) 134 | 135 | def sample(me, xi): 136 | resp = me.activate(xi) 137 | return toInt(npr.rand(*resp.shape) < resp) 138 | 139 | def reconstruct(me, xi): 140 | resp = me.activate(xi) 141 | return toInt(np.ones(resp.shape) * 0.5 < resp) 142 | 143 | def activate(me, xi): 144 | resp = me.generate(xi) 145 | return np.logistic(resp) 146 | 147 | 148 | def pack(me): 149 | param = dict() 150 | for p in me.param: 151 | param.update({str(p): p.get_value()}) 152 | return param 153 | 154 | class Encoder: 155 | """ recognition model (encoder) 156 | since xi \sim \Normal(\mu, C) for each layer. 157 | the recognition fits its parameters (\mu, C) discriminatively. 158 | 159 | a simple recognition model uses a two layer NN to fit each parameter. 160 | see DLGM appendix A. 161 | """ 162 | def __init__(me, arch, sigma=1): 163 | """ 164 | create the deep latent Gaussian recognition model. 165 | arch: architecture, [vis, hidden_1, hidden_2, ...] 166 | """ 167 | "set options." 168 | me.f = ts.maximum # nonlinear transformation. 169 | 170 | "set properties." 171 | me.arch = arch 172 | me.num_layers = len(arch) 173 | me.sigma = sigma 174 | assert(me.num_layers > 1) 175 | 176 | "init layers." 177 | me.v = ts.matrix("v") 178 | 179 | def two_layer_recognition(v): 180 | num_hidden = 4 * me.arch[1] 181 | Wv = theano.shared(randn01(num_hidden, arch[0]), name="Wv") 182 | bv = theano.shared(np.zeros((num_hidden, 1)), name="bv", broadcastable=(False,True)) 183 | Wmu = theano.shared(randn01(arch[1], num_hidden), name="Wmu") 184 | bmu = theano.shared(np.zeros((arch[1], 1)), name="bmu", broadcastable=(False, True)) 185 | Wd = theano.shared(randn01(arch[1], num_hidden), name="Wd") 186 | bd = theano.shared(np.zeros((arch[1], 1)), name="bd", broadcastable=(False, True)) 187 | z = nonlinear_f(ts.dot(Wv, v) + bv) 188 | d = ts.exp(ts.dot(Wd, z) + bd) 189 | mu = ts.dot(Wmu, z) + bmu 190 | xs = ts.matrix('x') 191 | return (mu, 192 | d, 193 | [Wv, bv, Wmu, bmu, Wd, bd], 194 | theano.function([v, xs], mu + 1/ts.sqrt(d) * xs), 195 | theano.function([v], z) 196 | ) 197 | 198 | me.sample_eps = lambda V: npr.normal(0, 1, (arch[1], V.shape[1])) 199 | (me.mu, me.d, me.param, me.sample, me.propup) = two_layer_recognition(me.v) 200 | me.G2 = [np.zeros(x.get_value().shape) for x in me.param] # variance of gradient. 201 | me.get_mu = theano.function([me.v], me.mu) 202 | me.get_d = theano.function([me.v], me.d) 203 | 204 | "free energy and gradients." 205 | me.energy = 0; 206 | for layer in range(1, me.num_layers): 207 | me.energy += .5 * me.sigma * ((me.mu * me.mu).sum() + ts.sum(1/me.d) + ts.sum(ts.log(me.d))) 208 | me.get_energy = theano.function([me.v], me.energy) 209 | me.gradient = ts.grad(me.energy, me.param) 210 | me.get_grad = theano.function([me.v], me.gradient) 211 | 212 | """ stochastic gradients. 213 | trick. pretend our objective is inner product with the stochastic gradients. 214 | """ 215 | me.grad_gm = ts.matrix('grad_gm') 216 | me.eps = ts.matrix('eps') 217 | me.obj_mu = -ts.sum(me.mu * me.grad_gm) 218 | me.obj_R = -.5 * ts.sum(me.grad_gm * me.eps * 1/ts.sqrt(me.d)) 219 | me.stoc_grad = ts.grad(me.obj_mu + me.obj_R, me.param) 220 | me.get_stoc_grad = theano.function([me.v] + [me.grad_gm] + [me.eps], me.stoc_grad) 221 | 222 | def pack(me): 223 | param = dict() 224 | for p in me.param: 225 | param.update({str(p): p.get_value()}) 226 | return param 227 | 228 | 229 | "parallel, if server is reachable; otherwise, use map." 
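# If an IPython.parallel controller is reachable (e.g. one started with
# `ipcluster start`), Client() succeeds, the imports above are replayed on
# every engine, and the load-balanced view's map spreads work over engines;
# otherwise the except branch falls back to the builtin map so training runs
# in a single process.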
230 | try: 231 | rc = Client() 232 | num_threads = len(rc) 233 | rc[:].use_dill() 234 | for _import in imports: 235 | rc[:].execute(_import) 236 | view = rc.load_balanced_view() 237 | view.block = True 238 | mapf = view.map 239 | except: 240 | "cannot connect to parallel server." 241 | num_threads = 1 242 | mapf = map 243 | 244 | class AutoEncoder(object): 245 | """ 246 | train/test DLGM on datasets. 247 | """ 248 | def __init__(me, arch, batchsize = 1, num_sample = 1, kappa = 1, sigma = 1, 249 | stepsize=0.1, num_label=2, ell=10, c = 1, v = 1): 250 | 251 | if os.environ.has_key('hidden'): 252 | arch[1] = int(os.environ['hidden']) 253 | me.num_threads = num_threads 254 | printBlue('> Thread Pool (%d)' % me.num_threads) 255 | me.arch = arch 256 | me.kappa = kappa 257 | me.sigma = sigma 258 | me.batchsize = batchsize 259 | me.stepsize = stepsize 260 | me.num_sample = num_sample 261 | 262 | me.ell = ell 263 | me.c = c 264 | me.num_label = num_label 265 | 266 | if os.environ.has_key('ell'): 267 | me.ell = float(os.environ['ell']) 268 | if os.environ.has_key('c'): 269 | me.c = float(os.environ['c']) 270 | if os.environ.has_key('kappa'): 271 | me.kappa = float(os.environ['kappa']) 272 | if os.environ.has_key('sigma'): 273 | me.sigma = float(os.environ['sigma']) 274 | if os.environ.has_key('stepsize'): 275 | me.stepsize = float(os.environ['stepsize']) 276 | me.stepsize_w = me.stepsize 277 | if os.environ.has_key('stepsize_w'): 278 | me.stepsize_w = float(os.environ['stepsize_w']) 279 | if os.environ.has_key('output'): 280 | me.output_path = os.environ['output'] 281 | else: 282 | me.output_path = 'default' 283 | print 'ell = ', me.ell, 'c = ', me.c, 'sigma = ', me.sigma, 'kappa = ', me.kappa, \ 284 | 'stepsize = ', me.stepsize, 'arch = ', me.arch 285 | print 'nonlinear_f = ', nonlinear_s 286 | 287 | printBlue('> Compiling neural network') 288 | me.W = np.zeros((4 * arch[1] + 1, me.num_label)) 289 | me.W_G2 = np.zeros_like(me.W) 290 | me.gmodel = Decoder(me.arch, kappa=me.kappa) 291 | me.rmodel = Encoder(me.arch, sigma=me.sigma) 292 | 293 | 294 | def __concat__(me, xi): 295 | latent = [1] 296 | latent += list(xi) 297 | latent = np.array(latent) 298 | return latent 299 | 300 | def process(me, ti, V, Y = []): 301 | """ 302 | process one single data point. 303 | > return: (grad of generative model, grad of recognition model) 304 | > input 305 | ti: thread id. 306 | v: data point. 307 | """ 308 | rmodel = me.rmodel 309 | gmodel = me.gmodel 310 | 311 | grad_g = [] 312 | grad_r = [] 313 | grad_w = np.zeros_like(me.W) 314 | 315 | for si in range(me.num_sample): 316 | "first sample stochastic variables." 317 | "eps is randomness for recognition model, xi is randomness for generative model" 318 | eps = rmodel.sample_eps(V) 319 | xi = rmodel.sample(V, eps) 320 | 321 | "compute gradient of generative model." 322 | gg = gmodel.get_grad(V, xi) 323 | gg = param_neg(gg) 324 | grad_g = param_add(grad_g, gg) 325 | 326 | "compute gradient of regularizer in generative model." 327 | gg_reg = gmodel.get_grad_reg() 328 | grad_g = param_add(grad_g, gg_reg) 329 | 330 | "compute free-energy gradient of recognition model." 331 | gr = rmodel.get_grad(V) 332 | grad_r = param_add(grad_r, gr) 333 | 334 | "compute stochastic gradient of recognition model." 
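        # gg_xi below is the gradient of the decoder log-likelihood with
        # respect to the latent sample xi; get_stoc_grad backpropagates it
        # through the encoder's mu and 1/sqrt(d) (via the surrogate objectives
        # obj_mu and obj_R), which gives the stochastic-backpropagation
        # estimator for the recognition parameters.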
335 | gg_xi = gmodel.get_grad_xi(V, xi) 336 | 337 | "add supervision" 338 | if Y != []: 339 | # latents = rmodel.get_mu(V) 340 | # latents = xi 341 | latents = rmodel.propup(V) 342 | for (ni, (y, latent)) in enumerate(zip(Y, latents.T)): 343 | latent = me.__concat__(latent) 344 | resp = me.ell + np.dot(latent, me.W) - np.dot(latent, me.W[:,y]) 345 | resp[y] = 0 346 | yp = np.argmax(resp) 347 | grad_w[:,yp] += latent 348 | grad_w[:,y] -= latent 349 | 350 | # gg_xi[:, ni] -= me.c * (me.W[1:, yp] - me.W[1:, y]) 351 | 352 | gr_stoc = rmodel.get_stoc_grad(V, gg_xi, eps) 353 | grad_r = param_add(grad_r, gr_stoc) 354 | 355 | grad_g = param_mul_scalar(grad_g, 1.0/me.num_sample) 356 | grad_r = param_mul_scalar(grad_r, 1.0/me.num_sample) 357 | grad_w /= me.num_sample 358 | 359 | return (grad_g, grad_r, grad_w) 360 | 361 | def neg_lhood(me, data): 362 | eps = me.rmodel.sample_eps(data.T) 363 | xi = me.rmodel.sample(data.T, eps) 364 | nlh = -me.gmodel.get_lhood(data.T, xi) 365 | return nlh 366 | 367 | def test(me, data, label): 368 | predict = [] 369 | acc = 0 370 | # eps = me.rmodel.sample_eps(data.T) 371 | # xi = me.rmodel.sample(data.T, eps).T # use posterior mean to make predictions. 372 | # xi = me.rmodel.get_mu(data.T).T 373 | xi = me.rmodel.propup(data.T).T 374 | for (v, lb, x) in zip(data, label, xi): 375 | # eps = me.rmodel.sample_eps(v) 376 | # xi = me.rmodel.sample(v, eps) 377 | latent = me.__concat__(x) 378 | resp = np.dot(latent, me.W) 379 | yp = np.argmax(resp) 380 | predict += [yp] 381 | if yp == lb: 382 | acc += 1 383 | acc /= float(len(data)) 384 | return (predict, acc) 385 | 386 | def reconstruct(me, data): 387 | eps = me.rmodel.sample_eps(data.T) 388 | xi = me.rmodel.sample(data.T, eps) 389 | recon = me.gmodel.activate(xi).T 390 | return (recon, xi) 391 | 392 | def train(me, data, label, num_iter, test_data = [], test_label = []): 393 | """ 394 | start the training algorithm. 395 | > input 396 | data: N x D data matrix, each row is a data of dimension D. 
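            label: length-N vector of integer class labels, used by the max-margin term.
            num_iter: number of passes (epochs) over the training data.
            test_data, test_label: optional held-out set; when given, accuracy,
                reconstruction error and negative likelihood are logged every
                LAG epochs and saved under ../result/<me.output_path>/.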
397 | """ 398 | printBlue('> Start training neural nets') 399 | 400 | os.system('mkdir -p ../result/%s' % me.output_path) 401 | 402 | data = np.array(data) 403 | lhood = [] 404 | test_lhood = [] 405 | recon_err = [] 406 | test_recon_err = [] 407 | train_recon_err = [] 408 | accuracy = [] 409 | 410 | LAG = 10 411 | ta = time.time() 412 | for it in range(num_iter): 413 | allind = set(range(data.shape[0])) 414 | while len(allind) >= me.batchsize: 415 | "extract mini-batch" 416 | ind = npr.choice(list(allind), me.batchsize, replace=False) 417 | allind -= set(ind) 418 | V = data[ind, :].T 419 | Y = label[ind] 420 | 421 | "compute gradients" 422 | result = mapf(me.process, [0], [V], [Y]) 423 | 424 | grad_r = [] 425 | grad_g = [] 426 | grad_w = np.zeros_like(me.W) 427 | 428 | for (ti, res) in enumerate(result): 429 | grad_g = param_add(grad_g, res[0]) 430 | grad_r = param_add(grad_r, res[1]) 431 | grad_w += res[2] 432 | 433 | grad_g = param_mul_scalar(grad_g, 1.0/len(V)); 434 | grad_r = param_mul_scalar(grad_r, 1.0/len(V)); 435 | grad_w /= len(V) 436 | 437 | 438 | "aggregate gradients" 439 | AdaGRAD(me.gmodel.param, grad_g, me.gmodel.G2, me.stepsize) 440 | AdaGRAD(me.rmodel.param, grad_r, me.rmodel.G2, me.stepsize) 441 | AdaGRAD([me.W], [grad_w], [me.W_G2], me.stepsize_w) 442 | 443 | "evaluate" 444 | if test_data != [] and (it+1) % LAG == 0: 445 | tb = time.time() 446 | [predict, acc] = me.test(test_data, test_label) 447 | accuracy += [acc] 448 | # print '\tGenerative Model', me.gmodel.pack() 449 | # print '\tRecognition Model', me.rmodel.pack() 450 | (recon, xi) = me.reconstruct(test_data) 451 | recon_err += [np.abs(recon - test_data).sum() / float(test_data.shape[0]) / float(test_data.shape[1])] 452 | 453 | test_lhood += [me.neg_lhood(test_data)] 454 | lhood += [me.neg_lhood(data)] 455 | 456 | (recon_train, xi_train) = me.reconstruct(data) 457 | train_recon_err += [np.abs(recon_train - data).sum() / float(data.shape[0]) / float(data.shape[1])] 458 | 459 | time_elapsed = (tb-ta) / float(LAG) 460 | print 'epoch = ', it, 'time elapsed = ', time_elapsed, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 'test recon err', \ 461 | recon_err[-1], 'train recon err', train_recon_err[-1], 'test acc', acc 462 | 463 | sio.savemat('../result/%s/recon.mat' % me.output_path, {'recon': recon, 'xi': xi, 'xi_train':xi_train, 'data':test_data, 464 | 'recon_train':recon_train, 'lhood':lhood, 'test_lhood':test_lhood, 'recon_err':recon_err, 465 | 'train_recon_err':train_recon_err, 'test_acc':accuracy, 'time_elapsed':time_elapsed}) 466 | 467 | ta = time.time() 468 | 469 | with open('../result/%s/log.txt' % me.output_path, "a") as output: 470 | output.write('\n') 471 | output.write(' '.join(toStr(['ell = ', me.ell, 'c = ', me.c, 'sigma = ', me.sigma, 'kappa = ', me.kappa, \ 472 | 'stepsize = ', me.stepsize, 'arch = ', me.arch[0], me.arch[1]]))+'\n') 473 | output.write(' '.join(toStr(['nonlinear_f = ', nonlinear_s]))+'\n') 474 | output.write(' '.join(toStr(['epoch = ', it, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 475 | 'test recon err', recon_err[-1], 'test acc', acc]))+'\n') 476 | output.flush() 477 | output.close() 478 | 479 | 480 | 481 | printBlue('> Training complete') 482 | 483 | if __name__ == "__main__": 484 | model = DeepLatentGM([2,4]) 485 | model.train(npr.randn(1024,2), 16) 486 | print 'Generative Model', model.gmodel.pack() 487 | print 'Recognition Model', model.rmodel.pack() 488 | -------------------------------------------------------------------------------- /DLGM/dlgm.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | implements the models in paper "stochastic backpropagation in DLGMs" 3 | including 4 | * generative model. 5 | * recognition model. 6 | """ 7 | "let client and server have the same imports." 8 | imports = ['import numpy as np', 9 | 'import numpy.random as npr', 10 | 'import theano', 11 | 'from theano import config', 12 | 'import sys, os', 13 | 'import scipy.io as sio', 14 | 'import theano.sandbox.linalg as ta', 15 | 'import theano.tensor as ts', 16 | 'from color import *', 17 | 'from utils import *'] 18 | for _import in imports: 19 | exec _import 20 | import pdb 21 | 22 | import time 23 | from IPython.parallel import Client 24 | 25 | theano.config.exception_verbosity = 'low' 26 | 27 | ts.logistic = lambda z: 1 / (1 + ts.exp(-z)) 28 | np.logistic = lambda z: 1 / (1 + np.exp(-z)) 29 | 30 | toInt = np.vectorize(int) 31 | toStr = np.vectorize(str) 32 | strConcat = lambda ls : ' '.join(toStr(ls)) 33 | 34 | get_value = lambda x: x.get_value() if x != None else None 35 | get_value_all = lambda xs: [get_value(x) for x in xs if x != None] 36 | 37 | nonlinear_f = lambda x : ts.log(1+ts.exp(x)) # smooth ReLU. 38 | nonlinear_s = "smooth ReLU" 39 | if os.environ.has_key('nonlinear'): 40 | nonlinear_s = os.environ['nonlinear'] 41 | if nonlinear_s == "ReLU": 42 | f = lambda x : ts.maximum(0, x) # ReLU. 43 | if nonlinear_s == "tanh": 44 | f = lambda x : ts.tanh(x) 45 | 46 | 47 | def AdaGRAD(param, grad, G2, stepsize): 48 | """ 49 | adaptive sub-gradient algorithm for tensor-shared objects. 50 | > input: 51 | param: parameters, tensor-shared objects. 52 | grad: gradient, list of numpy arrays. 53 | G2: variance of gradient, list of numpy arrays 54 | """ 55 | for (p, g, g2) in zip(param, grad, G2): 56 | g2[:] = (g2 + g * g)[:] 57 | if type(p) == theano.tensor.sharedvar.TensorSharedVariable: 58 | p.set_value(p.get_value() - stepsize * g / (1e-4 + np.sqrt(g2))) 59 | elif type(p) == np.ndarray: 60 | p[:] = p[:] - stepsize * g[:] / (1e-4 + np.sqrt(g2[:])) 61 | 62 | class GenerativeModel: 63 | """ generative model 64 | """ 65 | def __init__(me, arch, kappa=0.1): 66 | """ 67 | create the deep latent Gaussian model. 68 | arch: architecture, [vis, hidden_1, hidden_2, ...] 69 | """ 70 | "set options." 71 | me.f = ts.maximum # nonlinear transformation. 72 | me.lhoodFunc = lambda v, resp: ts.sum(v * ts.log(ts.logistic(resp)) + (1-v) * ts.log(1-ts.logistic(resp))) 73 | 74 | "set properties." 75 | me.arch = arch 76 | me.num_layers = len(arch) 77 | me.kappa = kappa 78 | assert(me.num_layers > 1) 79 | 80 | "init layers." 
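        # The loop below builds the DLGM top-down pass: at the top layer
        # h_L = G_L xi_L, at intermediate layers
        # h_l = G_l xi_l + W_l f(h_{l+1}) + b_l, and at the visible layer
        # h_0 = W_0 f(h_1) + b_0, where f is the rectification me.f(0, .);
        # lhoodFunc then scores v under Bernoulli(logistic(h_0)).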
81 | (me.G, me.W, me.b, me.xi, me.h) = tuple([[None]*(me.num_layers) for i in range(5)]) 82 | for layer in range(me.num_layers-1, -1, -1): 83 | if layer < me.num_layers-1: 84 | me.W[layer] = theano.shared(np.asarray(randn01(arch[layer], arch[layer+1]), config.floatX), name="W%d" % layer) 85 | me.b[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="b%d" % layer, broadcastable=(False,True)) 86 | me.h[layer] = 0 87 | if layer > 0: 88 | me.G[layer] = theano.shared(np.asarray(np.eye(arch[layer]), config.floatX), name="G%d" % layer) 89 | me.xi[layer] = ts.matrix("xi%d" % layer) 90 | me.h[layer] += ts.dot(me.G[layer], me.xi[layer]) 91 | if layer < me.num_layers-1: 92 | me.h[layer] += ts.dot(me.W[layer], me.f(0, me.h[layer+1])) + me.b[layer] 93 | 94 | me.param = me.G[1:] + me.W[:-1] + me.b[:-1] 95 | me.G2 = [np.asarray(np.zeros(x.get_value().shape), config.floatX) for x in me.param] # variance of gradient. 96 | 97 | "define objective." 98 | me.v = ts.matrix("v") 99 | me.lhood = me.lhoodFunc(me.v, me.h[0]) 100 | me.get_lhood = theano.function([me.v] + me.xi[1:], me.lhood) 101 | me.reg = me.kappa * sum([ts.sum(p * p) for p in me.param]) 102 | me.get_reg = theano.function([], me.reg) 103 | 104 | "define gradient." 105 | me.gradient = ts.grad(me.lhood, me.param) 106 | me.gradient_xi = ts.grad(me.lhood, me.xi[1:]) 107 | # me.hessian_xi = ts.hessian(me.lhood, me.xi[1:]) 108 | me.get_grad = theano.function([me.v] + me.xi[1:], me.gradient) 109 | me.get_grad_xi = theano.function([me.v] + me.xi[1:], me.gradient_xi) 110 | # me.get_hess_xi = theano.function([me.v] + me.xi[1:], me.hessian_xi) 111 | me.gradient_reg = ts.grad(me.reg, me.param) 112 | me.get_grad_reg = theano.function([], me.gradient_reg) 113 | 114 | "define utils." 115 | me.generate = theano.function(me.xi[1:], ts.logistic(me.h[0])) 116 | me.hidden_activation = ts.vector("hidden_activiation") 117 | me.hidden_rectified = me.f(0, me.hidden_activation) 118 | me.nonlinear = theano.function([me.hidden_activation], me.hidden_rectified) 119 | 120 | def sample(me, xi): 121 | resp = me.generate(*xi) 122 | return toInt(npr.rand(*resp.shape) < resp) 123 | 124 | def activate(me, xi): 125 | return me.generate(*xi) 126 | 127 | def pack(me): 128 | return {'G': get_value_all(me.G), \ 129 | 'W': get_value_all(me.W), 130 | 'b': get_value_all(me.b)} 131 | 132 | class RecognitionModel: 133 | """ recognition model (interface) 134 | since xi \sim \Normal(\mu, C) for each layer. 135 | the recognition fits its parameters (\mu, C) discriminatively. 136 | 137 | a simple recognition model uses a two layer NN to fit each parameter. 138 | see DLGM appendix A. 139 | """ 140 | def __init__(me, arch, num_hidden=10, sigma=1): 141 | """ 142 | create the deep latent Gaussian recognition model. 143 | arch: architecture, [vis, hidden_1, hidden_2, ...] 144 | """ 145 | "set options." 146 | me.f = ts.maximum # nonlinear transformation. 147 | 148 | "set properties." 149 | me.arch = arch 150 | me.num_layers = len(arch) 151 | me.num_hidden = num_hidden 152 | me.sigma = sigma 153 | assert(me.num_layers > 1) 154 | 155 | "init layers." 156 | me.v = ts.matrix("v") # N x K matrix, N is the sample size, K is the dimension. 
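        # Rough sketch of the per-layer Gaussian q(xi_l | v) that the loop below builds,
        # following DLGM appendix A (f is the rectifier max(0, .)):
        #     z_l  = f(Wv_l v + bv_l)         shared hidden features
        #     mu_l = Wmu_l z_l + bmu_l        mean
        #     d_l  = exp(Wd_l z_l + bd_l)     diagonal precision
        #     u_l  = Wu_l z_l + bu_l          rank-one factor (unused by the diagonal
        #                                     Rdot actually compiled below)
        # so a sample is xi_l = mu_l + d_l**(-1/2) * eps_l with eps_l ~ N(0, I).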
157 | (me.Wv, me.Wu, me.Wd, me.Wmu, me.bv, me.bu, me.bd, me.bmu, me.z, me.d, me.u, me.mu, me.R, me.C) \ 158 | = tuple([[None] * me.num_layers for i in range(14)]) 159 | for layer in range(1, me.num_layers): 160 | me.Wv[layer] = theano.shared(np.asarray(randn01(num_hidden, arch[0]), config.floatX), name="Wv%d" % layer) 161 | me.Wu[layer] = theano.shared(np.asarray(randn01(arch[layer], num_hidden), config.floatX), name="Wu%d" % layer) 162 | me.Wd[layer] = theano.shared(np.asarray(randn01(arch[layer], num_hidden), config.floatX), name="Wd%d" % layer) 163 | me.Wmu[layer] = theano.shared(np.asarray(randn01(arch[layer], num_hidden), config.floatX), name="Wmu%d" % layer) 164 | me.bv[layer] = theano.shared(np.asarray(np.zeros((num_hidden, 1)), config.floatX), name="bv%d" % layer, broadcastable=(False, True)) 165 | me.bu[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="bu%d" % layer, broadcastable=(False, True)) 166 | me.bd[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="bd%d" % layer, broadcastable=(False, True)) 167 | me.bmu[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="bmu%d" % layer, broadcastable=(False, True)) 168 | me.z[layer] = me.f(0, ts.dot(me.Wv[layer], me.v) + me.bv[layer]) 169 | me.mu[layer] = ts.dot(me.Wmu[layer], me.z[layer]) + me.bmu[layer] 170 | me.d[layer] = ts.exp(ts.dot(me.Wd[layer], me.z[layer]) + me.bd[layer]) 171 | me.u[layer] = ts.dot(me.Wu[layer], me.z[layer]) + me.bu[layer] 172 | 173 | 174 | """model covariance jointly 175 | utDneg1u = sum([ts.dot(u, u/d) for (u, d) in zip(me.u, me.d)]) 176 | me.eta = 1/(1+utDneg1u) 177 | me.Rdot = theano.function([me.v] + [tensor.vector('x') for u in me.u], \ 178 | [1/ts.sqrt(d) * x - ts.dot(1/ts.sqrt(d) * x, u) * ts.dot(u, 1/d) \ 179 | * (1-ts.sqrt(me.eta)) / utDneg1u \ 180 | for (u, d, x) in zip(me.u, me.d, me.x) \ 181 | ]\ 182 | ) 183 | """ 184 | eps_s = [ts.matrix('x') for u in me.u] 185 | me.Rdot = theano.function([me.v] + eps_s[1:], \ 186 | [1/ts.sqrt(d) * x for (d, x) in zip(me.d[1:], eps_s[1:])] \ 187 | ) 188 | 189 | "utils." 190 | me.get_mu = theano.function([me.v], me.mu[1:]) 191 | me.get_u = theano.function([me.v], me.u[1:]) 192 | me.get_d = theano.function([me.v], me.d[1:]) 193 | me.get_z = theano.function([me.v], me.z[1:]) 194 | 195 | me.sample_eps = lambda v: [np.asarray(npr.randn(ac, v.shape[1]), config.floatX) \ 196 | for ac in arch[1:]] 197 | 198 | me.sample = lambda v, eps: param_add(me.get_mu(v), me.Rdot(v, *eps)) 199 | 200 | me.hidden_activation = ts.vector("hidden_activiation") 201 | me.hidden_rectified = me.f(0, me.hidden_activation) 202 | me.nonlinear = theano.function([me.hidden_activation], me.hidden_rectified) 203 | 204 | "free energy." 205 | me.energy = 0; 206 | for layer in range(1, me.num_layers): 207 | me.energy += me.sigma * (ts.sum(me.mu[layer] * me.mu[layer]) + ts.sum(1/me.d[layer])+ ts.sum(ts.log(me.d[layer]))) \ 208 | + 0 * ts.sum(me.u[layer] * me.u[layer]) 209 | me.energy *= 0 210 | me.get_energy = theano.function([me.v], me.energy) 211 | 212 | "free energy gradients." 213 | me.param = me.Wv[1:] + me.Wu[1:] + me.Wd[1:] + me.Wmu[1:] + me.bv[1:] + me.bu[1:] + me.bd[1:]+ me.bmu[1:] 214 | me.G2 = [np.asarray(np.zeros(x.get_value().shape), config.floatX) for x in me.param] # variance of gradient. 215 | me.gradient = ts.grad(me.energy, me.param) 216 | me.get_grad = theano.function([me.v], me.gradient) 217 | 218 | """ stochastic gradients. 219 | trick. 
pretend our objective is inner product with the stochastic gradients. 220 | """ 221 | me.grad_gm = [None] * me.num_layers 222 | me.eps = [None] * me.num_layers 223 | me.obj_mu = 0 224 | me.obj_R = 0 225 | for layer in range(1, me.num_layers): 226 | me.grad_gm[layer] = ts.matrix('grad_gm_%d' % layer) 227 | me.eps[layer] = ts.matrix('eps_%d' % layer) 228 | me.obj_mu += ts.sum(me.mu[layer] * me.grad_gm[layer]) 229 | me.obj_R += .5 * ts.sum(me.grad_gm[layer] * me.eps[layer] / ts.sqrt(me.d[layer])) + 0 * ts.sum(me.u[layer] * \ 230 | me.u[layer]) 231 | # me.obj_R += .5 * (ts.outer(me.grad_gm[layer], me.eps[layer]) * 1/ts.sqrt(me.d[layer])).sum() + 0 * ts.dot(me.u[layer].T, 232 | # me.u[layer]) 233 | me.stoc_grad = ts.grad(me.obj_mu + me.obj_R, me.param) 234 | me.get_stoc_grad = theano.function([me.v] + me.grad_gm[1:] + me.eps[1:], me.stoc_grad) 235 | 236 | def pack(me): 237 | return {'Wv': get_value_all(me.Wv), 238 | 'Wu': get_value_all(me.Wu), 239 | 'Wd': get_value_all(me.Wd), 240 | 'Wmu': get_value_all(me.Wmu), 241 | 'bv': get_value_all(me.bv), 242 | 'bu': get_value_all(me.bu), 243 | 'bd': get_value_all(me.bd), 244 | 'bmu': get_value_all(me.bmu)} 245 | 246 | 247 | class DeepLatentGM(object): 248 | """ 249 | train/test DLGM on datasets. 250 | """ 251 | def __init__(me, arch, batchsize = 1, num_sample = 1, kappa = 1, sigma = 1, rec_hidden = 100, 252 | stepsize=0.1, num_label=2, ell=100, c = 1, v = 1): 253 | if os.environ.has_key('hidden'): 254 | hidden = int(os.environ['hidden']) 255 | rec_hidden = 4 * hidden 256 | for i in range(1, len(arch)): 257 | arch[i] = hidden 258 | 259 | me.arch = arch 260 | me.kappa = kappa 261 | me.sigma = sigma 262 | me.batchsize = batchsize 263 | me.stepsize = stepsize 264 | me.num_sample = num_sample 265 | 266 | me.ell = ell 267 | me.c = c 268 | me.num_label = num_label 269 | me.v = 1 270 | 271 | if os.environ.has_key('ell'): 272 | me.ell = float(os.environ['ell']) 273 | if os.environ.has_key('c'): 274 | me.c = float(os.environ['c']) 275 | if os.environ.has_key('kappa'): 276 | me.kappa = float(os.environ['kappa']) 277 | if os.environ.has_key('sigma'): 278 | me.sigma = float(os.environ['sigma']) 279 | if os.environ.has_key('stepsize'): 280 | me.stepsize = float(os.environ['stepsize']) 281 | me.stepsize_w = me.stepsize 282 | if os.environ.has_key('stepsize_w'): 283 | me.stepsize_w = float(os.environ['stepsize_w']) 284 | if os.environ.has_key('output'): 285 | me.output_path = os.environ['output'] 286 | else: 287 | me.output_path = 'default' 288 | 289 | printRed(strConcat(['ell = ', me.ell, 'c = ', me.c, 'sigma = ', me.sigma, 'kappa = ', me.kappa, 290 | 'stepsize = ', me.stepsize, 'arch = ', strConcat(me.arch)])) 291 | printRed(strConcat(['nonlinear_f = ', nonlinear_s])) 292 | printBlue('> Compiling neural network') 293 | me.gmodel = GenerativeModel(me.arch, kappa=me.kappa) 294 | me.rmodel = RecognitionModel(me.arch, num_hidden=rec_hidden, sigma=me.sigma) 295 | 296 | me.W = np.zeros((sum(arch[1:])+1, me.num_label)) 297 | me.W_G2 = np.zeros_like(me.W) 298 | 299 | def __concat__(me, xi): 300 | latent = [1] 301 | for x in xi: 302 | latent += list(x) 303 | latent = np.array(latent) 304 | return latent 305 | 306 | def process(me, V, Y): 307 | """ 308 | process one single data point. 309 | > return: (grad of generative model, grad of recognition model) 310 | > input 311 | ti: thread id. 312 | v: data point. 
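        Y: integer class labels for the rows of V; they drive the hinge-loss
           supervision that produces grad_w for the classifier weights me.W.
        Note: besides the generative and recognition gradients, the returned tuple
        also contains grad_w, i.e. (grad_g, grad_r, grad_w).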
313 | """ 314 | rmodel = me.rmodel 315 | gmodel = me.gmodel 316 | 317 | V = np.array(V) 318 | if len(V.shape) < 2: 319 | V = np.array([V]) 320 | 321 | grad_g = [] 322 | grad_r = [] 323 | grad_w = np.zeros_like(me.W) 324 | 325 | for si in range(me.num_sample): 326 | "first sample stochastic variables." 327 | eps = rmodel.sample_eps(V.T) 328 | xi = rmodel.sample(V.T, eps) 329 | 330 | # pdb.set_trace() 331 | ta = time.clock() 332 | "compute gradient of generative model." 333 | gg = gmodel.get_grad(V.T, *xi) 334 | gg = param_neg(gg) 335 | grad_g = param_add(grad_g, gg) 336 | 337 | "compute gradient of regularizer in generative model." 338 | gg_reg = gmodel.get_grad_reg() 339 | gg_reg = param_mul_scalar(gg_reg, me.kappa) 340 | grad_g = param_add(grad_g, gg_reg) 341 | 342 | "compute free-energy gradient of recognition model." 343 | gr = rmodel.get_grad(V.T) 344 | grad_r = param_add(grad_r, gr) 345 | 346 | "compute stochastic gradient of recognition model." 347 | gg_xi = gmodel.get_grad_xi(V.T, *xi) 348 | gg_xi = param_neg(gg_xi) 349 | 350 | "add supervision" 351 | code = rmodel.get_mu(V.T) 352 | for vi in range(V.shape[0]): 353 | latent = me.__concat__([c[:, vi] for c in code]) 354 | y = Y[vi] 355 | 356 | resp = me.ell + np.dot(latent, me.W) - np.dot(latent, me.W[:,y]) 357 | resp[y] = 0 358 | yp = np.argmax(resp) 359 | grad_w[:,yp] += latent 360 | grad_w[:,y] -= latent 361 | 362 | # ind = 1 # skip bias. 363 | # for ni in range(len(gg_xi)): 364 | # for nj in range(len(gg_xi[ni])): 365 | # gg_xi[ni][nj] += me.c * (me.W[ind, yp] - me.W[ind, y]) 366 | # ind += 1 367 | 368 | gr_stoc = rmodel.get_stoc_grad(V.T, *(gg_xi + eps)) 369 | grad_r = param_add(grad_r, gr_stoc) 370 | 371 | grad_g = param_mul_scalar(grad_g, 1.0/me.num_sample) 372 | grad_r = param_mul_scalar(grad_r, 1.0/me.num_sample) 373 | grad_w /= me.num_sample 374 | 375 | return (grad_g, grad_r, grad_w) 376 | 377 | def neg_lhood(me, data): 378 | nlh = 0 379 | V = np.array(data); 380 | eps = me.rmodel.sample_eps(V.T) 381 | xi = me.rmodel.sample(V.T, eps) 382 | nlh -= me.gmodel.get_lhood(V.T, *xi) 383 | return nlh 384 | 385 | def test(me, data, label): 386 | predict = [] 387 | acc = 0 388 | xi = me.rmodel.get_mu(data.T) 389 | for (li, lb) in enumerate(label): 390 | latent = me.__concat__([x[:,li] for x in xi]) 391 | resp = np.dot(latent, me.W) 392 | yp = np.argmax(resp) 393 | predict += [yp] 394 | if yp == lb: 395 | acc += 1 396 | acc /= float(len(label)) 397 | return (predict, acc) 398 | 399 | def reconstruct(me, data): 400 | eps = me.rmodel.sample_eps(data.T) 401 | xi = me.rmodel.sample(data.T, eps) 402 | recon = me.gmodel.activate(xi).T 403 | return (recon, xi) 404 | 405 | def train(me, data, label, num_iter, test_data = [], test_label = []): 406 | """ 407 | start the training algorithm. 408 | > input 409 | data: N x D data matrix, each row is a data of dimension D. 
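            label: length-N vector of integer class labels, used for the hinge-loss
                supervision on the classifier weights me.W.
            num_iter: number of passes (epochs) over the training data.
            test_data, test_label: optional held-out set; when provided, negative
                likelihood, reconstruction error and test accuracy are logged every
                LAG epochs and saved under ../result/<output_path>/.
        A hypothetical call (shapes chosen for illustration only, binary inputs assumed):
            model = DeepLatentGM([784, 200, 200], batchsize=32, num_label=10)
            model.train(train_x, train_y, 100, test_data=test_x, test_label=test_y)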
410 | """ 411 | printBlue('> Start training neural nets') 412 | 413 | os.system('mkdir -p ../result/%s' % me.output_path) 414 | 415 | data = np.array(data).astype(np.float32) 416 | if test_data != []: 417 | test_data = np.array(test_data).astype(np.float32) 418 | label = label.astype(np.float32) 419 | if test_label != []: 420 | test_label = test_label.astype(np.float32) 421 | 422 | lhood = [] 423 | test_lhood = [] 424 | recon_err = [] 425 | train_recon_err = [] 426 | accuracy = [] 427 | 428 | LAG = 10 429 | ta = time.time() 430 | for it in range(num_iter): 431 | allind = set(range(data.shape[0])) 432 | while len(allind) >= me.batchsize: 433 | "extract mini-batch" 434 | ind = npr.choice(list(allind), me.batchsize, replace=False) 435 | allind -= set(ind) 436 | V = data[ind, :] 437 | Y = label[ind] 438 | 439 | "compute gradients" 440 | 441 | (grad_g, grad_r, grad_w) = me.process(V, Y) 442 | 443 | grad_g = param_mul_scalar(grad_g, 1.0/len(V)); 444 | grad_r = param_mul_scalar(grad_r, 1.0/len(V)); 445 | grad_w /= len(V) 446 | 447 | "aggregate gradients" 448 | AdaGRAD(me.gmodel.param, grad_g, me.gmodel.G2, me.stepsize) 449 | AdaGRAD(me.rmodel.param, grad_r, me.rmodel.G2, me.stepsize) 450 | AdaGRAD([me.W], [grad_w], [me.W_G2], me.stepsize_w) 451 | 452 | "evaluate" 453 | if test_data != [] and (it+1) % LAG == 0: 454 | tb = time.time() 455 | [predict, acc] = me.test(test_data, test_label) 456 | accuracy += [acc] 457 | # print '\tGenerative Model', me.gmodel.pack() 458 | # print '\tRecognition Model', me.rmodel.pack() 459 | (recon, xis) = me.reconstruct(test_data) 460 | recon_err += [np.abs(recon - test_data).sum() / float(test_data.shape[0]) / float(test_data.shape[1])] 461 | 462 | test_lhood += [me.neg_lhood(test_data)] 463 | lhood += [me.neg_lhood(data)] 464 | 465 | (recon_train, xis_train) = me.reconstruct(data) 466 | train_recon_err += [np.abs(recon_train - data).sum() / float(data.shape[0]) / float(data.shape[1])] 467 | 468 | time_elapsed = (tb-ta) / float(LAG) 469 | 470 | print 'epoch = ', it, 'time elapsed = ', time_elapsed, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 'test recon err', \ 471 | recon_err[-1], 'train recon err', train_recon_err[-1], 'test acc', acc 472 | 473 | result = {'recon': recon, 'xi': xis, 'xi_train':xis_train, 'data':test_data, 474 | 'recon_train':recon_train, 'lhood':lhood, 'test_lhood':test_lhood, 'recon_err':recon_err, 475 | 'train_recon_err':train_recon_err, 'test_acc':accuracy, 'time_elapsed':time_elapsed} 476 | result.update(me.rmodel.pack()) 477 | result.update(me.gmodel.pack()) 478 | sio.savemat('../result/%s/recon.mat' % me.output_path, result) 479 | 480 | 481 | with open('../result/%s/log.txt' % me.output_path, "a") as output: 482 | print >>output, 'epoch = ', it, 'time elapsed = ', time_elapsed, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 'test recon err', \ 483 | recon_err[-1], 'train recon err', train_recon_err[-1], 'test acc', acc 484 | output.flush() 485 | output.close() 486 | 487 | printBlue('> Training complete') 488 | 489 | if __name__ == "__main__": 490 | model = DeepLatentGM([2,4]) 491 | model.train(npr.randn(1024,2), 16) 492 | print 'Generative Model', model.gmodel.pack() 493 | print 'Recognition Model', model.rmodel.pack() 494 | -------------------------------------------------------------------------------- /RBM/rbm.py: -------------------------------------------------------------------------------- 1 | """This tutorial introduces restricted boltzmann machines (RBM) using Theano. 
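This copy of the tutorial code additionally takes a label input and defines a
multiclass hinge loss (`loss`, `classify`, `weights`) used for supervised
classification.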
2 | 3 | Boltzmann Machines (BMs) are a particular form of energy-based model which 4 | contain hidden variables. Restricted Boltzmann Machines further restrict BMs 5 | to those without visible-visible and hidden-hidden connections. 6 | """ 7 | import time 8 | 9 | try: 10 | import PIL.Image as Image 11 | except ImportError: 12 | import Image 13 | 14 | import numpy 15 | 16 | import theano 17 | import theano.tensor as T 18 | import os 19 | 20 | from theano.tensor.shared_randomstreams import RandomStreams 21 | 22 | from utils import tile_raster_images 23 | from logistic_sgd import load_data 24 | 25 | import pdb 26 | 27 | 28 | # start-snippet-1 29 | class RBM(object): 30 | """Restricted Boltzmann Machine (RBM) """ 31 | def __init__( 32 | self, 33 | input=None, 34 | label=None, 35 | n_visible=784, 36 | n_hidden=500, 37 | W=None, 38 | hbias=None, 39 | vbias=None, 40 | numpy_rng=None, 41 | theano_rng=None, 42 | c = 1, 43 | ell = 100, 44 | n_class = 10, 45 | ): 46 | """ 47 | RBM constructor. Defines the parameters of the model along with 48 | basic operations for inferring hidden from visible (and vice-versa), 49 | as well as for performing CD updates. 50 | 51 | :param input: None for standalone RBMs or symbolic variable if RBM is 52 | part of a larger graph. 53 | 54 | :param n_visible: number of visible units 55 | 56 | :param n_hidden: number of hidden units 57 | 58 | :param W: None for standalone RBMs or symbolic variable pointing to a 59 | shared weight matrix in case RBM is part of a DBN network; in a DBN, 60 | the weights are shared between RBMs and layers of a MLP 61 | 62 | :param hbias: None for standalone RBMs or symbolic variable pointing 63 | to a shared hidden units bias vector in case RBM is part of a 64 | different network 65 | 66 | :param vbias: None for standalone RBMs or a symbolic variable 67 | pointing to a shared visible units bias 68 | """ 69 | 70 | self.n_visible = n_visible 71 | self.n_hidden = n_hidden 72 | 73 | if numpy_rng is None: 74 | # create a number generator 75 | numpy_rng = numpy.random.RandomState(1234) 76 | 77 | if theano_rng is None: 78 | theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) 79 | 80 | if W is None: 81 | # W is initialized with `initial_W` which is uniformely 82 | # sampled from -4*sqrt(6./(n_visible+n_hidden)) and 83 | # 4*sqrt(6./(n_hidden+n_visible)) the output of uniform if 84 | # converted using asarray to dtype theano.config.floatX so 85 | # that the code is runable on GPU 86 | initial_W = numpy.asarray( 87 | numpy_rng.uniform( 88 | low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), 89 | high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), 90 | size=(n_visible, n_hidden) 91 | ), 92 | dtype=theano.config.floatX 93 | ) 94 | # theano shared variables for weights and biases 95 | W = theano.shared(value=initial_W, name='W', borrow=True) 96 | 97 | if hbias is None: 98 | # create shared variable for hidden units bias 99 | hbias = theano.shared( 100 | value=numpy.zeros( 101 | n_hidden, 102 | dtype=theano.config.floatX 103 | ), 104 | name='hbias', 105 | borrow=True 106 | ) 107 | 108 | if vbias is None: 109 | # create shared variable for visible units bias 110 | vbias = theano.shared( 111 | value=numpy.zeros( 112 | n_visible, 113 | dtype=theano.config.floatX 114 | ), 115 | name='vbias', 116 | borrow=True 117 | ) 118 | 119 | # initialize input layer for standalone RBM or layer0 of DBN 120 | self.input = input 121 | if not input: 122 | self.input = T.matrix('input') 123 | self.label = label 124 | if not label: 125 | self.label = T.matrix('label') 126 | 127 | self.W = W 128 | self.hbias = hbias 129 | self.vbias = vbias 130 | self.theano_rng = theano_rng 131 | # **** WARNING: It is not a good idea to put things in this list 132 | # other than shared variables created in this function. 133 | 134 | # initialize parameters for supervised learning. 135 | self.c = c 136 | self.ell = ell 137 | self.weights = theano.shared( 138 | value=numpy.zeros( 139 | (n_visible, n_class), 140 | dtype=theano.config.floatX 141 | ), 142 | name='weights', 143 | borrow=True 144 | ) 145 | # parameter grouping. 146 | self.params = [self.weights] 147 | self.G2 = [ 148 | theano.shared(value=numpy.zeros((n_visible, n_class), dtype=theano.config.floatX), borrow=True) 149 | ] 150 | # end-snippet-1 151 | 152 | def free_energy(self, v_sample): 153 | ''' Function to compute the free energy ''' 154 | wx_b = T.dot(v_sample, self.W) + self.hbias 155 | vbias_term = T.dot(v_sample, self.vbias) 156 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 157 | return -hidden_term - vbias_term 158 | 159 | def loss(self, vis, y): 160 | ell = T.cast(self.ell, dtype=theano.config.floatX) 161 | true_resp = (T.dot(vis, self.weights) * y).sum(axis=1, keepdims=True) 162 | true_resp = T.addbroadcast(true_resp, 1) 163 | return (ell * (1-y) + T.dot(vis, self.weights) - true_resp).max(axis=1).sum() 164 | 165 | def classify(self, vis): 166 | pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(vis) 167 | predict = T.dot(ph_mean, self.weights) 168 | return predict 169 | 170 | def propup(self, vis): 171 | '''This function propagates the visible units activation upwards to 172 | the hidden units 173 | 174 | Note that we return also the pre-sigmoid activation of the 175 | layer. As it will turn out later, due to how Theano deals with 176 | optimizations, this symbolic variable will be needed to write 177 | down a more stable computational graph (see details in the 178 | reconstruction cost function) 179 | 180 | ''' 181 | pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias 182 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 183 | 184 | def sample_h_given_v(self, v0_sample): 185 | ''' This function infers state of hidden units given visible units ''' 186 | # compute the activation of the hidden units given a sample of 187 | # the visibles 188 | pre_sigmoid_h1, h1_mean = self.propup(v0_sample) 189 | # get a sample of the hiddens given their activation 190 | # Note that theano_rng.binomial returns a symbolic sample of dtype 191 | # int64 by default.
If we want to keep our computations in floatX 192 | # for the GPU we need to specify to return the dtype floatX 193 | h1_sample = self.theano_rng.binomial(size=h1_mean.shape, 194 | n=1, p=h1_mean, 195 | dtype=theano.config.floatX) 196 | return [pre_sigmoid_h1, h1_mean, h1_sample] 197 | 198 | def propdown(self, hid): 199 | '''This function propagates the hidden units activation downwards to 200 | the visible units 201 | 202 | Note that we return also the pre_sigmoid_activation of the 203 | layer. As it will turn out later, due to how Theano deals with 204 | optimizations, this symbolic variable will be needed to write 205 | down a more stable computational graph (see details in the 206 | reconstruction cost function) 207 | 208 | ''' 209 | pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias 210 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 211 | 212 | def sample_v_given_h(self, h0_sample): 213 | ''' This function infers state of visible units given hidden units ''' 214 | # compute the activation of the visible given the hidden sample 215 | pre_sigmoid_v1, v1_mean = self.propdown(h0_sample) 216 | # get a sample of the visible given their activation 217 | # Note that theano_rng.binomial returns a symbolic sample of dtype 218 | # int64 by default. If we want to keep our computations in floatX 219 | # for the GPU we need to specify to return the dtype floatX 220 | v1_sample = self.theano_rng.binomial(size=v1_mean.shape, 221 | n=1, p=v1_mean, 222 | dtype=theano.config.floatX) 223 | return [pre_sigmoid_v1, v1_mean, v1_sample] 224 | 225 | def gibbs_hvh(self, h0_sample): 226 | ''' This function implements one step of Gibbs sampling, 227 | starting from the hidden state''' 228 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample) 229 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) 230 | return [pre_sigmoid_v1, v1_mean, v1_sample, 231 | pre_sigmoid_h1, h1_mean, h1_sample] 232 | 233 | def gibbs_vhv(self, v0_sample): 234 | ''' This function implements one step of Gibbs sampling, 235 | starting from the visible state''' 236 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample) 237 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample) 238 | return [pre_sigmoid_h1, h1_mean, h1_sample, 239 | pre_sigmoid_v1, v1_mean, v1_sample] 240 | 241 | # start-snippet-2 242 | def get_cost_updates(self, lr=0.1, persistent=None, k=1, update_method='adagrad'): 243 | """This functions implements one step of CD-k or PCD-k 244 | 245 | :param lr: learning rate used to train the RBM 246 | 247 | :param persistent: None for CD. For PCD, shared variable 248 | containing old state of Gibbs chain. This must be a shared 249 | variable of size (batch size, number of hidden units). 250 | 251 | :param k: number of Gibbs steps to do in CD-k/PCD-k 252 | 253 | Returns a proxy for the cost and the updates dictionary. The 254 | dictionary contains the update rules for weights and biases but 255 | also an update of the shared variable used to store the persistent 256 | chain, if one is used. 
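        :param update_method: 'sgd' or 'adagrad' (default); chooses between the plain
            SGD update and the AdaGrad-style update constructed below.

        Note: as currently written the differentiated cost is only the supervised
        hinge loss scaled by self.c; the CD/PCD sampling updates from the original
        tutorial are not built in this function.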
257 | 258 | """ 259 | cost = self.c * self.loss(self.input, self.label) 260 | updates = {}  # dictionary of parameter updates, filled in by the branches below 261 | gparams = T.grad(cost, self.params) 262 | # end-snippet-3 start-snippet-4 263 | if update_method == 'sgd': 264 | # constructs the update dictionary 265 | for gparam, param in zip(gparams, self.params): 266 | # make sure that the learning rate is of the right dtype 267 | updates[param] = param - gparam * T.cast( 268 | lr, 269 | dtype=theano.config.floatX 270 | ) 271 | elif update_method == 'adagrad': 272 | for gparam, param, g2 in zip(gparams, self.params, self.G2): 273 | # make sure that the learning rate is of the right dtype 274 | updates[g2] = g2 + gparam * gparam 275 | updates[param] = param - gparam * T.cast(lr, \ 276 | dtype=theano.config.floatX) \ 277 | / (1e-4 + T.sqrt(g2 + gparam * gparam)) 278 | 279 | monitoring_cost = 0 280 | train_err = 0 281 | return monitoring_cost, train_err, updates 282 | # end-snippet-4 283 | 284 | def get_error(self, predict, label): 285 | return T.neq(T.argmax(predict, axis=1), 286 | T.argmax(label, axis=1)).sum() / T.cast(label.shape[0], dtype=theano.config.floatX) 287 | 288 | 289 | def get_pseudo_likelihood_cost(self, updates): 290 | """Stochastic approximation to the pseudo-likelihood""" 291 | 292 | # index of bit i in expression p(x_i | x_{\i}) 293 | bit_i_idx = theano.shared(value=0, name='bit_i_idx') 294 | 295 | # binarize the input image by rounding to nearest integer 296 | xi = T.round(self.input) 297 | 298 | # calculate free energy for the given bit configuration 299 | fe_xi = self.free_energy(xi) 300 | 301 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 302 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 303 | # the result to xi_flip, instead of working in place on xi. 304 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 305 | 306 | # calculate free energy with bit flipped 307 | fe_xi_flip = self.free_energy(xi_flip) 308 | 309 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 310 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - 311 | fe_xi))) 312 | 313 | # increment bit_i_idx % number as part of updates 314 | updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible 315 | 316 | return cost 317 | 318 | def get_reconstruction_cost(self, updates, pre_sigmoid_nv): 319 | """Approximation to the reconstruction error 320 | 321 | Note that this function requires the pre-sigmoid activation as 322 | input. To understand why this is so you need to understand a 323 | bit about how Theano works. Whenever you compile a Theano 324 | function, the computational graph that you pass as input gets 325 | optimized for speed and stability. This is done by changing 326 | several parts of the subgraphs with others. One such 327 | optimization expresses terms of the form log(sigmoid(x)) in 328 | terms of softplus. We need this optimization for the 329 | cross-entropy since sigmoid of numbers larger than 30. (or 330 | even less than that) turn to 1. and numbers smaller than 331 | -30. turn to 0 which in turn will force Theano to compute 332 | log(0) and therefore we will get either -inf or NaN as 333 | cost. If the value is expressed in terms of softplus we do not 334 | get this undesirable behaviour. This optimization usually 335 | works fine, but here we have a special case. The sigmoid is 336 | applied inside the scan op, while the log is 337 | outside.
Therefore Theano will only see log(scan(..)) instead 338 | of log(sigmoid(..)) and will not apply the wanted 339 | optimization. We can not go and replace the sigmoid in scan 340 | with something else also, because this only needs to be done 341 | on the last step. Therefore the easiest and more efficient way 342 | is to get also the pre-sigmoid activation as an output of 343 | scan, and apply both the log and sigmoid outside scan such 344 | that Theano can catch and optimize the expression. 345 | 346 | """ 347 | 348 | cross_entropy = T.mean( 349 | T.sum( 350 | self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) + 351 | (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)), 352 | axis=1 353 | ) 354 | ) 355 | 356 | return cross_entropy 357 | 358 | 359 | def test_rbm(learning_rate=1, training_epochs=200, 360 | dataset='../data/mnist/mnist.pkl.gz', batch_size=32, 361 | n_chains=20, n_samples=10, output_folder='rbm_plots', 362 | n_hidden=200, n_example=-1): 363 | """ 364 | Demonstrate how to train and afterwards sample from it using Theano. 365 | 366 | This is demonstrated on MNIST. 367 | 368 | :param learning_rate: learning rate used for training the RBM 369 | 370 | :param training_epochs: number of epochs used for training 371 | 372 | :param dataset: path the the pickled dataset 373 | 374 | :param batch_size: size of a batch used to train the RBM 375 | 376 | :param n_chains: number of parallel Gibbs chains to be used for sampling 377 | 378 | :param n_samples: number of samples to plot for each chain 379 | 380 | """ 381 | datasets = load_data(dataset, n_example) 382 | 383 | train_set_x, train_set_y = datasets[0] 384 | test_set_x, test_set_y = datasets[2] 385 | 386 | def convert_to_ind(y, borrow=True): 387 | y = y.get_value() 388 | label = numpy.unique(y) 389 | newy = numpy.zeros((len(y), len(label))) 390 | for i in range(len(y)): 391 | newy[i, y[i]] = 1 392 | sharedy = theano.shared(numpy.asarray(newy, 393 | dtype=theano.config.floatX), 394 | borrow=borrow) 395 | return sharedy 396 | 397 | train_set_y_ind = convert_to_ind(train_set_y) 398 | test_set_y_ind = convert_to_ind(test_set_y) 399 | 400 | 401 | # compute number of minibatches for training, validation and testing 402 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size 403 | 404 | # allocate symbolic variables for the data 405 | index = T.lscalar() # index to a [mini]batch 406 | x = T.matrix('x') # the data is presented as rasterized images 407 | y = T.matrix('y') # the label is a N x C matrix, each row only true class is 1. 
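    # x and y are only symbolic placeholders here; train_rbm below binds them to
    # mini-batch slices of train_set_x / train_set_y_ind through its `givens` argument.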
408 | 409 | rng = numpy.random.RandomState(123) 410 | theano_rng = RandomStreams(rng.randint(2 ** 30)) 411 | 412 | # initialize storage for the persistent chain (state = hidden 413 | # layer of chain) 414 | persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden), 415 | dtype=theano.config.floatX), 416 | borrow=True) 417 | 418 | # construct the RBM class 419 | rbm = RBM(input=x, label=y, n_visible=28 * 28, 420 | n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng) 421 | 422 | # get the cost and the gradient corresponding to one step of CD-15 423 | cost, train_err, updates = rbm.get_cost_updates(lr=learning_rate, 424 | persistent=persistent_chain, k=15) 425 | 426 | ################################# 427 | # Training the RBM # 428 | ################################# 429 | if not os.path.isdir(output_folder): 430 | os.makedirs(output_folder) 431 | os.chdir(output_folder) 432 | 433 | # start-snippet-5 434 | # it is ok for a theano function to have no output 435 | # the purpose of train_rbm is solely to update the RBM parameters 436 | train_rbm = theano.function( 437 | [index], 438 | [cost, train_err], 439 | updates=updates, 440 | givens={ 441 | x: train_set_x[index * batch_size: (index + 1) * batch_size], 442 | y: train_set_y_ind[index * batch_size : (index + 1) * batch_size] 443 | }, 444 | name='train_rbm' 445 | ) 446 | 447 | tx = T.matrix('tx') # the data is presented as rasterized images 448 | ty = T.matrix('ty') # the label is a N x C matrix, each row only true class is 1. 449 | predict = rbm.classify(tx) 450 | test_err = rbm.get_error(predict, ty) 451 | test_rbm = theano.function( 452 | [], 453 | [predict, test_err], 454 | givens = { 455 | tx: test_set_x, 456 | ty: test_set_y_ind 457 | }, 458 | name = 'test_rbm' 459 | ) 460 | 461 | plotting_time = 0. 462 | start_time = time.clock() 463 | 464 | # go through training epochs 465 | test_err_list = [] 466 | for epoch in xrange(training_epochs): 467 | 468 | # go through the training set 469 | mean_cost = [] 470 | mean_train_err = [] 471 | for batch_index in xrange(n_train_batches): 472 | [cost, train_err] = train_rbm(batch_index) 473 | mean_cost += [cost] 474 | mean_train_err += [train_err] 475 | 476 | # Test on test set. 
477 | [predict, test_err] = test_rbm() 478 | test_err_list += [test_err] 479 | print 'Training epoch %d, cost = %f, test err = %f' % (epoch, numpy.mean(mean_cost), test_err) 480 | 481 | end_time = time.clock() 482 | 483 | pretraining_time = (end_time - start_time) - plotting_time 484 | 485 | print ('Training took %f minutes' % (pretraining_time / 60.)) 486 | # end-snippet-5 start-snippet-6 487 | ################################# 488 | # Sampling from the RBM # 489 | ################################# 490 | # find out the number of test samples 491 | number_of_test_samples = test_set_x.get_value(borrow=True).shape[0] 492 | 493 | # pick random test examples, with which to initialize the persistent chain 494 | test_idx = rng.randint(number_of_test_samples - n_chains) 495 | persistent_vis_chain = theano.shared( 496 | numpy.asarray( 497 | test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains], 498 | dtype=theano.config.floatX 499 | ) 500 | ) 501 | # end-snippet-6 start-snippet-7 502 | plot_every = 1000 503 | # define one step of Gibbs sampling (mf = mean-field) define a 504 | # function that does `plot_every` steps before returning the 505 | # sample for plotting 506 | ( 507 | [ 508 | presig_hids, 509 | hid_mfs, 510 | hid_samples, 511 | presig_vis, 512 | vis_mfs, 513 | vis_samples 514 | ], 515 | updates 516 | ) = theano.scan( 517 | rbm.gibbs_vhv, 518 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 519 | n_steps=plot_every 520 | ) 521 | 522 | # add to updates the shared variable that takes care of our persistent 523 | # chain :. 524 | updates.update({persistent_vis_chain: vis_samples[-1]}) 525 | # construct the function that implements our persistent chain. 526 | # we generate the "mean field" activations for plotting and the actual 527 | # samples for reinitializing the state of our persistent chain 528 | sample_fn = theano.function( 529 | [], 530 | [ 531 | vis_mfs[-1], 532 | vis_samples[-1] 533 | ], 534 | updates=updates, 535 | name='sample_fn' 536 | ) 537 | 538 | # create a space to store the image for plotting ( we need to leave 539 | # room for the tile_spacing as well) 540 | image_data = numpy.zeros( 541 | (29 * n_samples + 1, 29 * n_chains - 1), 542 | dtype='uint8' 543 | ) 544 | for idx in xrange(n_samples): 545 | # generate `plot_every` intermediate samples that we discard, 546 | # because successive samples in the chain are too correlated 547 | vis_mf, vis_sample = sample_fn() 548 | print ' ... plotting sample ', idx 549 | image_data[29 * idx:29 * idx + 28, :] = tile_raster_images( 550 | X=vis_mf, 551 | img_shape=(28, 28), 552 | tile_shape=(1, n_chains), 553 | tile_spacing=(1, 1) 554 | ) 555 | 556 | # construct image 557 | image = Image.fromarray(image_data) 558 | image.save('samples.png') 559 | # end-snippet-7 560 | os.chdir('../') 561 | 562 | if __name__ == '__main__': 563 | test_rbm(n_example=-1) 564 | #test_rbm(n_example=100) 565 | --------------------------------------------------------------------------------