├── .gitignore ├── data └── mnist │ ├── mnistSmall.mat │ └── mnistTiny.mat ├── readme.md ├── DLGM ├── launch_all.sh ├── test_sanity.py ├── utils.py ├── test_dlgm.py ├── color.py ├── test_mnist.py └── dlgm.py ├── va ├── test_sanity.py ├── utils.py ├── test_va.py ├── color.py ├── pegasos.py ├── test_mnist.py └── va.py └── RBM ├── utils.py ├── logistic_sgd.py └── rbm.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | *.pyc 3 | -------------------------------------------------------------------------------- /data/mnist/mnistSmall.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strin/DeepBayes/HEAD/data/mnist/mnistSmall.mat -------------------------------------------------------------------------------- /data/mnist/mnistTiny.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/strin/DeepBayes/HEAD/data/mnist/mnistTiny.mat -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | DeepBayes 2 | 3 | * [DLGM - code that reproduces deep latent generative models] (DLGM/) 4 | -------------------------------------------------------------------------------- /DLGM/launch_all.sh: -------------------------------------------------------------------------------- 1 | for num_node in 50 200 2 | do 3 | for kappa in 0 0.1 4 | do 5 | for sigma in 0 0.001 0.01 0.1 6 | do 7 | nohup python test_mnist.py $num_node $kappa $sigma & 8 | done 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /DLGM/test_sanity.py: -------------------------------------------------------------------------------- 1 | from dlgm import * 2 | import numpy as np 3 | import scipy.io as sio 4 | 5 | def test_h1_v3(): 6 | v1 = [1,0,1] 7 | v2 = [0,0,0] 8 | train_data = np.array([v1 for i in range(500)]+[v2 for i in range(500)]) 9 | test_data = np.array([v1 for i in range(50)] + [v2 for i in range(50)]) 10 | print 'training data', train_data 11 | model = DeepLatentGM([3, 4], batchsize=1, rec_hidden=1, kappa=0, stepsize=1) 12 | model.train(train_data, 10, test_data = train_data) 13 | print 'Generative Model', model.gmodel.pack() 14 | print 'Recognition Model', model.rmodel.pack() 15 | print 'Sample', model.sample(test_data) 16 | 17 | test_h1_v3() 18 | 19 | -------------------------------------------------------------------------------- /va/test_sanity.py: -------------------------------------------------------------------------------- 1 | from dlgm import * 2 | import numpy as np 3 | import scipy.io as sio 4 | 5 | def test_h1_v3(): 6 | v1 = [1,0,1] 7 | v2 = [0,0,0] 8 | train_data = np.array([v1 for i in range(500)]+[v2 for i in range(500)]) 9 | test_data = np.array([v1 for i in range(50)] + [v2 for i in range(50)]) 10 | print 'training data', train_data 11 | model = DeepLatentGM([3, 4], batchsize=1, rec_hidden=1, kappa=0, stepsize=1) 12 | model.train(train_data, 10, test_data = train_data) 13 | print 'Generative Model', model.gmodel.pack() 14 | print 'Recognition Model', model.rmodel.pack() 15 | print 'Sample', model.sample(test_data) 16 | 17 | test_h1_v3() 18 | 19 | -------------------------------------------------------------------------------- /va/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 
import numpy.linalg as npla 4 | import numpy.random as npr 5 | 6 | def param_op2(param, grad, op): 7 | for i in range(len(param)): 8 | param[i][:] = op(param[i], param[i])[:] 9 | 10 | def param_op(param, op): 11 | for i in range(len(param)): 12 | param[i][:] = op(param[i])[:] 13 | 14 | def param_add(param, grad): 15 | res = grad 16 | if param != []: 17 | for i in range(len(param)): 18 | res[i] += param[i] 19 | return res 20 | 21 | def param_mul_scalar(param, scalar): 22 | res = param 23 | for i in range(len(param)): 24 | res[i] = param[i] * scalar 25 | return res 26 | 27 | def param_neg(param): 28 | res = param 29 | for i in range(len(param)): 30 | res[i] = -param[i] 31 | return res 32 | 33 | def randn01(*shape): 34 | """ 35 | generate random vector/matrix from i.i.d. Gaussian. 36 | renormalize it to unit vector/matrix. 37 | """ 38 | M = npr.randn(*shape) 39 | return M/npla.norm(M) 40 | -------------------------------------------------------------------------------- /DLGM/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import numpy.linalg as npla 4 | import numpy.random as npr 5 | from theano import config 6 | 7 | def param_op2(param, grad, op): 8 | for i in range(len(param)): 9 | param[i][:] = op(param[i], param[i])[:] 10 | 11 | def param_op(param, op): 12 | for i in range(len(param)): 13 | param[i][:] = op(param[i])[:] 14 | 15 | def param_add(param, grad): 16 | res = grad 17 | if param != []: 18 | for i in range(len(param)): 19 | res[i] += param[i] 20 | return res 21 | 22 | def param_mul_scalar(param, scalar): 23 | res = param 24 | for i in range(len(param)): 25 | res[i] = param[i] * np.asarray(scalar, config.floatX) 26 | return res 27 | 28 | def param_neg(param): 29 | res = param 30 | for i in range(len(param)): 31 | res[i] = np.asarray(np.zeros(param[i].shape), config.floatX)-param[i] 32 | return res 33 | 34 | def randn01(*shape): 35 | """ 36 | generate random vector/matrix from i.i.d. Gaussian. 37 | renormalize it to unit vector/matrix. 
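    the whole array is divided by its norm (Frobenius norm for matrices),
    so the returned array always has unit norm.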
38 | """ 39 | M = npr.randn(*shape) 40 | return M/npla.norm(M) 41 | -------------------------------------------------------------------------------- /va/test_va.py: -------------------------------------------------------------------------------- 1 | import numpy.random as npr 2 | from va import * 3 | import unittest 4 | 5 | class TestDecoder(unittest.TestCase): 6 | 7 | def test_gen(self): 8 | model = Decoder([2, 4]) 9 | xi = npr.randn(4,2) 10 | # print xi 11 | v = np.array([[1, 0], [0, 1]]).T 12 | param = model.pack() 13 | # print v 14 | # print model.sample(xi) 15 | # print model.get_lhood(v, xi) 16 | resp = np.dot(param['W1'], np.dot(param['G'], xi)) + param['b1'] 17 | lhood = (v) * np.log(np.logistic(resp)) + (1-v) * np.log(1-np.logistic(resp)) 18 | # print lhood.sum() 19 | assert(np.abs(lhood.sum() - model.get_lhood(v, xi)) < 1e-4) 20 | 21 | def test_gen_grad(self): 22 | model = Decoder([2, 4]) 23 | xi = npr.randn(4,1) 24 | # print xi 25 | v = np.array([[1, 0]]).T 26 | gradient = model.get_grad(v, xi) 27 | # print gradient 28 | 29 | def test_gen_grad_xi(self): 30 | model = Decoder([2, 4]) 31 | xi = npr.randn(4,1) 32 | v = np.array([[1, 0]]).T 33 | grad_xi = model.get_grad_xi(v, xi) 34 | # print grad_xi 35 | 36 | class TestEncoder(unittest.TestCase): 37 | 38 | def test_reco(self): 39 | model = Encoder([2, 4], sigma=0.1) 40 | v = np.array([[1, 0]]).T 41 | model.sample_eps(v) 42 | 43 | 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /DLGM/test_dlgm.py: -------------------------------------------------------------------------------- 1 | """ 2 | test module for dlgm.py 3 | """ 4 | from dlgm import * 5 | import numpy as np 6 | import numpy.random as npr 7 | import unittest 8 | 9 | class TestGenerativeModel(unittest.TestCase): 10 | 11 | def setUp(me): 12 | pass 13 | 14 | def test_nonlinear(me): 15 | arr = [1, -1, 3, 3, 5, 6] 16 | arch = [1,2] 17 | nn = GenerativeModel(arch) 18 | me.assertEqual(list(nn.nonlinear(arr)), [1,0,3,3,5,6]) 19 | 20 | def test_generate(me): 21 | arch = [1,2] 22 | nn = GenerativeModel(arch) 23 | xi = [npr.randn(i) for i in arch[1:]] 24 | h1 = np.dot(nn.G[1].get_value(), xi[0]) 25 | h0 = np.dot(nn.W[0].get_value(), nn.nonlinear(h1)) + nn.b[0].get_value() 26 | res = nn.generate(*xi) 27 | me.assertEqual(res[0], h0) 28 | assert((res[1] == h1).all()) 29 | 30 | def test_recognition(me): 31 | arch = [5, 10] 32 | nn = RecognitionModel(arch) 33 | v = [1, 0, 0, 1, 1] 34 | z = nn.nonlinear(np.dot(nn.Wv[1].get_value(), v) + nn.bv[1].get_value()) 35 | me.assertEqual(list(nn.get_z(v)[0]), list(z)) 36 | mu = np.dot(nn.Wmu[1].get_value(), z) + nn.bmu[1].get_value() 37 | me.assertEqual(list(nn.get_mu(v)[0]), list(mu)) 38 | d = np.exp(np.dot(nn.Wd[1].get_value(), z) + nn.bd[1].get_value()) 39 | me.assertEqual(list(nn.get_d(v)[0]), list(d)) 40 | u = np.dot(nn.Wu[1].get_value(), z) + nn.bu[1].get_value() 41 | me.assertEqual(list(nn.get_u(v)[0]), list(u)) 42 | 43 | 44 | if __name__ == "__main__": 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /DLGM/color.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class pcolors: 4 | HEADER = '\033[95m' 5 | OKBLUE = '\033[94m' 6 | OKGREEN = '\033[92m' 7 | WARNING = '\033[93m' 8 | FAIL = '\033[91m' 9 | GRAY = '\033[1;30m' 10 | ENDC = '\033[0m' 11 | Red = '\033[91m' 12 | Green = '\033[92m' 13 | Blue = '\033[94m' 14 | Cyan = '\033[96m' 15 
| White = '\033[97m' 16 | Yellow = '\033[93m' 17 | Magenta = '\033[95m' 18 | Grey = '\033[90m' 19 | Black = '\033[90m' 20 | 21 | def printRed(*args): 22 | beginRed() 23 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 24 | end() 25 | 26 | def printBlue(*args): 27 | beginBlue() 28 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 29 | end() 30 | 31 | def printComment(*args): 32 | beginComment() 33 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 34 | end() 35 | 36 | def beginTitle(): 37 | sys.stdout.write(pcolors.OKGREEN) 38 | sys.stdout.flush() 39 | 40 | def beginComment(): 41 | sys.stdout.write(pcolors.GRAY) 42 | sys.stdout.flush() 43 | 44 | def beginError(): 45 | sys.stdout.write(pcolors.FAIL) 46 | sys.stdout.flush() 47 | 48 | def beginRed(): 49 | sys.stdout.write(pcolors.Red) 50 | sys.stdout.flush() 51 | 52 | def beginBlue() : 53 | sys.stdout.write(pcolors.OKBLUE) 54 | sys.stdout.flush() 55 | 56 | def end() : 57 | sys.stdout.write(pcolors.ENDC) 58 | sys.stdout.flush() 59 | 60 | 61 | if __name__ == "__main__": 62 | printComment("hi", 1) 63 | -------------------------------------------------------------------------------- /va/color.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | class pcolors: 4 | HEADER = '\033[95m' 5 | OKBLUE = '\033[94m' 6 | OKGREEN = '\033[92m' 7 | WARNING = '\033[93m' 8 | FAIL = '\033[91m' 9 | GRAY = '\033[1;30m' 10 | ENDC = '\033[0m' 11 | Red = '\033[91m' 12 | Green = '\033[92m' 13 | Blue = '\033[94m' 14 | Cyan = '\033[96m' 15 | White = '\033[97m' 16 | Yellow = '\033[93m' 17 | Magenta = '\033[95m' 18 | Grey = '\033[90m' 19 | Black = '\033[90m' 20 | 21 | def printRed(*args): 22 | beginRed() 23 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 24 | end() 25 | 26 | def printBlue(*args): 27 | beginBlue() 28 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 29 | end() 30 | 31 | def printComment(*args): 32 | beginComment() 33 | sys.stdout.write(' '.join([str(x) for x in args])+'\n') 34 | end() 35 | 36 | def beginTitle(): 37 | sys.stdout.write(pcolors.OKGREEN) 38 | sys.stdout.flush() 39 | 40 | def beginComment(): 41 | sys.stdout.write(pcolors.GRAY) 42 | sys.stdout.flush() 43 | 44 | def beginError(): 45 | sys.stdout.write(pcolors.FAIL) 46 | sys.stdout.flush() 47 | 48 | def beginRed(): 49 | sys.stdout.write(pcolors.Red) 50 | sys.stdout.flush() 51 | 52 | def beginBlue() : 53 | sys.stdout.write(pcolors.OKBLUE) 54 | sys.stdout.flush() 55 | 56 | def end() : 57 | sys.stdout.write(pcolors.ENDC) 58 | sys.stdout.flush() 59 | 60 | 61 | if __name__ == "__main__": 62 | printComment("hi", 1) 63 | -------------------------------------------------------------------------------- /va/pegasos.py: -------------------------------------------------------------------------------- 1 | import scipy.io as sio 2 | import pickle, gzip 3 | import numpy.random as npr 4 | import numpy as np 5 | import pdb 6 | import matplotlib.pyplot as plt 7 | 8 | toFloat = np.vectorize(float) 9 | 10 | mat = sio.loadmat('../data/mnist/mnist.mat') 11 | result = sio.loadmat('result.mat') 12 | # train_data = np.array(mat['trainData']) 13 | train_data = result['xi_train'].T 14 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 15 | # test_data = np.array(mat['testData']) 16 | test_data = result['xi'].T 17 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 18 | 19 | batchsize = 32 20 | num_iter = 10000 21 | D = train_data.shape[1] 22 | eta = 0.01 23 | 24 | W = npr.randn(D, 10) 25 | G2 = 
np.zeros_like(W) 26 | data_mean = np.mean(train_data, axis=0) 27 | train_data -= data_mean 28 | 29 | 30 | def test_acc(): 31 | resp = np.dot(test_data - data_mean, W) 32 | predict = np.argmax(resp, 1) 33 | return np.sum(predict == test_label) / float(len(resp)) 34 | 35 | acc = [] 36 | for it in range(num_iter): 37 | ind = npr.choice(range(len(train_data)), batchsize, replace=False) 38 | g = np.zeros_like(W) 39 | for (x, y) in zip(train_data[ind], train_label[ind]): 40 | resp = 100 + np.dot(x, W) - np.dot(x, W[:,y]) 41 | resp[y] = 0 42 | yp = np.argmax(resp) 43 | g[:,yp] -= x 44 | g[:,y] += x 45 | g /= float(batchsize) 46 | G2 += g * g 47 | W += eta * g / (1e-4 + np.sqrt(G2)) 48 | acc += [test_acc()] 49 | print 'iter = ', it, ' , acc = ', acc[-1] 50 | 51 | # sio.savemat('result.mat', {'W':W, 'acc':acc}) 52 | plt.plot(acc) 53 | plt.xlabel('iteration') 54 | plt.ylabel('accuracy') 55 | plt.savefig('mnist.png') 56 | plt.show() 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /va/test_mnist.py: -------------------------------------------------------------------------------- 1 | from va import * 2 | import numpy as np 3 | import sys, os 4 | import scipy.io as sio 5 | from multiprocessing import Pool 6 | import itertools 7 | import pickle, gzip 8 | 9 | toFloat = np.vectorize(float) 10 | 11 | def universal_worker(input_pair): 12 | function, args = input_pair 13 | return function(*args) 14 | 15 | def pool_args(function, *args): 16 | return zip(itertools.repeat(function), zip(*args)) 17 | 18 | def run(hidden, kappa, sigma, stepsize): 19 | mat = sio.loadmat('../data/mnist/mnistSmall.mat') 20 | train_data = np.array(mat['trainData']) 21 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 22 | test_data = np.array(mat['testData']) 23 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 24 | 25 | output_path = '../result/hidden_%d_kappa_%f_sigma_%f' % (hidden, kappa, sigma) 26 | os.system('mkdir -p ../result/%s' % output_path) 27 | model = DeepLatentGM([784, hidden, hidden], batchsize=128, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 28 | num_label=10) 29 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label, output_path=output_path) 30 | 31 | def run_full(hidden = 50, kappa = 0, sigma = 0, c = 0, stepsize = 0.01): 32 | mat = pickle.load(gzip.open('../data/mnist/mnist.pkl.gz', 'rb')) 33 | train_data = np.array(list(mat[0][0]) + list(mat[1][0])) 34 | train_label = np.array(list(mat[0][1]) + list(mat[1][1])) 35 | test_data = mat[2][0] 36 | test_label = mat[2][1] 37 | 38 | model = AutoEncoder([784, hidden], num_sample=1, batchsize=512, kappa=kappa, sigma=sigma, stepsize=stepsize,\ 39 | num_label=10, c = c, ell=10) 40 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 41 | 42 | def run_tiny(hidden = 50, kappa = 0, sigma = 0, c = 0, stepsize = 0.01): 43 | mat = sio.loadmat('../data/mnist/mnistTiny.mat') 44 | train_data = np.array(toFloat(mat['trainData'])) # binarize. 45 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 46 | test_data = np.array(toFloat(mat['testData'])) # binarize. 
47 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 48 | 49 | model = AutoEncoder([784, hidden], num_sample=1, batchsize=32, kappa=kappa, sigma=sigma, stepsize=stepsize,\ 50 | num_label=10, c = c, ell=10) 51 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 52 | 53 | # run_tiny() 54 | run_full() 55 | 56 | -------------------------------------------------------------------------------- /DLGM/test_mnist.py: -------------------------------------------------------------------------------- 1 | from dlgm import * 2 | import numpy as np 3 | import sys, os 4 | import scipy.io as sio 5 | from multiprocessing import Pool 6 | import itertools 7 | import pickle, gzip 8 | 9 | toFloat = np.vectorize(float) 10 | 11 | def universal_worker(input_pair): 12 | function, args = input_pair 13 | return function(*args) 14 | 15 | def pool_args(function, *args): 16 | return zip(itertools.repeat(function), zip(*args)) 17 | 18 | def run(hidden, kappa, sigma, stepsize): 19 | mat = sio.loadmat('../data/mnist/mnistSmall.mat') 20 | train_data = np.array(mat['trainData']) 21 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 22 | test_data = np.array(mat['testData']) 23 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 24 | 25 | output_path = '../result/hidden_%d_kappa_%f_sigma_%f' % (hidden, kappa, sigma) 26 | os.system('mkdir -p ../result/%s' % output_path) 27 | model = DeepLatentGM([784, hidden, hidden], batchsize=128, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 28 | num_label=10) 29 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label, output_path=output_path) 30 | 31 | def run_full(): 32 | mat = pickle.load(gzip.open('../data/mnist/mnist.pkl.gz', 'rb')) 33 | train_data = np.array(list(mat[0][0]) + list(mat[1][0])) 34 | train_label = np.array(list(mat[0][1]) + list(mat[1][1])) 35 | test_data = mat[2][0] 36 | test_label = mat[2][1] 37 | hidden = 100 38 | sigma = 0.001 39 | kappa = 0.1 40 | stepsize = 0.01 41 | 42 | model = DeepLatentGM([784, hidden, hidden], batchsize=64, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 43 | num_label=10) 44 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 45 | 46 | def run_tiny(): 47 | mat = sio.loadmat('../data/mnist/mnistTiny.mat') 48 | train_data = np.array(toFloat(mat['trainData'] > 0.5)) # binarize. 49 | train_label = np.argmax(np.array(mat['trainLabels']), axis=1) 50 | test_data = np.array(toFloat(mat['testData'] > 0.5)) # binarize. 51 | test_label = np.argmax(np.array(mat['testLabels']), axis=1) 52 | 53 | hidden = 100 54 | sigma = 0.001 55 | kappa = 0.1 56 | stepsize = 0.01 57 | 58 | model = DeepLatentGM([784, hidden, hidden], batchsize=64, kappa=kappa, sigma=sigma, rec_hidden=hidden, stepsize=stepsize,\ 59 | num_label=10) 60 | model.train(train_data, train_label, 500, test_data = test_data, test_label = test_label) 61 | 62 | # run_full() 63 | run_tiny() 64 | 65 | -------------------------------------------------------------------------------- /RBM/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 
7 | """ 8 | 9 | 10 | import numpy 11 | 12 | 13 | def scale_to_unit_interval(ndar, eps=1e-8): 14 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 15 | ndar = ndar.copy() 16 | ndar -= ndar.min() 17 | ndar *= 1.0 / (ndar.max() + eps) 18 | return ndar 19 | 20 | 21 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 22 | scale_rows_to_unit_interval=True, 23 | output_pixel_vals=True): 24 | """ 25 | Transform an array with one flattened image per row, into an array in 26 | which images are reshaped and layed out like tiles on a floor. 27 | 28 | This function is useful for visualizing datasets whose rows are images, 29 | and also columns of matrices for transforming those rows 30 | (such as the first layer of a neural net). 31 | 32 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 33 | be 2-D ndarrays or None; 34 | :param X: a 2-D array in which every row is a flattened image. 35 | 36 | :type img_shape: tuple; (height, width) 37 | :param img_shape: the original shape of each image 38 | 39 | :type tile_shape: tuple; (rows, cols) 40 | :param tile_shape: the number of images to tile (rows, cols) 41 | 42 | :param output_pixel_vals: if output should be pixel values (i.e. int8 43 | values) or floats 44 | 45 | :param scale_rows_to_unit_interval: if the values need to be scaled before 46 | being plotted to [0,1] or not 47 | 48 | 49 | :returns: array suitable for viewing as an image. 50 | (See:`Image.fromarray`.) 51 | :rtype: a 2-d array with same dtype as X. 52 | 53 | """ 54 | 55 | assert len(img_shape) == 2 56 | assert len(tile_shape) == 2 57 | assert len(tile_spacing) == 2 58 | 59 | # The expression below can be re-written in a more C style as 60 | # follows : 61 | # 62 | # out_shape = [0,0] 63 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 64 | # tile_spacing[0] 65 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 66 | # tile_spacing[1] 67 | out_shape = [ 68 | (ishp + tsp) * tshp - tsp 69 | for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing) 70 | ] 71 | 72 | if isinstance(X, tuple): 73 | assert len(X) == 4 74 | # Create an output numpy ndarray to store the image 75 | if output_pixel_vals: 76 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 77 | dtype='uint8') 78 | else: 79 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), 80 | dtype=X.dtype) 81 | 82 | #colors default to 0, alpha defaults to 1 (opaque) 83 | if output_pixel_vals: 84 | channel_defaults = [0, 0, 0, 255] 85 | else: 86 | channel_defaults = [0., 0., 0., 1.] 
87 | 88 | for i in xrange(4): 89 | if X[i] is None: 90 | # if channel is None, fill it with zeros of the correct 91 | # dtype 92 | dt = out_array.dtype 93 | if output_pixel_vals: 94 | dt = 'uint8' 95 | out_array[:, :, i] = numpy.zeros( 96 | out_shape, 97 | dtype=dt 98 | ) + channel_defaults[i] 99 | else: 100 | # use a recurrent call to compute the channel and store it 101 | # in the output 102 | out_array[:, :, i] = tile_raster_images( 103 | X[i], img_shape, tile_shape, tile_spacing, 104 | scale_rows_to_unit_interval, output_pixel_vals) 105 | return out_array 106 | 107 | else: 108 | # if we are dealing with only one channel 109 | H, W = img_shape 110 | Hs, Ws = tile_spacing 111 | 112 | # generate a matrix to store the output 113 | dt = X.dtype 114 | if output_pixel_vals: 115 | dt = 'uint8' 116 | out_array = numpy.zeros(out_shape, dtype=dt) 117 | 118 | for tile_row in xrange(tile_shape[0]): 119 | for tile_col in xrange(tile_shape[1]): 120 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 121 | this_x = X[tile_row * tile_shape[1] + tile_col] 122 | if scale_rows_to_unit_interval: 123 | # if we should scale values to be between 0 and 1 124 | # do this by calling the `scale_to_unit_interval` 125 | # function 126 | this_img = scale_to_unit_interval( 127 | this_x.reshape(img_shape)) 128 | else: 129 | this_img = this_x.reshape(img_shape) 130 | # add the slice to the corresponding position in the 131 | # output array 132 | c = 1 133 | if output_pixel_vals: 134 | c = 255 135 | out_array[ 136 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 137 | tile_col * (W + Ws): tile_col * (W + Ws) + W 138 | ] = this_img * c 139 | return out_array 140 | -------------------------------------------------------------------------------- /RBM/logistic_sgd.py: -------------------------------------------------------------------------------- 1 | """ 2 | This tutorial introduces logistic regression using Theano and stochastic 3 | gradient descent. 4 | 5 | Logistic regression is a probabilistic, linear classifier. It is parametrized 6 | by a weight matrix :math:`W` and a bias vector :math:`b`. Classification is 7 | done by projecting data points onto a set of hyperplanes, the distance to 8 | which is used to determine a class membership probability. 9 | 10 | Mathematically, this can be written as: 11 | 12 | .. math:: 13 | P(Y=i|x, W,b) &= softmax_i(W x + b) \\ 14 | &= \frac {e^{W_i x + b_i}} {\sum_j e^{W_j x + b_j}} 15 | 16 | 17 | The output of the model or prediction is then done by taking the argmax of 18 | the vector whose i'th element is P(Y=i|x). 19 | 20 | .. math:: 21 | 22 | y_{pred} = argmax_i P(Y=i|x,W,b) 23 | 24 | 25 | This tutorial presents a stochastic gradient descent optimization method 26 | suitable for large datasets. 27 | 28 | 29 | References: 30 | 31 | - textbooks: "Pattern Recognition and Machine Learning" - 32 | Christopher M. Bishop, section 4.3.2 33 | 34 | """ 35 | __docformat__ = 'restructedtext en' 36 | 37 | import cPickle 38 | import gzip 39 | import os 40 | import sys 41 | import time 42 | 43 | import numpy 44 | 45 | import theano 46 | import theano.tensor as T 47 | 48 | 49 | class LogisticRegression(object): 50 | """Multi-class Logistic Regression Class 51 | 52 | The logistic regression is fully described by a weight matrix :math:`W` 53 | and bias vector :math:`b`. Classification is done by projecting data 54 | points onto a set of hyperplanes, the distance to which is used to 55 | determine a class membership probability. 
56 | """ 57 | 58 | def __init__(self, input, n_in, n_out): 59 | """ Initialize the parameters of the logistic regression 60 | 61 | :type input: theano.tensor.TensorType 62 | :param input: symbolic variable that describes the input of the 63 | architecture (one minibatch) 64 | 65 | :type n_in: int 66 | :param n_in: number of input units, the dimension of the space in 67 | which the datapoints lie 68 | 69 | :type n_out: int 70 | :param n_out: number of output units, the dimension of the space in 71 | which the labels lie 72 | 73 | """ 74 | # start-snippet-1 75 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 76 | self.W = theano.shared( 77 | value=numpy.zeros( 78 | (n_in, n_out), 79 | dtype=theano.config.floatX 80 | ), 81 | name='W', 82 | borrow=True 83 | ) 84 | # initialize the baises b as a vector of n_out 0s 85 | self.b = theano.shared( 86 | value=numpy.zeros( 87 | (n_out,), 88 | dtype=theano.config.floatX 89 | ), 90 | name='b', 91 | borrow=True 92 | ) 93 | 94 | # symbolic expression for computing the matrix of class-membership 95 | # probabilities 96 | # Where: 97 | # W is a matrix where column-k represent the separation hyper plain for 98 | # class-k 99 | # x is a matrix where row-j represents input training sample-j 100 | # b is a vector where element-k represent the free parameter of hyper 101 | # plain-k 102 | self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) 103 | 104 | # symbolic description of how to compute prediction as class whose 105 | # probability is maximal 106 | self.y_pred = T.argmax(self.p_y_given_x, axis=1) 107 | # end-snippet-1 108 | 109 | # parameters of the model 110 | self.params = [self.W, self.b] 111 | 112 | def negative_log_likelihood(self, y): 113 | """Return the mean of the negative log-likelihood of the prediction 114 | of this model under a given target distribution. 115 | 116 | .. math:: 117 | 118 | \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = 119 | \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} 120 | \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ 121 | \ell (\theta=\{W,b\}, \mathcal{D}) 122 | 123 | :type y: theano.tensor.TensorType 124 | :param y: corresponds to a vector that gives for each example the 125 | correct label 126 | 127 | Note: we use the mean instead of the sum so that 128 | the learning rate is less dependent on the batch size 129 | """ 130 | # start-snippet-2 131 | # y.shape[0] is (symbolically) the number of rows in y, i.e., 132 | # number of examples (call it n) in the minibatch 133 | # T.arange(y.shape[0]) is a symbolic vector which will contain 134 | # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of 135 | # Log-Probabilities (call it LP) with one row per example and 136 | # one column per class LP[T.arange(y.shape[0]),y] is a vector 137 | # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ..., 138 | # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is 139 | # the mean (across minibatch examples) of the elements in v, 140 | # i.e., the mean log-likelihood across the minibatch. 
141 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 142 | # end-snippet-2 143 | 144 | def errors(self, y): 145 | """Return a float representing the number of errors in the minibatch 146 | over the total number of examples of the minibatch ; zero one 147 | loss over the size of the minibatch 148 | 149 | :type y: theano.tensor.TensorType 150 | :param y: corresponds to a vector that gives for each example the 151 | correct label 152 | """ 153 | 154 | # check if y has same dimension of y_pred 155 | if y.ndim != self.y_pred.ndim: 156 | raise TypeError( 157 | 'y should have the same shape as self.y_pred', 158 | ('y', y.type, 'y_pred', self.y_pred.type) 159 | ) 160 | # check if y is of the correct datatype 161 | if y.dtype.startswith('int'): 162 | # the T.neq operator returns a vector of 0s and 1s, where 1 163 | # represents a mistake in prediction 164 | return T.mean(T.neq(self.y_pred, y)) 165 | else: 166 | raise NotImplementedError() 167 | 168 | 169 | def load_data(dataset, size=-1): 170 | ''' Loads the dataset 171 | 172 | :type dataset: string 173 | :param dataset: the path to the dataset (here MNIST) 174 | ''' 175 | 176 | ############# 177 | # LOAD DATA # 178 | ############# 179 | 180 | # Download the MNIST dataset if it is not present 181 | data_dir, data_file = os.path.split(dataset) 182 | if data_dir == "" and not os.path.isfile(dataset): 183 | # Check if dataset is in the data directory. 184 | new_path = os.path.join( 185 | os.path.split(__file__)[0], 186 | "..", 187 | "data", 188 | dataset 189 | ) 190 | if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': 191 | dataset = new_path 192 | 193 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 194 | import urllib 195 | origin = ( 196 | 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 197 | ) 198 | print 'Downloading data from %s' % origin 199 | urllib.urlretrieve(origin, dataset) 200 | 201 | print '... loading data' 202 | 203 | # Load the dataset 204 | f = gzip.open(dataset, 'rb') 205 | train_set, valid_set, test_set = cPickle.load(f) 206 | def truncate(dt, size): 207 | return (dt[0][:size], dt[1][:size]) 208 | train_set = truncate(train_set, size) 209 | valid_set = truncate(valid_set, size) 210 | test_set = truncate(test_set, size) 211 | 212 | f.close() 213 | #train_set, valid_set, test_set format: tuple(input, target) 214 | #input is an numpy.ndarray of 2 dimensions (a matrix) 215 | #witch row's correspond to an example. target is a 216 | #numpy.ndarray of 1 dimensions (vector)) that have the same length as 217 | #the number of rows in the input. It should give the target 218 | #target to the example with the same index in the input. 219 | 220 | def shared_dataset(data_xy, borrow=True): 221 | """ Function that loads the dataset into shared variables 222 | 223 | The reason we store our dataset in shared variables is to allow 224 | Theano to copy it into the GPU memory (when code is run on GPU). 225 | Since copying data into the GPU is slow, copying a minibatch everytime 226 | is needed (the default behaviour if the data is not in a shared 227 | variable) would lead to a large decrease in performance. 
228 | """ 229 | data_x, data_y = data_xy 230 | shared_x = theano.shared(numpy.asarray(data_x, 231 | dtype=theano.config.floatX), 232 | borrow=borrow) 233 | shared_y = theano.shared(numpy.asarray(data_y, 234 | dtype=theano.config.floatX), 235 | borrow=borrow) 236 | # When storing data on the GPU it has to be stored as floats 237 | # therefore we will store the labels as ``floatX`` as well 238 | # (``shared_y`` does exactly that). But during our computations 239 | # we need them as ints (we use labels as index, and if they are 240 | # floats it doesn't make sense) therefore instead of returning 241 | # ``shared_y`` we will have to cast it to int. This little hack 242 | # lets ous get around this issue 243 | return shared_x, shared_y 244 | 245 | test_set_x, test_set_y = shared_dataset(test_set) 246 | valid_set_x, valid_set_y = shared_dataset(valid_set) 247 | train_set_x, train_set_y = shared_dataset(train_set) 248 | 249 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 250 | (test_set_x, test_set_y)] 251 | return rval 252 | 253 | 254 | def sgd_optimization_mnist(learning_rate=0.13, n_epochs=1000, 255 | dataset='mnist.pkl.gz', 256 | batch_size=600): 257 | """ 258 | Demonstrate stochastic gradient descent optimization of a log-linear 259 | model 260 | 261 | This is demonstrated on MNIST. 262 | 263 | :type learning_rate: float 264 | :param learning_rate: learning rate used (factor for the stochastic 265 | gradient) 266 | 267 | :type n_epochs: int 268 | :param n_epochs: maximal number of epochs to run the optimizer 269 | 270 | :type dataset: string 271 | :param dataset: the path of the MNIST dataset file from 272 | http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz 273 | 274 | """ 275 | datasets = load_data(dataset) 276 | 277 | train_set_x, train_set_y = datasets[0] 278 | valid_set_x, valid_set_y = datasets[1] 279 | test_set_x, test_set_y = datasets[2] 280 | 281 | 282 | # compute number of minibatches for training, validation and testing 283 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size 284 | n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size 285 | n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size 286 | 287 | ###################### 288 | # BUILD ACTUAL MODEL # 289 | ###################### 290 | print '... 
building the model' 291 | 292 | # allocate symbolic variables for the data 293 | index = T.lscalar() # index to a [mini]batch 294 | 295 | # generate symbolic variables for input (x and y represent a 296 | # minibatch) 297 | x = T.matrix('x') # data, presented as rasterized images 298 | y = T.ivector('y') # labels, presented as 1D vector of [int] labels 299 | 300 | # construct the logistic regression class 301 | # Each MNIST image has size 28*28 302 | classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10) 303 | 304 | # the cost we minimize during training is the negative log likelihood of 305 | # the model in symbolic format 306 | cost = classifier.negative_log_likelihood(y) 307 | 308 | # compiling a Theano function that computes the mistakes that are made by 309 | # the model on a minibatch 310 | test_model = theano.function( 311 | inputs=[index], 312 | outputs=classifier.errors(y), 313 | givens={ 314 | x: test_set_x[index * batch_size: (index + 1) * batch_size], 315 | y: test_set_y[index * batch_size: (index + 1) * batch_size] 316 | } 317 | ) 318 | 319 | validate_model = theano.function( 320 | inputs=[index], 321 | outputs=classifier.errors(y), 322 | givens={ 323 | x: valid_set_x[index * batch_size: (index + 1) * batch_size], 324 | y: valid_set_y[index * batch_size: (index + 1) * batch_size] 325 | } 326 | ) 327 | 328 | # compute the gradient of cost with respect to theta = (W,b) 329 | g_W = T.grad(cost=cost, wrt=classifier.W) 330 | g_b = T.grad(cost=cost, wrt=classifier.b) 331 | 332 | # start-snippet-3 333 | # specify how to update the parameters of the model as a list of 334 | # (variable, update expression) pairs. 335 | updates = [(classifier.W, classifier.W - learning_rate * g_W), 336 | (classifier.b, classifier.b - learning_rate * g_b)] 337 | 338 | # compiling a Theano function `train_model` that returns the cost, but in 339 | # the same time updates the parameter of the model based on the rules 340 | # defined in `updates` 341 | train_model = theano.function( 342 | inputs=[index], 343 | outputs=cost, 344 | updates=updates, 345 | givens={ 346 | x: train_set_x[index * batch_size: (index + 1) * batch_size], 347 | y: train_set_y[index * batch_size: (index + 1) * batch_size] 348 | } 349 | ) 350 | # end-snippet-3 351 | 352 | ############### 353 | # TRAIN MODEL # 354 | ############### 355 | print '... training the model' 356 | # early-stopping parameters 357 | patience = 5000 # look as this many examples regardless 358 | patience_increase = 2 # wait this much longer when a new best is 359 | # found 360 | improvement_threshold = 0.995 # a relative improvement of this much is 361 | # considered significant 362 | validation_frequency = min(n_train_batches, patience / 2) 363 | # go through this many 364 | # minibatche before checking the network 365 | # on the validation set; in this case we 366 | # check every epoch 367 | 368 | best_validation_loss = numpy.inf 369 | test_score = 0. 
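    # The loop below implements the early stopping set up above: after every
    # `validation_frequency` minibatches the zero-one loss on the validation
    # set is computed; when it beats the best loss so far by more than the
    # relative `improvement_threshold`, `patience` is extended
    # (iter * patience_increase), and training stops once the iteration count
    # reaches `patience` or after `n_epochs` passes.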
370 | start_time = time.clock() 371 | 372 | done_looping = False 373 | epoch = 0 374 | while (epoch < n_epochs) and (not done_looping): 375 | epoch = epoch + 1 376 | for minibatch_index in xrange(n_train_batches): 377 | 378 | minibatch_avg_cost = train_model(minibatch_index) 379 | # iteration number 380 | iter = (epoch - 1) * n_train_batches + minibatch_index 381 | 382 | if (iter + 1) % validation_frequency == 0: 383 | # compute zero-one loss on validation set 384 | validation_losses = [validate_model(i) 385 | for i in xrange(n_valid_batches)] 386 | this_validation_loss = numpy.mean(validation_losses) 387 | 388 | print( 389 | 'epoch %i, minibatch %i/%i, validation error %f %%' % 390 | ( 391 | epoch, 392 | minibatch_index + 1, 393 | n_train_batches, 394 | this_validation_loss * 100. 395 | ) 396 | ) 397 | 398 | # if we got the best validation score until now 399 | if this_validation_loss < best_validation_loss: 400 | #improve patience if loss improvement is good enough 401 | if this_validation_loss < best_validation_loss * \ 402 | improvement_threshold: 403 | patience = max(patience, iter * patience_increase) 404 | 405 | best_validation_loss = this_validation_loss 406 | # test it on the test set 407 | 408 | test_losses = [test_model(i) 409 | for i in xrange(n_test_batches)] 410 | test_score = numpy.mean(test_losses) 411 | 412 | print( 413 | ( 414 | ' epoch %i, minibatch %i/%i, test error of' 415 | ' best model %f %%' 416 | ) % 417 | ( 418 | epoch, 419 | minibatch_index + 1, 420 | n_train_batches, 421 | test_score * 100. 422 | ) 423 | ) 424 | 425 | if patience <= iter: 426 | done_looping = True 427 | break 428 | 429 | end_time = time.clock() 430 | print( 431 | ( 432 | 'Optimization complete with best validation score of %f %%,' 433 | 'with test performance %f %%' 434 | ) 435 | % (best_validation_loss * 100., test_score * 100.) 436 | ) 437 | print 'The code run for %d epochs, with %f epochs/sec' % ( 438 | epoch, 1. * epoch / (end_time - start_time)) 439 | print >> sys.stderr, ('The code for file ' + 440 | os.path.split(__file__)[1] + 441 | ' ran for %.1fs' % ((end_time - start_time))) 442 | 443 | if __name__ == '__main__': 444 | sgd_optimization_mnist() 445 | -------------------------------------------------------------------------------- /va/va.py: -------------------------------------------------------------------------------- 1 | """ 2 | implements the models in paper "stochastic backpropagation in DLGMs" 3 | including 4 | * generative model. 5 | * recognition model. 6 | """ 7 | "let client and server have the same imports." 8 | imports = ['import numpy as np', 9 | 'import numpy.random as npr', 10 | 'import theano', 11 | 'import sys, os', 12 | 'import scipy.io as sio', 13 | 'import theano.sandbox.linalg as ta', 14 | 'from theano.tensor.shared_randomstreams import RandomStreams as trng', 15 | 'import theano.tensor as ts', 16 | 'from color import *', 17 | 'from utils import *'] 18 | for _import in imports: 19 | exec _import 20 | 21 | import pdb 22 | import time 23 | from IPython.parallel import Client 24 | 25 | theano.config.exception_verbosity = 'low' 26 | 27 | ts.logistic = lambda z: 1 / (1 + ts.exp(-z)) 28 | np.logistic = lambda z: 1 / (1 + np.exp(-z)) 29 | toInt = np.vectorize(int) 30 | toStr = np.vectorize(str) 31 | 32 | get_value = lambda x: x.get_value() if x != None else None 33 | get_value_all = lambda xs: [get_value(x) for x in xs] 34 | 35 | nonlinear_f = lambda x : ts.log(1+ts.exp(x)) # smooth ReLU. 
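# log(1 + exp(x)) is the softplus, a smooth approximation of the ReLU whose
# derivative is the logistic function defined above; the 'nonlinear'
# environment variable is checked below for a ReLU or tanh variant.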
36 | nonlinear_s = "smooth ReLU" 37 | if os.environ.has_key('nonlinear'): 38 | nonlinear_s = os.environ['nonlinear'] 39 | if nonlinear_s == "ReLU": 40 | f = lambda x : ts.maximum(0, x) # ReLU. 41 | if nonlinear_s == "tanh": 42 | f = lambda x : ts.tanh(x) 43 | 44 | def AdaGRAD(param, grad, G2, stepsize): 45 | """ 46 | adaptive sub-gradient algorithm for tensor-shared objects. 47 | > input: 48 | param: parameters, tensor-shared objects. 49 | grad: gradient, list of numpy arrays. 50 | G2: variance of gradient, list of numpy arrays 51 | """ 52 | for (p, g, g2) in zip(param, grad, G2): 53 | g2[:] = (g2 + g * g)[:] 54 | if type(p) == theano.tensor.sharedvar.TensorSharedVariable: 55 | p.set_value(p.get_value() - stepsize * g / (1e-8 + np.sqrt(g2))) 56 | elif type(p) == np.ndarray: 57 | p[:] = p[:] - stepsize * g[:] / (1e-8 + np.sqrt(g2[:])) 58 | 59 | class Decoder: 60 | """ generative model 61 | """ 62 | def __init__(me, arch, kappa=0.1): 63 | """ 64 | create the variational decoder. 65 | arch: architecture, [vis, hidden] 66 | """ 67 | "set options." 68 | me.f = ts.maximum # nonlinear transformation. 69 | 70 | "set properties." 71 | me.arch = arch 72 | me.num_layers = len(arch) 73 | me.kappa = kappa 74 | assert(me.num_layers > 1) 75 | 76 | "init layers." 77 | me.G = theano.shared(np.eye(arch[1]), name="G") 78 | me.xi = ts.matrix("xi") 79 | me.h = ts.dot(me.G, me.xi) 80 | 81 | def one_layer_logistic(h): 82 | """ 83 | one layer network for decoding 84 | """ 85 | W1 = theano.shared(randn01(arch[0], arch[1]), name="W1") 86 | b1 = theano.shared(np.zeros((arch[0], 1)), name="b1", broadcastable=(False,True)) 87 | resp = ts.dot(W1, h) + b1 88 | return ( resp, 89 | [W1, b1], 90 | lambda v : (v * ts.log(ts.logistic(resp)) + (1-v) * ts.log(1-ts.logistic(resp))).sum() 91 | ) 92 | 93 | def two_layer_logistic(h): 94 | """ 95 | two-layer network for decoding 96 | """ 97 | hidden = 100 98 | W1 = theano.shared(randn01(hidden, arch[1]), name="W1") 99 | b1 = theano.shared(np.zeros((hidden, 1)), name="b1", broadcastable=(False,True)) 100 | W2 = theano.shared(randn01(arch[0], hidden), name="W2") 101 | b2 = theano.shared(np.zeros((arch[0], 1)), name="b2", broadcastable=(False,True)) 102 | u = nonlinear_f(ts.dot(W1, h) + b1) 103 | resp = ts.dot(W2, u) + b2 104 | return ( resp, 105 | [W1, b1, W2, b2], 106 | lambda v : (v * ts.log(ts.logistic(resp)) + (1-v) * ts.log(1-ts.logistic(resp))).sum(), 107 | ) 108 | 109 | (me.resp, me.param, me.lhoodFunc) = one_layer_logistic(me.h) 110 | #(me.resp, me.param, me.lhoodFunc) = two_layer_logistic(me.h) 111 | 112 | me.param += [me.G] 113 | me.G2 = [np.zeros(x.get_value().shape) for x in me.param] # variance of gradient. 114 | 115 | "define objective." 116 | me.v = ts.matrix("v") 117 | me.lhood = me.lhoodFunc(me.v) 118 | me.get_lhood = theano.function([me.v, me.xi], me.lhood) 119 | me.reg = me.kappa * sum([ts.sum(p * p) for p in me.param]) 120 | me.get_reg = theano.function([], me.reg) 121 | 122 | "define gradient." 123 | me.gradient = ts.grad(me.lhood, me.param) 124 | me.gradient_xi = ts.grad(me.lhood, me.xi) 125 | # me.hessian_xi = ts.hessian(me.lhood, me.xi[1:]) 126 | me.get_grad = theano.function([me.v, me.xi], me.gradient) 127 | me.get_grad_xi = theano.function([me.v, me.xi], me.gradient_xi) 128 | # me.get_hess_xi = theano.function([me.v] + me.xi[1:], me.hessian_xi) 129 | me.gradient_reg = ts.grad(me.reg, me.param) 130 | me.get_grad_reg = theano.function([], me.gradient_reg) 131 | 132 | "define utils." 
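        # me.generate maps a latent batch xi (arch[1] x N) to the pre-sigmoid
        # response of the visible units; activate() pushes it through the
        # logistic, sample() draws Bernoulli pixels from that activation, and
        # reconstruct() thresholds it at 0.5.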
133 | me.generate = theano.function([me.xi], me.resp) 134 | 135 | def sample(me, xi): 136 | resp = me.activate(xi) 137 | return toInt(npr.rand(*resp.shape) < resp) 138 | 139 | def reconstruct(me, xi): 140 | resp = me.activate(xi) 141 | return toInt(np.ones(resp.shape) * 0.5 < resp) 142 | 143 | def activate(me, xi): 144 | resp = me.generate(xi) 145 | return np.logistic(resp) 146 | 147 | 148 | def pack(me): 149 | param = dict() 150 | for p in me.param: 151 | param.update({str(p): p.get_value()}) 152 | return param 153 | 154 | class Encoder: 155 | """ recognition model (encoder) 156 | since xi \sim \Normal(\mu, C) for each layer. 157 | the recognition fits its parameters (\mu, C) discriminatively. 158 | 159 | a simple recognition model uses a two layer NN to fit each parameter. 160 | see DLGM appendix A. 161 | """ 162 | def __init__(me, arch, sigma=1): 163 | """ 164 | create the deep latent Gaussian recognition model. 165 | arch: architecture, [vis, hidden_1, hidden_2, ...] 166 | """ 167 | "set options." 168 | me.f = ts.maximum # nonlinear transformation. 169 | 170 | "set properties." 171 | me.arch = arch 172 | me.num_layers = len(arch) 173 | me.sigma = sigma 174 | assert(me.num_layers > 1) 175 | 176 | "init layers." 177 | me.v = ts.matrix("v") 178 | 179 | def two_layer_recognition(v): 180 | num_hidden = 4 * me.arch[1] 181 | Wv = theano.shared(randn01(num_hidden, arch[0]), name="Wv") 182 | bv = theano.shared(np.zeros((num_hidden, 1)), name="bv", broadcastable=(False,True)) 183 | Wmu = theano.shared(randn01(arch[1], num_hidden), name="Wmu") 184 | bmu = theano.shared(np.zeros((arch[1], 1)), name="bmu", broadcastable=(False, True)) 185 | Wd = theano.shared(randn01(arch[1], num_hidden), name="Wd") 186 | bd = theano.shared(np.zeros((arch[1], 1)), name="bd", broadcastable=(False, True)) 187 | z = nonlinear_f(ts.dot(Wv, v) + bv) 188 | d = ts.exp(ts.dot(Wd, z) + bd) 189 | mu = ts.dot(Wmu, z) + bmu 190 | xs = ts.matrix('x') 191 | return (mu, 192 | d, 193 | [Wv, bv, Wmu, bmu, Wd, bd], 194 | theano.function([v, xs], mu + 1/ts.sqrt(d) * xs), 195 | theano.function([v], z) 196 | ) 197 | 198 | me.sample_eps = lambda V: npr.normal(0, 1, (arch[1], V.shape[1])) 199 | (me.mu, me.d, me.param, me.sample, me.propup) = two_layer_recognition(me.v) 200 | me.G2 = [np.zeros(x.get_value().shape) for x in me.param] # variance of gradient. 201 | me.get_mu = theano.function([me.v], me.mu) 202 | me.get_d = theano.function([me.v], me.d) 203 | 204 | "free energy and gradients." 205 | me.energy = 0; 206 | for layer in range(1, me.num_layers): 207 | me.energy += .5 * me.sigma * ((me.mu * me.mu).sum() + ts.sum(1/me.d) + ts.sum(ts.log(me.d))) 208 | me.get_energy = theano.function([me.v], me.energy) 209 | me.gradient = ts.grad(me.energy, me.param) 210 | me.get_grad = theano.function([me.v], me.gradient) 211 | 212 | """ stochastic gradients. 213 | trick. pretend our objective is inner product with the stochastic gradients. 214 | """ 215 | me.grad_gm = ts.matrix('grad_gm') 216 | me.eps = ts.matrix('eps') 217 | me.obj_mu = -ts.sum(me.mu * me.grad_gm) 218 | me.obj_R = -.5 * ts.sum(me.grad_gm * me.eps * 1/ts.sqrt(me.d)) 219 | me.stoc_grad = ts.grad(me.obj_mu + me.obj_R, me.param) 220 | me.get_stoc_grad = theano.function([me.v] + [me.grad_gm] + [me.eps], me.stoc_grad) 221 | 222 | def pack(me): 223 | param = dict() 224 | for p in me.param: 225 | param.update({str(p): p.get_value()}) 226 | return param 227 | 228 | 229 | "parallel, if server is reachable; otherwise, use map." 
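# If an IPython.parallel controller is reachable (e.g. one started with
# `ipcluster start`), Client() succeeds, the imports above are replayed on
# every engine, and the load-balanced view's map spreads work over engines;
# otherwise the except branch falls back to the builtin map so training runs
# in a single process.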
230 | try: 231 | rc = Client() 232 | num_threads = len(rc) 233 | rc[:].use_dill() 234 | for _import in imports: 235 | rc[:].execute(_import) 236 | view = rc.load_balanced_view() 237 | view.block = True 238 | mapf = view.map 239 | except: 240 | "cannot connect to parallel server." 241 | num_threads = 1 242 | mapf = map 243 | 244 | class AutoEncoder(object): 245 | """ 246 | train/test DLGM on datasets. 247 | """ 248 | def __init__(me, arch, batchsize = 1, num_sample = 1, kappa = 1, sigma = 1, 249 | stepsize=0.1, num_label=2, ell=10, c = 1, v = 1): 250 | 251 | if os.environ.has_key('hidden'): 252 | arch[1] = int(os.environ['hidden']) 253 | me.num_threads = num_threads 254 | printBlue('> Thread Pool (%d)' % me.num_threads) 255 | me.arch = arch 256 | me.kappa = kappa 257 | me.sigma = sigma 258 | me.batchsize = batchsize 259 | me.stepsize = stepsize 260 | me.num_sample = num_sample 261 | 262 | me.ell = ell 263 | me.c = c 264 | me.num_label = num_label 265 | 266 | if os.environ.has_key('ell'): 267 | me.ell = float(os.environ['ell']) 268 | if os.environ.has_key('c'): 269 | me.c = float(os.environ['c']) 270 | if os.environ.has_key('kappa'): 271 | me.kappa = float(os.environ['kappa']) 272 | if os.environ.has_key('sigma'): 273 | me.sigma = float(os.environ['sigma']) 274 | if os.environ.has_key('stepsize'): 275 | me.stepsize = float(os.environ['stepsize']) 276 | me.stepsize_w = me.stepsize 277 | if os.environ.has_key('stepsize_w'): 278 | me.stepsize_w = float(os.environ['stepsize_w']) 279 | if os.environ.has_key('output'): 280 | me.output_path = os.environ['output'] 281 | else: 282 | me.output_path = 'default' 283 | print 'ell = ', me.ell, 'c = ', me.c, 'sigma = ', me.sigma, 'kappa = ', me.kappa, \ 284 | 'stepsize = ', me.stepsize, 'arch = ', me.arch 285 | print 'nonlinear_f = ', nonlinear_s 286 | 287 | printBlue('> Compiling neural network') 288 | me.W = np.zeros((4 * arch[1] + 1, me.num_label)) 289 | me.W_G2 = np.zeros_like(me.W) 290 | me.gmodel = Decoder(me.arch, kappa=me.kappa) 291 | me.rmodel = Encoder(me.arch, sigma=me.sigma) 292 | 293 | 294 | def __concat__(me, xi): 295 | latent = [1] 296 | latent += list(xi) 297 | latent = np.array(latent) 298 | return latent 299 | 300 | def process(me, ti, V, Y = []): 301 | """ 302 | process one single data point. 303 | > return: (grad of generative model, grad of recognition model) 304 | > input 305 | ti: thread id. 306 | v: data point. 307 | """ 308 | rmodel = me.rmodel 309 | gmodel = me.gmodel 310 | 311 | grad_g = [] 312 | grad_r = [] 313 | grad_w = np.zeros_like(me.W) 314 | 315 | for si in range(me.num_sample): 316 | "first sample stochastic variables." 317 | "eps is randomness for recognition model, xi is randomness for generative model" 318 | eps = rmodel.sample_eps(V) 319 | xi = rmodel.sample(V, eps) 320 | 321 | "compute gradient of generative model." 322 | gg = gmodel.get_grad(V, xi) 323 | gg = param_neg(gg) 324 | grad_g = param_add(grad_g, gg) 325 | 326 | "compute gradient of regularizer in generative model." 327 | gg_reg = gmodel.get_grad_reg() 328 | grad_g = param_add(grad_g, gg_reg) 329 | 330 | "compute free-energy gradient of recognition model." 331 | gr = rmodel.get_grad(V) 332 | grad_r = param_add(grad_r, gr) 333 | 334 | "compute stochastic gradient of recognition model." 
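        # gg_xi below is the gradient of the decoder log-likelihood with
        # respect to the latent sample xi; get_stoc_grad backpropagates it
        # through the encoder's mu and 1/sqrt(d) (via the surrogate objectives
        # obj_mu and obj_R), which gives the stochastic-backpropagation
        # estimator for the recognition parameters.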
335 | gg_xi = gmodel.get_grad_xi(V, xi) 336 | 337 | "add supervision" 338 | if Y != []: 339 | # latents = rmodel.get_mu(V) 340 | # latents = xi 341 | latents = rmodel.propup(V) 342 | for (ni, (y, latent)) in enumerate(zip(Y, latents.T)): 343 | latent = me.__concat__(latent) 344 | resp = me.ell + np.dot(latent, me.W) - np.dot(latent, me.W[:,y]) 345 | resp[y] = 0 346 | yp = np.argmax(resp) 347 | grad_w[:,yp] += latent 348 | grad_w[:,y] -= latent 349 | 350 | # gg_xi[:, ni] -= me.c * (me.W[1:, yp] - me.W[1:, y]) 351 | 352 | gr_stoc = rmodel.get_stoc_grad(V, gg_xi, eps) 353 | grad_r = param_add(grad_r, gr_stoc) 354 | 355 | grad_g = param_mul_scalar(grad_g, 1.0/me.num_sample) 356 | grad_r = param_mul_scalar(grad_r, 1.0/me.num_sample) 357 | grad_w /= me.num_sample 358 | 359 | return (grad_g, grad_r, grad_w) 360 | 361 | def neg_lhood(me, data): 362 | eps = me.rmodel.sample_eps(data.T) 363 | xi = me.rmodel.sample(data.T, eps) 364 | nlh = -me.gmodel.get_lhood(data.T, xi) 365 | return nlh 366 | 367 | def test(me, data, label): 368 | predict = [] 369 | acc = 0 370 | # eps = me.rmodel.sample_eps(data.T) 371 | # xi = me.rmodel.sample(data.T, eps).T # use posterior mean to make predictions. 372 | # xi = me.rmodel.get_mu(data.T).T 373 | xi = me.rmodel.propup(data.T).T 374 | for (v, lb, x) in zip(data, label, xi): 375 | # eps = me.rmodel.sample_eps(v) 376 | # xi = me.rmodel.sample(v, eps) 377 | latent = me.__concat__(x) 378 | resp = np.dot(latent, me.W) 379 | yp = np.argmax(resp) 380 | predict += [yp] 381 | if yp == lb: 382 | acc += 1 383 | acc /= float(len(data)) 384 | return (predict, acc) 385 | 386 | def reconstruct(me, data): 387 | eps = me.rmodel.sample_eps(data.T) 388 | xi = me.rmodel.sample(data.T, eps) 389 | recon = me.gmodel.activate(xi).T 390 | return (recon, xi) 391 | 392 | def train(me, data, label, num_iter, test_data = [], test_label = []): 393 | """ 394 | start the training algorithm. 395 | > input 396 | data: N x D data matrix, each row is a data of dimension D. 
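            label: length-N vector of integer class labels, used by the max-margin term.
            num_iter: number of passes (epochs) over the training data.
            test_data, test_label: optional held-out set; when given, accuracy,
                reconstruction error and negative likelihood are logged every
                LAG epochs and saved under ../result/<me.output_path>/.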
397 | """ 398 | printBlue('> Start training neural nets') 399 | 400 | os.system('mkdir -p ../result/%s' % me.output_path) 401 | 402 | data = np.array(data) 403 | lhood = [] 404 | test_lhood = [] 405 | recon_err = [] 406 | test_recon_err = [] 407 | train_recon_err = [] 408 | accuracy = [] 409 | 410 | LAG = 10 411 | ta = time.time() 412 | for it in range(num_iter): 413 | allind = set(range(data.shape[0])) 414 | while len(allind) >= me.batchsize: 415 | "extract mini-batch" 416 | ind = npr.choice(list(allind), me.batchsize, replace=False) 417 | allind -= set(ind) 418 | V = data[ind, :].T 419 | Y = label[ind] 420 | 421 | "compute gradients" 422 | result = mapf(me.process, [0], [V], [Y]) 423 | 424 | grad_r = [] 425 | grad_g = [] 426 | grad_w = np.zeros_like(me.W) 427 | 428 | for (ti, res) in enumerate(result): 429 | grad_g = param_add(grad_g, res[0]) 430 | grad_r = param_add(grad_r, res[1]) 431 | grad_w += res[2] 432 | 433 | grad_g = param_mul_scalar(grad_g, 1.0/len(V)); 434 | grad_r = param_mul_scalar(grad_r, 1.0/len(V)); 435 | grad_w /= len(V) 436 | 437 | 438 | "aggregate gradients" 439 | AdaGRAD(me.gmodel.param, grad_g, me.gmodel.G2, me.stepsize) 440 | AdaGRAD(me.rmodel.param, grad_r, me.rmodel.G2, me.stepsize) 441 | AdaGRAD([me.W], [grad_w], [me.W_G2], me.stepsize_w) 442 | 443 | "evaluate" 444 | if test_data != [] and (it+1) % LAG == 0: 445 | tb = time.time() 446 | [predict, acc] = me.test(test_data, test_label) 447 | accuracy += [acc] 448 | # print '\tGenerative Model', me.gmodel.pack() 449 | # print '\tRecognition Model', me.rmodel.pack() 450 | (recon, xi) = me.reconstruct(test_data) 451 | recon_err += [np.abs(recon - test_data).sum() / float(test_data.shape[0]) / float(test_data.shape[1])] 452 | 453 | test_lhood += [me.neg_lhood(test_data)] 454 | lhood += [me.neg_lhood(data)] 455 | 456 | (recon_train, xi_train) = me.reconstruct(data) 457 | train_recon_err += [np.abs(recon_train - data).sum() / float(data.shape[0]) / float(data.shape[1])] 458 | 459 | time_elapsed = (tb-ta) / float(LAG) 460 | print 'epoch = ', it, 'time elapsed = ', time_elapsed, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 'test recon err', \ 461 | recon_err[-1], 'train recon err', train_recon_err[-1], 'test acc', acc 462 | 463 | sio.savemat('../result/%s/recon.mat' % me.output_path, {'recon': recon, 'xi': xi, 'xi_train':xi_train, 'data':test_data, 464 | 'recon_train':recon_train, 'lhood':lhood, 'test_lhood':test_lhood, 'recon_err':recon_err, 465 | 'train_recon_err':train_recon_err, 'test_acc':accuracy, 'time_elapsed':time_elapsed}) 466 | 467 | ta = time.time() 468 | 469 | with open('../result/%s/log.txt' % me.output_path, "a") as output: 470 | output.write('\n') 471 | output.write(' '.join(toStr(['ell = ', me.ell, 'c = ', me.c, 'sigma = ', me.sigma, 'kappa = ', me.kappa, \ 472 | 'stepsize = ', me.stepsize, 'arch = ', me.arch[0], me.arch[1]]))+'\n') 473 | output.write(' '.join(toStr(['nonlinear_f = ', nonlinear_s]))+'\n') 474 | output.write(' '.join(toStr(['epoch = ', it, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 475 | 'test recon err', recon_err[-1], 'test acc', acc]))+'\n') 476 | output.flush() 477 | output.close() 478 | 479 | 480 | 481 | printBlue('> Training complete') 482 | 483 | if __name__ == "__main__": 484 | model = DeepLatentGM([2,4]) 485 | model.train(npr.randn(1024,2), 16) 486 | print 'Generative Model', model.gmodel.pack() 487 | print 'Recognition Model', model.rmodel.pack() 488 | -------------------------------------------------------------------------------- /DLGM/dlgm.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | implements the models in paper "stochastic backpropagation in DLGMs" 3 | including 4 | * generative model. 5 | * recognition model. 6 | """ 7 | "let client and server have the same imports." 8 | imports = ['import numpy as np', 9 | 'import numpy.random as npr', 10 | 'import theano', 11 | 'from theano import config', 12 | 'import sys, os', 13 | 'import scipy.io as sio', 14 | 'import theano.sandbox.linalg as ta', 15 | 'import theano.tensor as ts', 16 | 'from color import *', 17 | 'from utils import *'] 18 | for _import in imports: 19 | exec _import 20 | import pdb 21 | 22 | import time 23 | from IPython.parallel import Client 24 | 25 | theano.config.exception_verbosity = 'low' 26 | 27 | ts.logistic = lambda z: 1 / (1 + ts.exp(-z)) 28 | np.logistic = lambda z: 1 / (1 + np.exp(-z)) 29 | 30 | toInt = np.vectorize(int) 31 | toStr = np.vectorize(str) 32 | strConcat = lambda ls : ' '.join(toStr(ls)) 33 | 34 | get_value = lambda x: x.get_value() if x != None else None 35 | get_value_all = lambda xs: [get_value(x) for x in xs if x != None] 36 | 37 | nonlinear_f = lambda x : ts.log(1+ts.exp(x)) # smooth ReLU. 38 | nonlinear_s = "smooth ReLU" 39 | if os.environ.has_key('nonlinear'): 40 | nonlinear_s = os.environ['nonlinear'] 41 | if nonlinear_s == "ReLU": 42 | f = lambda x : ts.maximum(0, x) # ReLU. 43 | if nonlinear_s == "tanh": 44 | f = lambda x : ts.tanh(x) 45 | 46 | 47 | def AdaGRAD(param, grad, G2, stepsize): 48 | """ 49 | adaptive sub-gradient algorithm for tensor-shared objects. 50 | > input: 51 | param: parameters, tensor-shared objects. 52 | grad: gradient, list of numpy arrays. 53 | G2: variance of gradient, list of numpy arrays 54 | """ 55 | for (p, g, g2) in zip(param, grad, G2): 56 | g2[:] = (g2 + g * g)[:] 57 | if type(p) == theano.tensor.sharedvar.TensorSharedVariable: 58 | p.set_value(p.get_value() - stepsize * g / (1e-4 + np.sqrt(g2))) 59 | elif type(p) == np.ndarray: 60 | p[:] = p[:] - stepsize * g[:] / (1e-4 + np.sqrt(g2[:])) 61 | 62 | class GenerativeModel: 63 | """ generative model 64 | """ 65 | def __init__(me, arch, kappa=0.1): 66 | """ 67 | create the deep latent Gaussian model. 68 | arch: architecture, [vis, hidden_1, hidden_2, ...] 69 | """ 70 | "set options." 71 | me.f = ts.maximum # nonlinear transformation. 72 | me.lhoodFunc = lambda v, resp: ts.sum(v * ts.log(ts.logistic(resp)) + (1-v) * ts.log(1-ts.logistic(resp))) 73 | 74 | "set properties." 75 | me.arch = arch 76 | me.num_layers = len(arch) 77 | me.kappa = kappa 78 | assert(me.num_layers > 1) 79 | 80 | "init layers." 
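        # The loop below builds the DLGM top-down pass: at the top layer
        # h_L = G_L xi_L, at intermediate layers
        # h_l = G_l xi_l + W_l f(h_{l+1}) + b_l, and at the visible layer
        # h_0 = W_0 f(h_1) + b_0, where f is the rectification me.f(0, .);
        # lhoodFunc then scores v under Bernoulli(logistic(h_0)).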
81 | (me.G, me.W, me.b, me.xi, me.h) = tuple([[None]*(me.num_layers) for i in range(5)]) 82 | for layer in range(me.num_layers-1, -1, -1): 83 | if layer < me.num_layers-1: 84 | me.W[layer] = theano.shared(np.asarray(randn01(arch[layer], arch[layer+1]), config.floatX), name="W%d" % layer) 85 | me.b[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="b%d" % layer, broadcastable=(False,True)) 86 | me.h[layer] = 0 87 | if layer > 0: 88 | me.G[layer] = theano.shared(np.asarray(np.eye(arch[layer]), config.floatX), name="G%d" % layer) 89 | me.xi[layer] = ts.matrix("xi%d" % layer) 90 | me.h[layer] += ts.dot(me.G[layer], me.xi[layer]) 91 | if layer < me.num_layers-1: 92 | me.h[layer] += ts.dot(me.W[layer], me.f(0, me.h[layer+1])) + me.b[layer] 93 | 94 | me.param = me.G[1:] + me.W[:-1] + me.b[:-1] 95 | me.G2 = [np.asarray(np.zeros(x.get_value().shape), config.floatX) for x in me.param] # variance of gradient. 96 | 97 | "define objective." 98 | me.v = ts.matrix("v") 99 | me.lhood = me.lhoodFunc(me.v, me.h[0]) 100 | me.get_lhood = theano.function([me.v] + me.xi[1:], me.lhood) 101 | me.reg = me.kappa * sum([ts.sum(p * p) for p in me.param]) 102 | me.get_reg = theano.function([], me.reg) 103 | 104 | "define gradient." 105 | me.gradient = ts.grad(me.lhood, me.param) 106 | me.gradient_xi = ts.grad(me.lhood, me.xi[1:]) 107 | # me.hessian_xi = ts.hessian(me.lhood, me.xi[1:]) 108 | me.get_grad = theano.function([me.v] + me.xi[1:], me.gradient) 109 | me.get_grad_xi = theano.function([me.v] + me.xi[1:], me.gradient_xi) 110 | # me.get_hess_xi = theano.function([me.v] + me.xi[1:], me.hessian_xi) 111 | me.gradient_reg = ts.grad(me.reg, me.param) 112 | me.get_grad_reg = theano.function([], me.gradient_reg) 113 | 114 | "define utils." 115 | me.generate = theano.function(me.xi[1:], ts.logistic(me.h[0])) 116 | me.hidden_activation = ts.vector("hidden_activiation") 117 | me.hidden_rectified = me.f(0, me.hidden_activation) 118 | me.nonlinear = theano.function([me.hidden_activation], me.hidden_rectified) 119 | 120 | def sample(me, xi): 121 | resp = me.generate(*xi) 122 | return toInt(npr.rand(*resp.shape) < resp) 123 | 124 | def activate(me, xi): 125 | return me.generate(*xi) 126 | 127 | def pack(me): 128 | return {'G': get_value_all(me.G), \ 129 | 'W': get_value_all(me.W), 130 | 'b': get_value_all(me.b)} 131 | 132 | class RecognitionModel: 133 | """ recognition model (interface) 134 | since xi \sim \Normal(\mu, C) for each layer. 135 | the recognition fits its parameters (\mu, C) discriminatively. 136 | 137 | a simple recognition model uses a two layer NN to fit each parameter. 138 | see DLGM appendix A. 139 | """ 140 | def __init__(me, arch, num_hidden=10, sigma=1): 141 | """ 142 | create the deep latent Gaussian recognition model. 143 | arch: architecture, [vis, hidden_1, hidden_2, ...] 144 | """ 145 | "set options." 146 | me.f = ts.maximum # nonlinear transformation. 147 | 148 | "set properties." 149 | me.arch = arch 150 | me.num_layers = len(arch) 151 | me.num_hidden = num_hidden 152 | me.sigma = sigma 153 | assert(me.num_layers > 1) 154 | 155 | "init layers." 156 | me.v = ts.matrix("v") # N x K matrix, N is the sample size, K is the dimension. 
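        # Rough sketch of the per-layer Gaussian q(xi_l | v) that the loop below builds,
        # following DLGM appendix A (f is the rectifier max(0, .)):
        #     z_l  = f(Wv_l v + bv_l)         shared hidden features
        #     mu_l = Wmu_l z_l + bmu_l        mean
        #     d_l  = exp(Wd_l z_l + bd_l)     diagonal precision
        #     u_l  = Wu_l z_l + bu_l          rank-one factor (unused by the diagonal
        #                                     Rdot actually compiled below)
        # so a sample is xi_l = mu_l + d_l**(-1/2) * eps_l with eps_l ~ N(0, I).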
157 | (me.Wv, me.Wu, me.Wd, me.Wmu, me.bv, me.bu, me.bd, me.bmu, me.z, me.d, me.u, me.mu, me.R, me.C) \ 158 | = tuple([[None] * me.num_layers for i in range(14)]) 159 | for layer in range(1, me.num_layers): 160 | me.Wv[layer] = theano.shared(np.asarray(randn01(num_hidden, arch[0]), config.floatX), name="Wv%d" % layer) 161 | me.Wu[layer] = theano.shared(np.asarray(randn01(arch[layer], num_hidden), config.floatX), name="Wu%d" % layer) 162 | me.Wd[layer] = theano.shared(np.asarray(randn01(arch[layer], num_hidden), config.floatX), name="Wd%d" % layer) 163 | me.Wmu[layer] = theano.shared(np.asarray(randn01(arch[layer], num_hidden), config.floatX), name="Wmu%d" % layer) 164 | me.bv[layer] = theano.shared(np.asarray(np.zeros((num_hidden, 1)), config.floatX), name="bv%d" % layer, broadcastable=(False, True)) 165 | me.bu[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="bu%d" % layer, broadcastable=(False, True)) 166 | me.bd[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="bd%d" % layer, broadcastable=(False, True)) 167 | me.bmu[layer] = theano.shared(np.asarray(np.zeros((arch[layer], 1)), config.floatX), name="bmu%d" % layer, broadcastable=(False, True)) 168 | me.z[layer] = me.f(0, ts.dot(me.Wv[layer], me.v) + me.bv[layer]) 169 | me.mu[layer] = ts.dot(me.Wmu[layer], me.z[layer]) + me.bmu[layer] 170 | me.d[layer] = ts.exp(ts.dot(me.Wd[layer], me.z[layer]) + me.bd[layer]) 171 | me.u[layer] = ts.dot(me.Wu[layer], me.z[layer]) + me.bu[layer] 172 | 173 | 174 | """model covariance jointly 175 | utDneg1u = sum([ts.dot(u, u/d) for (u, d) in zip(me.u, me.d)]) 176 | me.eta = 1/(1+utDneg1u) 177 | me.Rdot = theano.function([me.v] + [tensor.vector('x') for u in me.u], \ 178 | [1/ts.sqrt(d) * x - ts.dot(1/ts.sqrt(d) * x, u) * ts.dot(u, 1/d) \ 179 | * (1-ts.sqrt(me.eta)) / utDneg1u \ 180 | for (u, d, x) in zip(me.u, me.d, me.x) \ 181 | ]\ 182 | ) 183 | """ 184 | eps_s = [ts.matrix('x') for u in me.u] 185 | me.Rdot = theano.function([me.v] + eps_s[1:], \ 186 | [1/ts.sqrt(d) * x for (d, x) in zip(me.d[1:], eps_s[1:])] \ 187 | ) 188 | 189 | "utils." 190 | me.get_mu = theano.function([me.v], me.mu[1:]) 191 | me.get_u = theano.function([me.v], me.u[1:]) 192 | me.get_d = theano.function([me.v], me.d[1:]) 193 | me.get_z = theano.function([me.v], me.z[1:]) 194 | 195 | me.sample_eps = lambda v: [np.asarray(npr.randn(ac, v.shape[1]), config.floatX) \ 196 | for ac in arch[1:]] 197 | 198 | me.sample = lambda v, eps: param_add(me.get_mu(v), me.Rdot(v, *eps)) 199 | 200 | me.hidden_activation = ts.vector("hidden_activiation") 201 | me.hidden_rectified = me.f(0, me.hidden_activation) 202 | me.nonlinear = theano.function([me.hidden_activation], me.hidden_rectified) 203 | 204 | "free energy." 205 | me.energy = 0; 206 | for layer in range(1, me.num_layers): 207 | me.energy += me.sigma * (ts.sum(me.mu[layer] * me.mu[layer]) + ts.sum(1/me.d[layer])+ ts.sum(ts.log(me.d[layer]))) \ 208 | + 0 * ts.sum(me.u[layer] * me.u[layer]) 209 | me.energy *= 0 210 | me.get_energy = theano.function([me.v], me.energy) 211 | 212 | "free energy gradients." 213 | me.param = me.Wv[1:] + me.Wu[1:] + me.Wd[1:] + me.Wmu[1:] + me.bv[1:] + me.bu[1:] + me.bd[1:]+ me.bmu[1:] 214 | me.G2 = [np.asarray(np.zeros(x.get_value().shape), config.floatX) for x in me.param] # variance of gradient. 215 | me.gradient = ts.grad(me.energy, me.param) 216 | me.get_grad = theano.function([me.v], me.gradient) 217 | 218 | """ stochastic gradients. 219 | trick. 
pretend our objective is inner product with the stochastic gradients. 220 | """ 221 | me.grad_gm = [None] * me.num_layers 222 | me.eps = [None] * me.num_layers 223 | me.obj_mu = 0 224 | me.obj_R = 0 225 | for layer in range(1, me.num_layers): 226 | me.grad_gm[layer] = ts.matrix('grad_gm_%d' % layer) 227 | me.eps[layer] = ts.matrix('eps_%d' % layer) 228 | me.obj_mu += ts.sum(me.mu[layer] * me.grad_gm[layer]) 229 | me.obj_R += .5 * ts.sum(me.grad_gm[layer] * me.eps[layer] / ts.sqrt(me.d[layer])) + 0 * ts.sum(me.u[layer] * \ 230 | me.u[layer]) 231 | # me.obj_R += .5 * (ts.outer(me.grad_gm[layer], me.eps[layer]) * 1/ts.sqrt(me.d[layer])).sum() + 0 * ts.dot(me.u[layer].T, 232 | # me.u[layer]) 233 | me.stoc_grad = ts.grad(me.obj_mu + me.obj_R, me.param) 234 | me.get_stoc_grad = theano.function([me.v] + me.grad_gm[1:] + me.eps[1:], me.stoc_grad) 235 | 236 | def pack(me): 237 | return {'Wv': get_value_all(me.Wv), 238 | 'Wu': get_value_all(me.Wu), 239 | 'Wd': get_value_all(me.Wd), 240 | 'Wmu': get_value_all(me.Wmu), 241 | 'bv': get_value_all(me.bv), 242 | 'bu': get_value_all(me.bu), 243 | 'bd': get_value_all(me.bd), 244 | 'bmu': get_value_all(me.bmu)} 245 | 246 | 247 | class DeepLatentGM(object): 248 | """ 249 | train/test DLGM on datasets. 250 | """ 251 | def __init__(me, arch, batchsize = 1, num_sample = 1, kappa = 1, sigma = 1, rec_hidden = 100, 252 | stepsize=0.1, num_label=2, ell=100, c = 1, v = 1): 253 | if os.environ.has_key('hidden'): 254 | hidden = int(os.environ['hidden']) 255 | rec_hidden = 4 * hidden 256 | for i in range(1, len(arch)): 257 | arch[i] = hidden 258 | 259 | me.arch = arch 260 | me.kappa = kappa 261 | me.sigma = sigma 262 | me.batchsize = batchsize 263 | me.stepsize = stepsize 264 | me.num_sample = num_sample 265 | 266 | me.ell = ell 267 | me.c = c 268 | me.num_label = num_label 269 | me.v = 1 270 | 271 | if os.environ.has_key('ell'): 272 | me.ell = float(os.environ['ell']) 273 | if os.environ.has_key('c'): 274 | me.c = float(os.environ['c']) 275 | if os.environ.has_key('kappa'): 276 | me.kappa = float(os.environ['kappa']) 277 | if os.environ.has_key('sigma'): 278 | me.sigma = float(os.environ['sigma']) 279 | if os.environ.has_key('stepsize'): 280 | me.stepsize = float(os.environ['stepsize']) 281 | me.stepsize_w = me.stepsize 282 | if os.environ.has_key('stepsize_w'): 283 | me.stepsize_w = float(os.environ['stepsize_w']) 284 | if os.environ.has_key('output'): 285 | me.output_path = os.environ['output'] 286 | else: 287 | me.output_path = 'default' 288 | 289 | printRed(strConcat(['ell = ', me.ell, 'c = ', me.c, 'sigma = ', me.sigma, 'kappa = ', me.kappa, 290 | 'stepsize = ', me.stepsize, 'arch = ', strConcat(me.arch)])) 291 | printRed(strConcat(['nonlinear_f = ', nonlinear_s])) 292 | printBlue('> Compiling neural network') 293 | me.gmodel = GenerativeModel(me.arch, kappa=me.kappa) 294 | me.rmodel = RecognitionModel(me.arch, num_hidden=rec_hidden, sigma=me.sigma) 295 | 296 | me.W = np.zeros((sum(arch[1:])+1, me.num_label)) 297 | me.W_G2 = np.zeros_like(me.W) 298 | 299 | def __concat__(me, xi): 300 | latent = [1] 301 | for x in xi: 302 | latent += list(x) 303 | latent = np.array(latent) 304 | return latent 305 | 306 | def process(me, V, Y): 307 | """ 308 | process one single data point. 309 | > return: (grad of generative model, grad of recognition model) 310 | > input 311 | ti: thread id. 312 | v: data point. 
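        Y: integer class labels for the rows of V; they drive the hinge-loss
           supervision that produces grad_w for the classifier weights me.W.
        Note: besides the generative and recognition gradients, the returned tuple
        also contains grad_w, i.e. (grad_g, grad_r, grad_w).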
313 | """ 314 | rmodel = me.rmodel 315 | gmodel = me.gmodel 316 | 317 | V = np.array(V) 318 | if len(V.shape) < 2: 319 | V = np.array([V]) 320 | 321 | grad_g = [] 322 | grad_r = [] 323 | grad_w = np.zeros_like(me.W) 324 | 325 | for si in range(me.num_sample): 326 | "first sample stochastic variables." 327 | eps = rmodel.sample_eps(V.T) 328 | xi = rmodel.sample(V.T, eps) 329 | 330 | # pdb.set_trace() 331 | ta = time.clock() 332 | "compute gradient of generative model." 333 | gg = gmodel.get_grad(V.T, *xi) 334 | gg = param_neg(gg) 335 | grad_g = param_add(grad_g, gg) 336 | 337 | "compute gradient of regularizer in generative model." 338 | gg_reg = gmodel.get_grad_reg() 339 | gg_reg = param_mul_scalar(gg_reg, me.kappa) 340 | grad_g = param_add(grad_g, gg_reg) 341 | 342 | "compute free-energy gradient of recognition model." 343 | gr = rmodel.get_grad(V.T) 344 | grad_r = param_add(grad_r, gr) 345 | 346 | "compute stochastic gradient of recognition model." 347 | gg_xi = gmodel.get_grad_xi(V.T, *xi) 348 | gg_xi = param_neg(gg_xi) 349 | 350 | "add supervision" 351 | code = rmodel.get_mu(V.T) 352 | for vi in range(V.shape[0]): 353 | latent = me.__concat__([c[:, vi] for c in code]) 354 | y = Y[vi] 355 | 356 | resp = me.ell + np.dot(latent, me.W) - np.dot(latent, me.W[:,y]) 357 | resp[y] = 0 358 | yp = np.argmax(resp) 359 | grad_w[:,yp] += latent 360 | grad_w[:,y] -= latent 361 | 362 | # ind = 1 # skip bias. 363 | # for ni in range(len(gg_xi)): 364 | # for nj in range(len(gg_xi[ni])): 365 | # gg_xi[ni][nj] += me.c * (me.W[ind, yp] - me.W[ind, y]) 366 | # ind += 1 367 | 368 | gr_stoc = rmodel.get_stoc_grad(V.T, *(gg_xi + eps)) 369 | grad_r = param_add(grad_r, gr_stoc) 370 | 371 | grad_g = param_mul_scalar(grad_g, 1.0/me.num_sample) 372 | grad_r = param_mul_scalar(grad_r, 1.0/me.num_sample) 373 | grad_w /= me.num_sample 374 | 375 | return (grad_g, grad_r, grad_w) 376 | 377 | def neg_lhood(me, data): 378 | nlh = 0 379 | V = np.array(data); 380 | eps = me.rmodel.sample_eps(V.T) 381 | xi = me.rmodel.sample(V.T, eps) 382 | nlh -= me.gmodel.get_lhood(V.T, *xi) 383 | return nlh 384 | 385 | def test(me, data, label): 386 | predict = [] 387 | acc = 0 388 | xi = me.rmodel.get_mu(data.T) 389 | for (li, lb) in enumerate(label): 390 | latent = me.__concat__([x[:,li] for x in xi]) 391 | resp = np.dot(latent, me.W) 392 | yp = np.argmax(resp) 393 | predict += [yp] 394 | if yp == lb: 395 | acc += 1 396 | acc /= float(len(label)) 397 | return (predict, acc) 398 | 399 | def reconstruct(me, data): 400 | eps = me.rmodel.sample_eps(data.T) 401 | xi = me.rmodel.sample(data.T, eps) 402 | recon = me.gmodel.activate(xi).T 403 | return (recon, xi) 404 | 405 | def train(me, data, label, num_iter, test_data = [], test_label = []): 406 | """ 407 | start the training algorithm. 408 | > input 409 | data: N x D data matrix, each row is a data of dimension D. 
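            label: length-N vector of integer class labels, used for the hinge-loss
                supervision on the classifier weights me.W.
            num_iter: number of passes (epochs) over the training data.
            test_data, test_label: optional held-out set; when provided, negative
                likelihood, reconstruction error and test accuracy are logged every
                LAG epochs and saved under ../result/<output_path>/.
        A hypothetical call (shapes chosen for illustration only, binary inputs assumed):
            model = DeepLatentGM([784, 200, 200], batchsize=32, num_label=10)
            model.train(train_x, train_y, 100, test_data=test_x, test_label=test_y)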
410 | """ 411 | printBlue('> Start training neural nets') 412 | 413 | os.system('mkdir -p ../result/%s' % me.output_path) 414 | 415 | data = np.array(data).astype(np.float32) 416 | if test_data != []: 417 | test_data = np.array(test_data).astype(np.float32) 418 | label = label.astype(np.float32) 419 | if test_label != []: 420 | test_label = test_label.astype(np.float32) 421 | 422 | lhood = [] 423 | test_lhood = [] 424 | recon_err = [] 425 | train_recon_err = [] 426 | accuracy = [] 427 | 428 | LAG = 10 429 | ta = time.time() 430 | for it in range(num_iter): 431 | allind = set(range(data.shape[0])) 432 | while len(allind) >= me.batchsize: 433 | "extract mini-batch" 434 | ind = npr.choice(list(allind), me.batchsize, replace=False) 435 | allind -= set(ind) 436 | V = data[ind, :] 437 | Y = label[ind] 438 | 439 | "compute gradients" 440 | 441 | (grad_g, grad_r, grad_w) = me.process(V, Y) 442 | 443 | grad_g = param_mul_scalar(grad_g, 1.0/len(V)); 444 | grad_r = param_mul_scalar(grad_r, 1.0/len(V)); 445 | grad_w /= len(V) 446 | 447 | "aggregate gradients" 448 | AdaGRAD(me.gmodel.param, grad_g, me.gmodel.G2, me.stepsize) 449 | AdaGRAD(me.rmodel.param, grad_r, me.rmodel.G2, me.stepsize) 450 | AdaGRAD([me.W], [grad_w], [me.W_G2], me.stepsize_w) 451 | 452 | "evaluate" 453 | if test_data != [] and (it+1) % LAG == 0: 454 | tb = time.time() 455 | [predict, acc] = me.test(test_data, test_label) 456 | accuracy += [acc] 457 | # print '\tGenerative Model', me.gmodel.pack() 458 | # print '\tRecognition Model', me.rmodel.pack() 459 | (recon, xis) = me.reconstruct(test_data) 460 | recon_err += [np.abs(recon - test_data).sum() / float(test_data.shape[0]) / float(test_data.shape[1])] 461 | 462 | test_lhood += [me.neg_lhood(test_data)] 463 | lhood += [me.neg_lhood(data)] 464 | 465 | (recon_train, xis_train) = me.reconstruct(data) 466 | train_recon_err += [np.abs(recon_train - data).sum() / float(data.shape[0]) / float(data.shape[1])] 467 | 468 | time_elapsed = (tb-ta) / float(LAG) 469 | 470 | print 'epoch = ', it, 'time elapsed = ', time_elapsed, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 'test recon err', \ 471 | recon_err[-1], 'train recon err', train_recon_err[-1], 'test acc', acc 472 | 473 | result = {'recon': recon, 'xi': xis, 'xi_train':xis_train, 'data':test_data, 474 | 'recon_train':recon_train, 'lhood':lhood, 'test_lhood':test_lhood, 'recon_err':recon_err, 475 | 'train_recon_err':train_recon_err, 'test_acc':accuracy, 'time_elapsed':time_elapsed} 476 | result.update(me.rmodel.pack()) 477 | result.update(me.gmodel.pack()) 478 | sio.savemat('../result/%s/recon.mat' % me.output_path, result) 479 | 480 | 481 | with open('../result/%s/log.txt' % me.output_path, "a") as output: 482 | print >>output, 'epoch = ', it, 'time elapsed = ', time_elapsed, '-lhood', test_lhood[-1], '-lhood(train)', lhood[-1], 'test recon err', \ 483 | recon_err[-1], 'train recon err', train_recon_err[-1], 'test acc', acc 484 | output.flush() 485 | output.close() 486 | 487 | printBlue('> Training complete') 488 | 489 | if __name__ == "__main__": 490 | model = DeepLatentGM([2,4]) 491 | model.train(npr.randn(1024,2), 16) 492 | print 'Generative Model', model.gmodel.pack() 493 | print 'Recognition Model', model.rmodel.pack() 494 | -------------------------------------------------------------------------------- /RBM/rbm.py: -------------------------------------------------------------------------------- 1 | """This tutorial introduces restricted boltzmann machines (RBM) using Theano. 
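This copy of the tutorial code additionally takes a label input and defines a
multiclass hinge loss (`loss`, `classify`, `weights`) used for supervised
classification.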
2 | 3 | Boltzmann Machines (BMs) are a particular form of energy-based model which 4 | contain hidden variables. Restricted Boltzmann Machines further restrict BMs 5 | to those without visible-visible and hidden-hidden connections. 6 | """ 7 | import time 8 | 9 | try: 10 | import PIL.Image as Image 11 | except ImportError: 12 | import Image 13 | 14 | import numpy 15 | 16 | import theano 17 | import theano.tensor as T 18 | import os 19 | 20 | from theano.tensor.shared_randomstreams import RandomStreams 21 | 22 | from utils import tile_raster_images 23 | from logistic_sgd import load_data 24 | 25 | import pdb 26 | 27 | 28 | # start-snippet-1 29 | class RBM(object): 30 | """Restricted Boltzmann Machine (RBM) """ 31 | def __init__( 32 | self, 33 | input=None, 34 | label=None, 35 | n_visible=784, 36 | n_hidden=500, 37 | W=None, 38 | hbias=None, 39 | vbias=None, 40 | numpy_rng=None, 41 | theano_rng=None, 42 | c = 1, 43 | ell = 100, 44 | n_class = 10, 45 | ): 46 | """ 47 | RBM constructor. Defines the parameters of the model along with 48 | basic operations for inferring hidden from visible (and vice-versa), 49 | as well as for performing CD updates. 50 | 51 | :param input: None for standalone RBMs or symbolic variable if RBM is 52 | part of a larger graph. 53 | 54 | :param n_visible: number of visible units 55 | 56 | :param n_hidden: number of hidden units 57 | 58 | :param W: None for standalone RBMs or symbolic variable pointing to a 59 | shared weight matrix in case RBM is part of a DBN network; in a DBN, 60 | the weights are shared between RBMs and layers of a MLP 61 | 62 | :param hbias: None for standalone RBMs or symbolic variable pointing 63 | to a shared hidden units bias vector in case RBM is part of a 64 | different network 65 | 66 | :param vbias: None for standalone RBMs or a symbolic variable 67 | pointing to a shared visible units bias 68 | """ 69 | 70 | self.n_visible = n_visible 71 | self.n_hidden = n_hidden 72 | 73 | if numpy_rng is None: 74 | # create a number generator 75 | numpy_rng = numpy.random.RandomState(1234) 76 | 77 | if theano_rng is None: 78 | theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) 79 | 80 | if W is None: 81 | # W is initialized with `initial_W` which is uniformely 82 | # sampled from -4*sqrt(6./(n_visible+n_hidden)) and 83 | # 4*sqrt(6./(n_hidden+n_visible)) the output of uniform if 84 | # converted using asarray to dtype theano.config.floatX so 85 | # that the code is runable on GPU 86 | initial_W = numpy.asarray( 87 | numpy_rng.uniform( 88 | low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), 89 | high=4 * numpy.sqrt(6. 
/ (n_hidden + n_visible)), 90 | size=(n_visible, n_hidden) 91 | ), 92 | dtype=theano.config.floatX 93 | ) 94 | # theano shared variables for weights and biases 95 | W = theano.shared(value=initial_W, name='W', borrow=True) 96 | 97 | if hbias is None: 98 | # create shared variable for hidden units bias 99 | hbias = theano.shared( 100 | value=numpy.zeros( 101 | n_hidden, 102 | dtype=theano.config.floatX 103 | ), 104 | name='hbias', 105 | borrow=True 106 | ) 107 | 108 | if vbias is None: 109 | # create shared variable for visible units bias 110 | vbias = theano.shared( 111 | value=numpy.zeros( 112 | n_visible, 113 | dtype=theano.config.floatX 114 | ), 115 | name='vbias', 116 | borrow=True 117 | ) 118 | 119 | # initialize input layer for standalone RBM or layer0 of DBN 120 | self.input = input 121 | if not input: 122 | self.input = T.matrix('input') 123 | self.label = label 124 | if not label: 125 | self.label = T.matrix('label') 126 | 127 | self.W = W 128 | self.hbias = hbias 129 | self.vbias = vbias 130 | self.theano_rng = theano_rng 131 | # **** WARNING: It is not a good idea to put things in this list 132 | # other than shared variables created in this function. 133 | 134 | # initialize parameters for supervised learning. 135 | self.c = c 136 | self.ell = ell 137 | self.weights = theano.shared( 138 | value=numpy.zeros( 139 | (n_visible, n_class), 140 | dtype=theano.config.floatX 141 | ), 142 | name='weights', 143 | borrow=True 144 | ) 145 | # parameter grouping. 146 | self.params = [self.weights] 147 | self.G2 = [ 148 | theano.shared(value=numpy.zeros((n_visible, n_class), dtype=theano.config.floatX), borrow=True) 149 | ] 150 | # end-snippet-1 151 | 152 | def free_energy(self, v_sample): 153 | ''' Function to compute the free energy ''' 154 | wx_b = T.dot(v_sample, self.W) + self.hbias 155 | vbias_term = T.dot(v_sample, self.vbias) 156 | hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1) 157 | return -hidden_term - vbias_term 158 | 159 | def loss(self, vis, y): 160 | ell = T.cast(self.ell, dtype=theano.config.floatX) 161 | true_resp = (T.dot(vis, self.weights) * y).sum(axis=1, keepdims=True) 162 | true_resp = T.addbroadcast(true_resp, 1) 163 | return (ell * (1-y) + T.dot(vis, self.weights) - true_resp).max(axis=1).sum() 164 | 165 | def classify(self, vis): 166 | pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(vis) 167 | predict = T.dot(ph_mean, self.weights) 168 | return predict 169 | 170 | def propup(self, vis): 171 | '''This function propagates the visible units activation upwards to 172 | the hidden units 173 | 174 | Note that we return also the pre-sigmoid activation of the 175 | layer. As it will turn out later, due to how Theano deals with 176 | optimizations, this symbolic variable will be needed to write 177 | down a more stable computational graph (see details in the 178 | reconstruction cost function) 179 | 180 | ''' 181 | pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias 182 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 183 | 184 | def sample_h_given_v(self, v0_sample): 185 | ''' This function infers state of hidden units given visible units ''' 186 | # compute the activation of the hidden units given a sample of 187 | # the visibles 188 | pre_sigmoid_h1, h1_mean = self.propup(v0_sample) 189 | # get a sample of the hiddens given their activation 190 | # Note that theano_rng.binomial returns a symbolic sample of dtype 191 | # int64 by default.
If we want to keep our computations in floatX 192 | # for the GPU we need to specify to return the dtype floatX 193 | h1_sample = self.theano_rng.binomial(size=h1_mean.shape, 194 | n=1, p=h1_mean, 195 | dtype=theano.config.floatX) 196 | return [pre_sigmoid_h1, h1_mean, h1_sample] 197 | 198 | def propdown(self, hid): 199 | '''This function propagates the hidden units activation downwards to 200 | the visible units 201 | 202 | Note that we return also the pre_sigmoid_activation of the 203 | layer. As it will turn out later, due to how Theano deals with 204 | optimizations, this symbolic variable will be needed to write 205 | down a more stable computational graph (see details in the 206 | reconstruction cost function) 207 | 208 | ''' 209 | pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias 210 | return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)] 211 | 212 | def sample_v_given_h(self, h0_sample): 213 | ''' This function infers state of visible units given hidden units ''' 214 | # compute the activation of the visible given the hidden sample 215 | pre_sigmoid_v1, v1_mean = self.propdown(h0_sample) 216 | # get a sample of the visible given their activation 217 | # Note that theano_rng.binomial returns a symbolic sample of dtype 218 | # int64 by default. If we want to keep our computations in floatX 219 | # for the GPU we need to specify to return the dtype floatX 220 | v1_sample = self.theano_rng.binomial(size=v1_mean.shape, 221 | n=1, p=v1_mean, 222 | dtype=theano.config.floatX) 223 | return [pre_sigmoid_v1, v1_mean, v1_sample] 224 | 225 | def gibbs_hvh(self, h0_sample): 226 | ''' This function implements one step of Gibbs sampling, 227 | starting from the hidden state''' 228 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample) 229 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample) 230 | return [pre_sigmoid_v1, v1_mean, v1_sample, 231 | pre_sigmoid_h1, h1_mean, h1_sample] 232 | 233 | def gibbs_vhv(self, v0_sample): 234 | ''' This function implements one step of Gibbs sampling, 235 | starting from the visible state''' 236 | pre_sigmoid_h1, h1_mean, h1_sample = self.sample_h_given_v(v0_sample) 237 | pre_sigmoid_v1, v1_mean, v1_sample = self.sample_v_given_h(h1_sample) 238 | return [pre_sigmoid_h1, h1_mean, h1_sample, 239 | pre_sigmoid_v1, v1_mean, v1_sample] 240 | 241 | # start-snippet-2 242 | def get_cost_updates(self, lr=0.1, persistent=None, k=1, update_method='adagrad'): 243 | """This functions implements one step of CD-k or PCD-k 244 | 245 | :param lr: learning rate used to train the RBM 246 | 247 | :param persistent: None for CD. For PCD, shared variable 248 | containing old state of Gibbs chain. This must be a shared 249 | variable of size (batch size, number of hidden units). 250 | 251 | :param k: number of Gibbs steps to do in CD-k/PCD-k 252 | 253 | Returns a proxy for the cost and the updates dictionary. The 254 | dictionary contains the update rules for weights and biases but 255 | also an update of the shared variable used to store the persistent 256 | chain, if one is used. 
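        :param update_method: 'sgd' or 'adagrad' (default); chooses between the plain
            SGD update and the AdaGrad-style update constructed below.

        Note: as currently written the differentiated cost is only the supervised
        hinge loss scaled by self.c; the CD/PCD sampling updates from the original
        tutorial are not built in this function.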
257 | 258 | """ 259 | cost = self.c * self.loss(self.input, self.label) 260 | updates = {}  # dictionary of parameter updates, filled in by the branches below 261 | gparams = T.grad(cost, self.params) 262 | # end-snippet-3 start-snippet-4 263 | if update_method == 'sgd': 264 | # constructs the update dictionary 265 | for gparam, param in zip(gparams, self.params): 266 | # make sure that the learning rate is of the right dtype 267 | updates[param] = param - gparam * T.cast( 268 | lr, 269 | dtype=theano.config.floatX 270 | ) 271 | elif update_method == 'adagrad': 272 | for gparam, param, g2 in zip(gparams, self.params, self.G2): 273 | # make sure that the learning rate is of the right dtype 274 | updates[g2] = g2 + gparam * gparam 275 | updates[param] = param - gparam * T.cast(lr, \ 276 | dtype=theano.config.floatX) \ 277 | / (1e-4 + T.sqrt(g2 + gparam * gparam)) 278 | 279 | monitoring_cost = 0 280 | train_err = 0 281 | return monitoring_cost, train_err, updates 282 | # end-snippet-4 283 | 284 | def get_error(self, predict, label): 285 | return T.neq(T.argmax(predict, axis=1), 286 | T.argmax(label, axis=1)).sum() / T.cast(label.shape[0], dtype=theano.config.floatX) 287 | 288 | 289 | def get_pseudo_likelihood_cost(self, updates): 290 | """Stochastic approximation to the pseudo-likelihood""" 291 | 292 | # index of bit i in expression p(x_i | x_{\i}) 293 | bit_i_idx = theano.shared(value=0, name='bit_i_idx') 294 | 295 | # binarize the input image by rounding to nearest integer 296 | xi = T.round(self.input) 297 | 298 | # calculate free energy for the given bit configuration 299 | fe_xi = self.free_energy(xi) 300 | 301 | # flip bit x_i of matrix xi and preserve all other bits x_{\i} 302 | # Equivalent to xi[:,bit_i_idx] = 1-xi[:, bit_i_idx], but assigns 303 | # the result to xi_flip, instead of working in place on xi. 304 | xi_flip = T.set_subtensor(xi[:, bit_i_idx], 1 - xi[:, bit_i_idx]) 305 | 306 | # calculate free energy with bit flipped 307 | fe_xi_flip = self.free_energy(xi_flip) 308 | 309 | # equivalent to e^(-FE(x_i)) / (e^(-FE(x_i)) + e^(-FE(x_{\i}))) 310 | cost = T.mean(self.n_visible * T.log(T.nnet.sigmoid(fe_xi_flip - 311 | fe_xi))) 312 | 313 | # increment bit_i_idx % number as part of updates 314 | updates[bit_i_idx] = (bit_i_idx + 1) % self.n_visible 315 | 316 | return cost 317 | 318 | def get_reconstruction_cost(self, updates, pre_sigmoid_nv): 319 | """Approximation to the reconstruction error 320 | 321 | Note that this function requires the pre-sigmoid activation as 322 | input. To understand why this is so you need to understand a 323 | bit about how Theano works. Whenever you compile a Theano 324 | function, the computational graph that you pass as input gets 325 | optimized for speed and stability. This is done by changing 326 | several parts of the subgraphs with others. One such 327 | optimization expresses terms of the form log(sigmoid(x)) in 328 | terms of softplus. We need this optimization for the 329 | cross-entropy since sigmoid of numbers larger than 30. (or 330 | even less than that) turn to 1. and numbers smaller than 331 | -30. turn to 0 which in turn will force Theano to compute 332 | log(0) and therefore we will get either -inf or NaN as 333 | cost. If the value is expressed in terms of softplus we do not 334 | get this undesirable behaviour. This optimization usually 335 | works fine, but here we have a special case. The sigmoid is 336 | applied inside the scan op, while the log is 337 | outside.
Therefore Theano will only see log(scan(..)) instead 338 | of log(sigmoid(..)) and will not apply the wanted 339 | optimization. We can not go and replace the sigmoid in scan 340 | with something else also, because this only needs to be done 341 | on the last step. Therefore the easiest and more efficient way 342 | is to get also the pre-sigmoid activation as an output of 343 | scan, and apply both the log and sigmoid outside scan such 344 | that Theano can catch and optimize the expression. 345 | 346 | """ 347 | 348 | cross_entropy = T.mean( 349 | T.sum( 350 | self.input * T.log(T.nnet.sigmoid(pre_sigmoid_nv)) + 351 | (1 - self.input) * T.log(1 - T.nnet.sigmoid(pre_sigmoid_nv)), 352 | axis=1 353 | ) 354 | ) 355 | 356 | return cross_entropy 357 | 358 | 359 | def test_rbm(learning_rate=1, training_epochs=200, 360 | dataset='../data/mnist/mnist.pkl.gz', batch_size=32, 361 | n_chains=20, n_samples=10, output_folder='rbm_plots', 362 | n_hidden=200, n_example=-1): 363 | """ 364 | Demonstrate how to train and afterwards sample from it using Theano. 365 | 366 | This is demonstrated on MNIST. 367 | 368 | :param learning_rate: learning rate used for training the RBM 369 | 370 | :param training_epochs: number of epochs used for training 371 | 372 | :param dataset: path the the pickled dataset 373 | 374 | :param batch_size: size of a batch used to train the RBM 375 | 376 | :param n_chains: number of parallel Gibbs chains to be used for sampling 377 | 378 | :param n_samples: number of samples to plot for each chain 379 | 380 | """ 381 | datasets = load_data(dataset, n_example) 382 | 383 | train_set_x, train_set_y = datasets[0] 384 | test_set_x, test_set_y = datasets[2] 385 | 386 | def convert_to_ind(y, borrow=True): 387 | y = y.get_value() 388 | label = numpy.unique(y) 389 | newy = numpy.zeros((len(y), len(label))) 390 | for i in range(len(y)): 391 | newy[i, y[i]] = 1 392 | sharedy = theano.shared(numpy.asarray(newy, 393 | dtype=theano.config.floatX), 394 | borrow=borrow) 395 | return sharedy 396 | 397 | train_set_y_ind = convert_to_ind(train_set_y) 398 | test_set_y_ind = convert_to_ind(test_set_y) 399 | 400 | 401 | # compute number of minibatches for training, validation and testing 402 | n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size 403 | 404 | # allocate symbolic variables for the data 405 | index = T.lscalar() # index to a [mini]batch 406 | x = T.matrix('x') # the data is presented as rasterized images 407 | y = T.matrix('y') # the label is a N x C matrix, each row only true class is 1. 
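    # x and y are only symbolic placeholders here; train_rbm below binds them to
    # mini-batch slices of train_set_x / train_set_y_ind through its `givens` argument.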
408 | 409 | rng = numpy.random.RandomState(123) 410 | theano_rng = RandomStreams(rng.randint(2 ** 30)) 411 | 412 | # initialize storage for the persistent chain (state = hidden 413 | # layer of chain) 414 | persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden), 415 | dtype=theano.config.floatX), 416 | borrow=True) 417 | 418 | # construct the RBM class 419 | rbm = RBM(input=x, label=y, n_visible=28 * 28, 420 | n_hidden=n_hidden, numpy_rng=rng, theano_rng=theano_rng) 421 | 422 | # get the cost and the gradient corresponding to one step of CD-15 423 | cost, train_err, updates = rbm.get_cost_updates(lr=learning_rate, 424 | persistent=persistent_chain, k=15) 425 | 426 | ################################# 427 | # Training the RBM # 428 | ################################# 429 | if not os.path.isdir(output_folder): 430 | os.makedirs(output_folder) 431 | os.chdir(output_folder) 432 | 433 | # start-snippet-5 434 | # it is ok for a theano function to have no output 435 | # the purpose of train_rbm is solely to update the RBM parameters 436 | train_rbm = theano.function( 437 | [index], 438 | [cost, train_err], 439 | updates=updates, 440 | givens={ 441 | x: train_set_x[index * batch_size: (index + 1) * batch_size], 442 | y: train_set_y_ind[index * batch_size : (index + 1) * batch_size] 443 | }, 444 | name='train_rbm' 445 | ) 446 | 447 | tx = T.matrix('tx') # the data is presented as rasterized images 448 | ty = T.matrix('ty') # the label is a N x C matrix, each row only true class is 1. 449 | predict = rbm.classify(tx) 450 | test_err = rbm.get_error(predict, ty) 451 | test_rbm = theano.function( 452 | [], 453 | [predict, test_err], 454 | givens = { 455 | tx: test_set_x, 456 | ty: test_set_y_ind 457 | }, 458 | name = 'test_rbm' 459 | ) 460 | 461 | plotting_time = 0. 462 | start_time = time.clock() 463 | 464 | # go through training epochs 465 | test_err_list = [] 466 | for epoch in xrange(training_epochs): 467 | 468 | # go through the training set 469 | mean_cost = [] 470 | mean_train_err = [] 471 | for batch_index in xrange(n_train_batches): 472 | [cost, train_err] = train_rbm(batch_index) 473 | mean_cost += [cost] 474 | mean_train_err += [train_err] 475 | 476 | # Test on test set. 
477 | [predict, test_err] = test_rbm() 478 | test_err_list += [test_err] 479 | print 'Training epoch %d, cost = %f, test err = %f' % (epoch, numpy.mean(mean_cost), test_err) 480 | 481 | end_time = time.clock() 482 | 483 | pretraining_time = (end_time - start_time) - plotting_time 484 | 485 | print ('Training took %f minutes' % (pretraining_time / 60.)) 486 | # end-snippet-5 start-snippet-6 487 | ################################# 488 | # Sampling from the RBM # 489 | ################################# 490 | # find out the number of test samples 491 | number_of_test_samples = test_set_x.get_value(borrow=True).shape[0] 492 | 493 | # pick random test examples, with which to initialize the persistent chain 494 | test_idx = rng.randint(number_of_test_samples - n_chains) 495 | persistent_vis_chain = theano.shared( 496 | numpy.asarray( 497 | test_set_x.get_value(borrow=True)[test_idx:test_idx + n_chains], 498 | dtype=theano.config.floatX 499 | ) 500 | ) 501 | # end-snippet-6 start-snippet-7 502 | plot_every = 1000 503 | # define one step of Gibbs sampling (mf = mean-field) define a 504 | # function that does `plot_every` steps before returning the 505 | # sample for plotting 506 | ( 507 | [ 508 | presig_hids, 509 | hid_mfs, 510 | hid_samples, 511 | presig_vis, 512 | vis_mfs, 513 | vis_samples 514 | ], 515 | updates 516 | ) = theano.scan( 517 | rbm.gibbs_vhv, 518 | outputs_info=[None, None, None, None, None, persistent_vis_chain], 519 | n_steps=plot_every 520 | ) 521 | 522 | # add to updates the shared variable that takes care of our persistent 523 | # chain :. 524 | updates.update({persistent_vis_chain: vis_samples[-1]}) 525 | # construct the function that implements our persistent chain. 526 | # we generate the "mean field" activations for plotting and the actual 527 | # samples for reinitializing the state of our persistent chain 528 | sample_fn = theano.function( 529 | [], 530 | [ 531 | vis_mfs[-1], 532 | vis_samples[-1] 533 | ], 534 | updates=updates, 535 | name='sample_fn' 536 | ) 537 | 538 | # create a space to store the image for plotting ( we need to leave 539 | # room for the tile_spacing as well) 540 | image_data = numpy.zeros( 541 | (29 * n_samples + 1, 29 * n_chains - 1), 542 | dtype='uint8' 543 | ) 544 | for idx in xrange(n_samples): 545 | # generate `plot_every` intermediate samples that we discard, 546 | # because successive samples in the chain are too correlated 547 | vis_mf, vis_sample = sample_fn() 548 | print ' ... plotting sample ', idx 549 | image_data[29 * idx:29 * idx + 28, :] = tile_raster_images( 550 | X=vis_mf, 551 | img_shape=(28, 28), 552 | tile_shape=(1, n_chains), 553 | tile_spacing=(1, 1) 554 | ) 555 | 556 | # construct image 557 | image = Image.fromarray(image_data) 558 | image.save('samples.png') 559 | # end-snippet-7 560 | os.chdir('../') 561 | 562 | if __name__ == '__main__': 563 | test_rbm(n_example=-1) 564 | #test_rbm(n_example=100) 565 | --------------------------------------------------------------------------------