├── layers
    ├── __init__.py
    ├── merge.py
    └── pool_special.py
├── components
    ├── __init__.py
    ├── shortcuts.py
    └── objectives.py
├── images
    ├── ssl-mnist-data.png
    ├── ssl-norb-data.png
    ├── ssl-svhn-data.png
    ├── ssl-mnist-sample.png
    ├── ssl-norb-sample.png
    └── ssl-svhn-sample.png
├── utils
    ├── __init__.py
    ├── create_ssl_data.py
    ├── others.py
    └── paramgraphics.py
├── cdgm-mnist-sl.sh
├── cdgm-mnist-ssl_1000.sh
├── cdgm-norb-ssl_1000.sh
├── cdgm-svhn-ssl_1000.sh
├── cdgm-mnist-ssl_100.sh
├── README.md
├── datasets_norb.py
├── datasets.py
├── cdgm_x2y_xy2z_zy2x_sl.py
└── cdgm_x2y_xy2z_zy2x.py


/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .pool_special import *
2 | from .merge import *


--------------------------------------------------------------------------------
/components/__init__.py:
--------------------------------------------------------------------------------
1 | from .shortcuts import *
2 | from .objectives import *


--------------------------------------------------------------------------------
/images/ssl-mnist-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/mmdcgm-ssl/HEAD/images/ssl-mnist-data.png


--------------------------------------------------------------------------------
/images/ssl-norb-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/mmdcgm-ssl/HEAD/images/ssl-norb-data.png


--------------------------------------------------------------------------------
/images/ssl-svhn-data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/mmdcgm-ssl/HEAD/images/ssl-svhn-data.png


--------------------------------------------------------------------------------
/images/ssl-mnist-sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/mmdcgm-ssl/HEAD/images/ssl-mnist-sample.png


--------------------------------------------------------------------------------
/images/ssl-norb-sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/mmdcgm-ssl/HEAD/images/ssl-norb-sample.png


--------------------------------------------------------------------------------
/images/ssl-svhn-sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thu-ml/mmdcgm-ssl/HEAD/images/ssl-svhn-sample.png


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .paramgraphics import *
2 | from .others import *
3 | from .create_ssl_data import *


--------------------------------------------------------------------------------
/cdgm-mnist-sl.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=$1 python cdgm_x2y_xy2z_zy2x_sl.py -name 'sl-mnist' -dataset mnist_real -flag evaluation -preprocess none -batch_norm_classifier true -top_mlp false -mlp_size 256 -nlayers_cla 5 -nk_cla 32,64,64,128,128  -str_cla 1,1,1,1,1 -ps_cla 2,1,2,1,1 -dk_cla 5,3,3,3,3 -pad_cla valid,same,valid,same,same -nonlin_cla rectify,rectify,rectify,rectify,rectify -dr_cla 0.5,0,0.5,0,0 -nz 100 -batch_norm_dgm false -nlayers_enc 5 -nk_enc 32,32,64,64,64 -dk_enc 5,3,3,3,3 -pad_enc valid,same,valid,same,same -str_enc 1,1,1,1,1 -ps_enc 2,1,2,1,1 -nonlin_enc rectify,rectify,rectify,rectify,rectify -dr_enc 0,0,0,0,0 -nlayers_dec 5 -nk_dec 64,64,32,32,1 -dk_dec 3,3,3,3,5 -pad_dec same,same,full,same,full -str_dec 1,1,1,1,1 -up_method none,none,unpool,none,unpool -ps_dec 1,1,2,1,2 -nonlin_dec rectify,rectify,rectify,rectify,sigmoid -dr_dec 0,0,0,0,0 -lr 3e-4  -nepochs 3000 -anneal_lr_epoch 1500 -anneal_lr_factor .995 -every_anneal 1 -delta 1.0 -batch_size 600 -alpha_decay 1e-4 -alpha .1


--------------------------------------------------------------------------------
/cdgm-mnist-ssl_1000.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=$1,lib.cnmem=0.2 python cdgm_x2y_xy2z_zy2x.py -name 'ssl-mnist-1000' -dataset mnist_real -flag evaluation -ssl_data_seed $2 -preprocess none -batch_norm_classifier true -top_mlp false -nlayers_cla 5 -nk_cla 32,64,64,128,10  -str_cla 1,1,1,1,1 -ps_cla 2,1,2,1,1 -dk_cla 5,3,3,3,1 -pad_cla valid,same,valid,same,same -nonlin_cla rectify,rectify,rectify,rectify,rectify -dr_cla 0.5,0,0.5,0,0 -nz 100 -batch_norm_dgm false -nlayers_enc 5 -nk_enc 32,32,64,64,64 -dk_enc 5,3,3,3,3 -pad_enc valid,same,valid,same,same -str_enc 1,1,1,1,1 -ps_enc 2,1,2,1,1 -nonlin_enc rectify,rectify,rectify,rectify,rectify -dr_enc 0,0,0,0,0 -nlayers_dec 5 -nk_dec 64,64,32,32,1 -dk_dec 3,3,3,3,5 -pad_dec same,same,full,same,full -str_dec 1,1,1,1,1 -up_method none,none,unpool,none,unpool -ps_dec 1,1,2,1,2 -nonlin_dec rectify,rectify,rectify,rectify,sigmoid -dr_dec 0,0,0,0,0 -lr 3e-4  -nepochs 3000 -anneal_lr_epoch 1500 -anneal_lr_factor .995 -delta 1.0 -num_labelled_per_batch 250 -num_labelled 1000 -batch_size 600 -alpha_decay 1e-4 -alpha_hinge 1. -alpha_hat .3 -alpha_reg 1e-3 -alpha .1 -alpha_straight_through 3e-4


--------------------------------------------------------------------------------
/cdgm-norb-ssl_1000.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=$1,lib.cnmem=.9 python cdgm_x2y_xy2z_zy2x.py -name 'ssl-norb-1000' -dataset norb -flag evaluation -ssl_data_seed $2 -preprocess none -batch_norm_classifier true -top_mlp false -nlayers_cla 6 -nk_cla 32,32,64,64,128,10  -str_cla 1,1,1,1,1,1 -ps_cla 1,2,1,2,1,1 -dk_cla 3,3,3,3,3,1 -pad_cla same,same,same,same,same,same -nonlin_cla rectify,rectify,rectify,rectify,rectify,rectify -dr_cla 0,0.2,0,0.2,0,0.2 -nz 100 -batch_norm_dgm false -nlayers_enc 5 -nk_enc 32,64,64,128,128 -dk_enc 5,3,3,3,3 -pad_enc same,same,same,same,same -str_enc 1,1,1,1,1 -ps_enc 2,1,2,1,2 -nonlin_enc rectify,rectify,rectify,rectify,rectify -dr_enc 0,0,0,0,0 -nlayers_dec 5 -nk_dec 128,64,64,32,1 -dk_dec 3,3,3,3,5 -pad_dec same,same,same,same,same -str_dec 1,1,1,1,1 -up_method unpool,none,unpool,none,unpool -ps_dec 2,1,2,1,2 -nonlin_dec rectify,rectify,rectify,rectify,sigmoid -dr_dec 0,0,0,0,0 -lr 3e-4 -nepochs 3000 -anneal_lr_epoch 2000 -anneal_lr_factor .995 -num_labelled_per_batch 1000 -num_labelled 1000 -batch_size 2000 -alpha_decay 1e-4 -alpha_hinge 1. -alpha_hat 0.3 -alpha_reg 1e-3 -alpha_straight_through 3e-5


--------------------------------------------------------------------------------
/cdgm-svhn-ssl_1000.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=$1,lib.cnmem=1 python cdgm_x2y_xy2z_zy2x.py -name 'ssl-svhn-1000' -dataset svhn -flag evaluation -ssl_data_seed $2 -preprocess none -batch_norm_classifier true -top_mlp false -mlp_size 256 -nlayers_cla 6 -nk_cla 32,32,64,64,128,128  -str_cla 1,1,1,1,1,1 -ps_cla 1,2,1,2,1,1 -dk_cla 3,3,3,3,3,3 -pad_cla same,same,same,same,same,same -nonlin_cla rectify,rectify,rectify,rectify,rectify,rectify -dr_cla 0,0.2,0,0.2,0,0.2 -nz 128 -batch_norm_dgm false -nlayers_enc 5 -nk_enc 32,64,64,128,128 -dk_enc 5,3,3,3,3 -pad_enc same,same,same,same,same -str_enc 1,1,1,1,1 -ps_enc 2,1,2,1,2 -nonlin_enc rectify,rectify,rectify,rectify,rectify -dr_enc 0,0,0,0,0 -nlayers_dec 5 -nk_dec 128,64,64,32,3 -dk_dec 3,3,3,3,5 -pad_dec same,same,same,same,same -str_dec 1,1,1,1,1 -up_method unpool,none,unpool,none,unpool -ps_dec 2,1,2,1,2 -nonlin_dec rectify,rectify,rectify,rectify,sigmoid -dr_dec 0,0,0,0,0 -lr 3e-4 -nepochs 500 -anneal_lr_epoch 250 -anneal_lr_factor .99 -num_labelled_per_batch 500 -num_labelled 1000 -batch_size 1000 -alpha_decay 1e-4 -alpha_hinge 1. -alpha_hat 0.3 -alpha_reg 1e-3 -alpha_straight_through 1e-4


--------------------------------------------------------------------------------
/cdgm-mnist-ssl_100.sh:
--------------------------------------------------------------------------------
1 | THEANO_FLAGS=device=$1,lib.cnmem=0.2 python cdgm_x2y_xy2z_zy2x.py -name 'ssl-mnist-100' -dataset mnist_real -flag evaluation -ssl_data_seed $2 -preprocess none -batch_norm_classifier true -top_mlp false -mlp_size 256 -nlayers_cla 5 -nk_cla 32,64,64,128,128  -str_cla 1,1,1,1,1 -ps_cla 2,1,2,1,1 -dk_cla 5,3,3,3,3 -pad_cla valid,same,valid,same,same -nonlin_cla rectify,rectify,rectify,rectify,rectify -dr_cla 0.5,0,0.5,0,0 -nz 100 -batch_norm_dgm false -nlayers_enc 5 -nk_enc 32,32,64,64,64 -dk_enc 5,3,3,3,3 -pad_enc valid,same,valid,same,same -str_enc 1,1,1,1,1 -ps_enc 2,1,2,1,1 -nonlin_enc rectify,rectify,rectify,rectify,rectify -dr_enc 0,0,0,0,0 -nlayers_dec 5 -nk_dec 64,64,32,32,1 -dk_dec 3,3,3,3,5 -pad_dec same,same,full,same,full -str_dec 1,1,1,1,1 -up_method none,none,unpool,none,unpool -ps_dec 1,1,2,1,2 -nonlin_dec rectify,rectify,rectify,rectify,sigmoid -dr_dec 0,0,0,0,0 -lr 3e-4  -nepochs 3000 -anneal_lr_epoch 1500 -anneal_lr_factor .995 -every_anneal 1 -delta 1.0 -num_labelled_per_batch 100 -num_labelled 100 -batch_size 600 -alpha_decay 1e-4 -alpha_hinge 1. -alpha_hat .3 -alpha_reg 1e-3 -alpha .1 -alpha_straight_through 3e-4


--------------------------------------------------------------------------------
/layers/merge.py:
--------------------------------------------------------------------------------
 1 | import lasagne
 2 | from lasagne import init
 3 | from lasagne import nonlinearities
 4 | 
 5 | import theano.tensor as T
 6 | import theano
 7 | import numpy as np
 8 | import theano.tensor.extra_ops as Textra
 9 | 
10 | __all__ = [
11 |     "ConvConcatLayer", #
12 |     "MLPConcatLayer", #
13 | ]
14 | 
15 | 
16 | class ConvConcatLayer(lasagne.layers.MergeLayer):
17 |     '''
18 |     concatenate a tensor and a vector on feature map axis 
19 |     '''
20 |     def __init__(self, incomings, num_cls, **kwargs):
21 |         super(ConvConcatLayer, self).__init__(incomings, **kwargs)
22 |         self.num_cls = num_cls
23 | 
24 |     def get_output_shape_for(self, input_shapes):
25 |         res = list(input_shapes[0])
26 |         res[1] += self.num_cls
27 |         return tuple(res)
28 | 
29 |     def get_output_for(self, input, **kwargs):
30 |         x, y = input
31 |         if y.ndim == 1:
32 |             y = T.extra_ops.to_one_hot(y, self.num_cls)
33 |         if y.ndim == 2:
34 |             y = y.dimshuffle(0, 1, 'x', 'x')
35 |         assert y.ndim == 4
36 |         return T.concatenate([x, y*T.ones((x.shape[0], y.shape[1], x.shape[2], x.shape[3]))], axis=1)
37 | 
38 | class MLPConcatLayer(lasagne.layers.MergeLayer):
39 |     '''
40 |     concatenate a matrix and a vector on feature axis 
41 |     '''
42 |     def __init__(self, incomings, num_cls, **kwargs):
43 |         super(MLPConcatLayer, self).__init__(incomings, **kwargs)
44 |         self.num_cls = num_cls
45 | 
46 |     def get_output_shape_for(self, input_shapes):
47 |         res = list(input_shapes[0])
48 |         res[1] += self.num_cls
49 |         return tuple(res)
50 | 
51 |     def get_output_for(self, input, **kwargs):
52 |         x, y = input
53 |         if y.ndim == 1:
54 |             y = T.extra_ops.to_one_hot(y, self.num_cls)
55 |         assert y.ndim == 2
56 |         return T.concatenate([x, y], axis=1)


--------------------------------------------------------------------------------
/utils/create_ssl_data.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Create semi-supervised datasets for different models
 3 | '''
 4 | import numpy as np
 5 | 
 6 | def create_ssl_data(x, y, n_classes, n_labelled, seed):
 7 |     # 'x': data matrix, nxk
 8 |     # 'y': label vector, n
 9 |     # 'n_classes': number of classes
10 |     # 'n_labelled': number of labelled data
11 |     # 'seed': random seed
12 | 
13 |     # check input
14 |     if n_labelled%n_classes != 0: 
15 |         print n_labelled
16 |         print n_classes
17 |         raise("n_labelled (wished number of labelled samples) not divisible by n_classes (number of classes)")
18 |     n_labels_per_class = n_labelled/n_classes
19 | 
20 |     rng = np.random.RandomState(seed)
21 |     index = rng.permutation(x.shape[0])
22 |     x = x[index]
23 |     y = y[index]
24 | 
25 |     # select first several data per class
26 |     data_labelled = [0]*n_classes
27 |     index_labelled = []
28 |     index_unlabelled = []
29 |     for i in xrange(x.shape[0]):
30 |         if data_labelled[y[i]] < n_labels_per_class:
31 |             data_labelled[y[i]] += 1
32 |             index_labelled.append(i)
33 |         else:
34 |             index_unlabelled.append(i)
35 |     
36 |     x_labelled = x[index_labelled]
37 |     y_labelled = y[index_labelled]
38 |     x_unlabelled = x[index_unlabelled]
39 |     y_unlabelled = y[index_unlabelled]
40 |     return x_labelled, y_labelled, x_unlabelled, y_unlabelled
41 | 
42 | 
43 | def create_ssl_data_subset(x, y, n_classes, n_labelled, n_labelled_per_time, seed):
44 |     assert n_labelled%n_labelled_per_time==0
45 |     times = n_labelled/n_labelled_per_time
46 |     x_labelled, y_labelled, x_unlabelled, y_unlabelled = create_ssl_data(x, y, n_classes, n_labelled_per_time, seed)
47 |     while (times > 1):
48 |         x_labelled_new, y_labelled_new, x_unlabelled, y_unlabelled = create_ssl_data(x_unlabelled, y_unlabelled, n_classes, n_labelled_per_time, seed)
49 |         x_labelled = np.vstack((x_labelled, x_labelled_new))
50 |         y_labelled = np.hstack((y_labelled, y_labelled_new))
51 |         times -= 1
52 |     return x_labelled, y_labelled, x_unlabelled, y_unlabelled


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Max-margin Deep Conditional Generative Models for Semi-Supervised Learning
 2 | ## [Chongxuan Li](https://github.com/zhenxuan00), Jun Zhu and Bo Zhang
 3 | 
 4 | Full [paper](https://arxiv.org/abs/1611.07119), a journal version of our NIPS15 paper (original [paper](https://arxiv.org/abs/1504.06787) and [code](https://github.com/zhenxuan00/mmdgm)). A novel class-condional variants of mmDGMs is proposed.
 5 | 
 6 | ## Summary of Max-margin Deep Conditional Generative Models (mmDCGMs)
 7 | 
 8 | - We boost the effectiveness and efficiency of DGMs in semi-supervised learning by
 9 |   - Employing advanced CNNs as the x2y, xy2z and zy2x networks
10 |   - Approximating the posterior inference of labels
11 |   - Proposing powerful max-margin discriminative losses for labeled and unlabeled data
12 | - and the arrived mmDCGMs can
13 |   - Perform efficient inference: constant time with respect to the number of classes
14 |   - Achieve state-of-the-art classification results on sevarl benchmarks: MNIST, SVHN and NORB with 1000 labels and MNIST with full labels
15 |   - Disentangle classes and styles on raw images without preprocessing like PCA given small amount of labels
16 | 
17 | ## Some libs we used in our experiments
18 | > Python
19 | > Numpy
20 | > Scipy
21 | > [Theano](https://github.com/Theano/Theano)
22 | > [Lasagne](https://github.com/Lasagne/Lasagne)
23 | > [Parmesan](https://github.com/casperkaae/parmesan)
24 | 
25 | ## State-of-the-art results on MNIST, SVHN and NORB datasets with 1000 labels and excellent results competitive to best CNNS given all labels on MNIST
26 | 
27 | > chmod +x *.sh
28 | 
29 | > ./cdgm-svhn-ssl_1000.sh gpu0 (Run .sh files to obtain corresponding results)
30 | 
31 | > For small norb dataset, please download the raw images in .MAT format from [http://www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/](http://www.cs.nyu.edu/~ylclab/data/norb-v1.0-small/) and run datasets_norb.convert_orig_to_np() to convert it into numpy format. 
32 | 
33 | > See Table 6 and Table 7 in the paper for the classfication results.
34 | 
35 | ## Class conditional generation of raw images given a few labels
36 | 
37 | ### Results on MNIST given 100 labels (left: 100 labeled data sorted by class, right: samples, where each row shares same class and each column shares same style.)
38 | <img src="https://github.com/thu-ml/mmdcgm-ssl/blob/master/images/ssl-mnist-data.png" width="320">  <img src="https://github.com/thu-ml/mmdcgm-ssl/blob/master/images/ssl-mnist-sample.png" width="320">
39 | 
40 | 
41 | ### Results on SVHN given 1000 labels
42 | <img src="https://github.com/thu-ml/mmdcgm-ssl/blob/master/images/ssl-svhn-data.png" width="320">  <img src="https://github.com/thu-ml/mmdcgm-ssl/blob/master/images/ssl-svhn-sample.png" width="320">
43 | 
44 | ### Results on small NORB given 1000 labels
45 | <img src="https://github.com/thu-ml/mmdcgm-ssl/blob/master/images/ssl-norb-data.png" width="240">  <img src="https://github.com/thu-ml/mmdcgm-ssl/blob/master/images/ssl-norb-sample.png" width="240">
46 | 


--------------------------------------------------------------------------------
/utils/others.py:
--------------------------------------------------------------------------------
 1 | import shutil, gzip, os, cPickle, time, math, operator, argparse
 2 | 
 3 | import numpy as np
 4 | import theano.tensor as T
 5 | import theano, lasagne
 6 | 
 7 | 
 8 | def get_pad(pad):
 9 |     if pad not in ['same', 'valid', 'full']:
10 |         pad = tuple(map(int, pad.split('-')))
11 |     return pad
12 | 
13 | def get_pad_list(pad_list):
14 |     re_list = []
15 |     for p in pad_list:
16 |         re_list.append(get_pad(p))
17 |     return re_list
18 | 
19 | # nonlinearities
20 | def get_nonlin(nonlin):
21 |     if nonlin == 'rectify':
22 |         return lasagne.nonlinearities.rectify
23 |     elif nonlin == 'leaky_rectify':
24 |         return lasagne.nonlinearities.LeakyRectify(0.1)
25 |     elif nonlin == 'tanh':
26 |         return lasagne.nonlinearities.tanh
27 |     elif nonlin == 'sigmoid':
28 |         return lasagne.nonlinearities.sigmoid
29 |     elif nonlin == 'maxout':
30 |         return 'maxout'
31 |     elif nonlin == 'none':
32 |         return lasagne.nonlinearities.identity
33 |     else:
34 |         raise ValueError('invalid non-linearity \'' + nonlin + '\'')
35 | def get_nonlin_list(nonlin_list):
36 |     re_list = []
37 |     for n in nonlin_list:
38 |         re_list.append(get_nonlin(n))
39 |     return re_list
40 | 
41 | def bernoullisample(x):
42 |     return np.random.binomial(1,x,size=x.shape).astype(theano.config.floatX)
43 | 
44 | def build_log_file(args, filename_script, extra=None):
45 |     res_out = args.outfolder
46 |     res_out += '_'
47 |     res_out += args.name
48 |     res_out += '_'
49 |     if extra is not None:
50 |         res_out += extra
51 |         res_out += '_' 
52 |     res_out += str(int(time.time()))
53 |     if not os.path.exists(res_out):
54 |         os.makedirs(res_out)
55 | 
56 |     # write commandline parameters to header of logfile
57 |     args_dict = vars(args)
58 |     sorted_args = sorted(args_dict.items(), key=operator.itemgetter(0))
59 |     description = []
60 |     description.append('######################################################')
61 |     description.append('# --Commandline Params--')
62 |     for name, val in sorted_args:
63 |         description.append("# " + name + ":\t" + str(val))
64 |     description.append('######################################################')
65 |     
66 |     logfile = os.path.join(res_out, 'logfile.log')
67 |     model_out = os.path.join(res_out, 'model')
68 |     with open(logfile,'w') as f:
69 |         for l in description:
70 |             f.write(l + '\n')
71 |     return logfile, res_out
72 | 
73 | def array2file_2D(array,logfile):
74 |     assert len(array.shape) == 2, array.shape
75 |     with open(logfile,'a') as f:
76 |        for i in xrange(array.shape[0]):
77 |         for j in xrange(array.shape[1]):
78 |             f.write(str(array[i][j])+' ')
79 |         f.write('\n')
80 | 
81 | def printarray_2D(array, precise=2):
82 |     assert len(array.shape) == 2, array.shape
83 |     format = '%.'+str(precise)+'f'
84 |     for i in xrange(array.shape[0]):
85 |         for j in xrange(array.shape[1]):
86 |             print format %array[i][j],
87 |         print


--------------------------------------------------------------------------------
/components/shortcuts.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | shortcuts for compsited layers
 3 | '''
 4 | import numpy as np
 5 | import theano.tensor as T
 6 | import theano
 7 | import lasagne
 8 | 
 9 | from parmesan.distributions import log_stdnormal, log_normal2, log_bernoulli
10 | 
11 | import sys
12 | sys.path.append("..")
13 | from layers.pool_special import UnPoolLayer, UnPoolMaskLayer, MaxPoolLocationLayer, RepeatUnPoolLayer
14 | from layers.merge import ConvConcatLayer, MLPConcatLayer
15 | 
16 | # convolutional layer
17 | # following optional batch normalization, pooling and dropout
18 | def convlayer(l,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name,output_mask=False,batch_size_act=0,W=lasagne.init.GlorotUniform(),b=lasagne.init.Constant(0.)):
19 |     mask = None
20 |     l = lasagne.layers.Conv2DLayer(l, num_filters=n_kerns, filter_size=(d_kerns,d_kerns), stride=stride, pad=pad, name="Conv-"+name, W=W, b=b, nonlinearity=nonlinearity)
21 |     if bn:
22 |         l = lasagne.layers.batch_norm(l, name="BN-"+name)
23 |     if ps > 1:
24 |         if output_mask:
25 |             mask = MaxPoolLocationLayer(l,factor=(ps,ps),batch_size=batch_size_act)
26 |         l = lasagne.layers.MaxPool2DLayer(l, pool_size=(ps,ps), name="Pool"+name)
27 |     if dr > 0:
28 |         l = lasagne.layers.DropoutLayer(l, p=dr, name="Drop-"+name)
29 |     return l, mask
30 | 
31 | # unpooling and convolutional layer
32 | # following optional batch normalization and dropout
33 | def unpoolconvlayer(l,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name,type_='unpool',mask=None,W=lasagne.init.GlorotUniform(),b=lasagne.init.Constant(0.), noise_level=0):
34 |     if ps > 1:
35 |         if type_ == 'unpool':
36 |             l = UnPoolLayer(incoming=l, factor=(ps,ps), name="UP-"+name)
37 |         elif type_ == 'repeat':
38 |             l = RepeatUnPoolLayer(incoming=l, factor=(ps,ps), name="UP_REP-"+name)
39 |         elif type_ == 'unpoolmask':
40 |             l = UnPoolMaskLayer(incoming=l, mask=mask, factor=(ps,ps), name="UP_MUSK-"+name, noise_level=noise_level)
41 |     l = lasagne.layers.Conv2DLayer(l, num_filters=n_kerns, filter_size=(d_kerns,d_kerns), stride=stride, pad=pad, name="Conv-"+name, W=W, b=b, nonlinearity=nonlinearity)
42 |     if bn:
43 |         l = lasagne.layers.batch_norm(l, name="BN-"+name)
44 |     if dr > 0:
45 |         l = lasagne.layers.DropoutLayer(l, p=dr, name="Drop-"+name)
46 |     return l
47 | 
48 | # fractional strided convolutional layer
49 | # following optional batch normalization and dropout
50 | def fractionalstridedlayer(l,bn,dr,n_kerns,d_kerns,nonlinearity,pad,stride,name,W=lasagne.init.GlorotUniform(),b=lasagne.init.Constant(0.)):
51 |     # print bn,dr,n_kerns,d_kerns,nonlinearity,pad,stride,name
52 |     l = lasagne.layers.TransposedConv2DLayer(l, num_filters=n_kerns, filter_size=(d_kerns,d_kerns), stride=stride, crop=pad, name="FS_Conv-"+name, W=W, b=b, nonlinearity=nonlinearity)
53 |     if bn:
54 |         l = lasagne.layers.batch_norm(l, name="BN-"+name)
55 |     if dr > 0:
56 |         l = lasagne.layers.DropoutLayer(l, p=dr, name="Drop-"+name)
57 |     return l
58 | 
59 | # mlp layer
60 | # following optional batch normalization and dropout
61 | def mlplayer(l,bn,dr,num_units,nonlinearity,name):
62 |     l = lasagne.layers.DenseLayer(l,num_units=num_units,nonlinearity=nonlinearity,name="MLP-"+name)
63 |     if bn:
64 |         l = lasagne.layers.batch_norm(l, name="BN-"+name)
65 |     if dr > 0:
66 |         l = lasagne.layers.DropoutLayer(l, p=dr, name="Drop-"+name)
67 |     return l
68 | 


--------------------------------------------------------------------------------
/datasets_norb.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.io
  3 | import os
  4 | import gzip
  5 | import cPickle
  6 | 
  7 | # refer to kingma's code nips14-ssl: https://github.com/dpkingma/nips14-ssl 
  8 | path = '/home/chongxuan/mfs/data/small_norb/np/'
  9 | 
 10 | def load_numpy_dat(size=48):
 11 |     with gzip.open(path+'train_dat_'+str(size)+'.pkl.gz', 'rb') as f:
 12 |         train_dat = cPickle.load(f)
 13 |     with gzip.open(path+'test_dat_'+str(size)+'.pkl.gz', 'rb') as f:
 14 |         test_dat = cPickle.load(f)
 15 |     return train_dat, test_dat
 16 | 
 17 | def load_numpy_cat():
 18 |     with gzip.open(path+'train_cat.pkl.gz', 'rb') as f:
 19 |         train_cat = cPickle.load(f)
 20 |     with gzip.open(path+'test_cat.pkl.gz', 'rb') as f:
 21 |         test_cat = cPickle.load(f)
 22 |     return train_cat, test_cat
 23 | 
 24 | def load_numpy_info():
 25 |     with gzip.open(path+'train_info.pkl.gz', 'rb') as f:
 26 |         train_info = cPickle.load(f)
 27 |     with gzip.open(path+'test_info.pkl.gz', 'rb') as f:
 28 |         test_info = cPickle.load(f)
 29 |     return train_info, test_info
 30 | 
 31 | # Load dataset with 50 subclasses, merged to single matrices
 32 | def load_numpy_subclasses(size=48, normalize=False, centered=False, convert_to_five=True):
 33 |     train_dat, test_dat = load_numpy_dat(size)
 34 |     train_info, test_info = load_numpy_info()
 35 |     train_cat, test_cat = load_numpy_cat()
 36 |     
 37 |     n = train_dat.shape[0]
 38 |     
 39 |     n_class = 5 #number of classes
 40 |     n_ipc = 10 #number of instances per class
 41 |     
 42 |     train_x_left = train_dat[:,0].reshape((n, -1)).T
 43 |     train_x_right = train_dat[:,1].reshape((n, -1)).T
 44 |     train_y = (train_cat[:]*n_ipc + train_info[:,0]).reshape(1, n) # computes which of the 50 subclasses
 45 |     
 46 |     test_x_left = test_dat[:,0].reshape((n, -1)).T
 47 |     test_x_right = test_dat[:,1].reshape((n, -1)).T
 48 |     test_y = (test_cat[:]*n_ipc + test_info[:,0]).reshape(1, n)
 49 |     
 50 |     x = np.hstack((train_x_left, train_x_right, test_x_left, test_x_right)) # computes which of the 50 subclasses
 51 |     y = np.hstack((train_y, train_y, test_y, test_y))
 52 |     
 53 |     if convert_to_five:
 54 |         y = y/10
 55 |     if normalize:
 56 |         x = x/256.0
 57 |     if centered:
 58 |         x = x - x.mean(axis=0,keepdims=True)
 59 |     return x, y
 60 |     
 61 | # Original data to numpy-format data
 62 | def convert_orig_to_np():
 63 |     from pylearn2.datasets.filetensor import read
 64 |     import gzip
 65 |     import cPickle
 66 |     # Load data
 67 |     path_orig = './data/small_norb/mat/'
 68 |     prefix_train = path_orig+'smallnorb-5x46789x9x18x6x2x96x96-training-'
 69 |     train_cat = read(gzip.open(prefix_train+'cat.mat.gz'))
 70 |     train_dat = read(gzip.open(prefix_train+'dat.mat.gz'))
 71 |     train_info = read(gzip.open(prefix_train+'info.mat.gz'))
 72 |     prefix_test = path_orig+'smallnorb-5x01235x9x18x6x2x96x96-testing-'
 73 |     test_cat = read(gzip.open(prefix_test+'cat.mat.gz'))
 74 |     test_dat = read(gzip.open(prefix_test+'dat.mat.gz'))
 75 |     test_info = read(gzip.open(prefix_test+'info.mat.gz'))
 76 |     
 77 |     # Save originals matrices to file
 78 |     files = (('train_cat', train_cat), ('train_dat_96', train_dat), ('train_info', train_info), ('test_cat', test_cat), ('test_dat_96', test_dat), ('test_info', test_info))
 79 |     for fname, tensor in files:
 80 |         print 'Saving to ', fname, '...'
 81 |         with gzip.open(path+fname+'.pkl.gz','wb') as f:
 82 |             cPickle.dump(tensor, f)
 83 |     
 84 |     # Save downscaled version too
 85 |     w = 48
 86 |     files = (('train_dat', train_dat),('test_dat', test_dat))
 87 |     for fname, tensor in files:
 88 |         print 'Generating downscaled version ' + fname + '...'
 89 |         left = reshape_images(tensor[:,0,:,:], (w,w))
 90 |         right = reshape_images(tensor[:,1,:,:], (w,w)) 
 91 |         result = np.zeros((tensor.shape[0], 2, w,w), dtype=np.uint8)
 92 |         result[:,0,:,:] = left
 93 |         result[:,1,:,:] = right
 94 |         f = gzip.open(path+fname+'_'+str(w)+'.pkl.gz', 'wb')
 95 |         cPickle.dump(result, f)
 96 |         f.close()
 97 | 
 98 |     w = 32
 99 |     files = (('train_dat', train_dat),('test_dat', test_dat))
100 |     for fname, tensor in files:
101 |         print 'Generating downscaled version ' + fname + '...'
102 |         left = reshape_images(tensor[:,0,:,:], (w,w))
103 |         right = reshape_images(tensor[:,1,:,:], (w,w)) 
104 |         result = np.zeros((tensor.shape[0], 2, w, w), dtype=np.uint8)
105 |         result[:,0,:,:] = left
106 |         result[:,1,:,:] = right
107 |         f = gzip.open(path+fname+'_'+str(w)+'.pkl.gz', 'wb')
108 |         cPickle.dump(result, f)
109 |         f.close()
110 |         
111 | # Reshape digits
112 | def reshape_images(x, shape):
113 |     def rebin(_a, shape):
114 |         sh = shape[0],_a.shape[0]//shape[0],shape[1],_a.shape[1]//shape[1]
115 |         result = _a.reshape(sh).mean(-1).mean(1)
116 |         return np.floor(result).astype(np.uint8)
117 |     nrows = x.shape[0]
118 |     result = np.zeros((nrows, shape[0], shape[1]), dtype=np.uint8)
119 |     for i in range(nrows):
120 |         result[i,:,:] = rebin(x[i,:,:], shape)
121 |     return result
122 | 
123 | # Converts integer labels to binarized labels (1-of-K coding)
124 | def binarize_labels(y, n_classes=5):
125 |     new_y = np.zeros((n_classes, y.shape[0]))
126 |     for i in range(y.shape[0]):
127 |         new_y[y[i], i] = 1
128 |     return new_y
129 |     


--------------------------------------------------------------------------------
/utils/paramgraphics.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | from scipy.misc import imsave
  4 | 
  5 | def scale_max_min(images, max_p, min_p):
  6 |     # scale the images according to the max and min
  7 |     # images f x n, column major
  8 |     ret = np.zeros(images.shape)
  9 |     for i in xrange(images.shape[1]):
 10 |         # clips at first
 11 |         tmp = np.clip(images[:,i], min_p[i], max_p[i])
 12 |         # scale
 13 |         ret[:,i] = (tmp - min_p[i]) / (max_p[i] - min_p[i])
 14 | 
 15 |     return ret
 16 | 
 17 | def scale_to_unit_interval(ndar, eps=1e-8):
 18 |     """ Scales all values in the ndarray ndar to be between 0 and 1 """
 19 |     ndar = ndar.copy()
 20 |     ndar -= ndar.min()
 21 |     ndar *= 1.0 / (ndar.max() + eps)
 22 |     return ndar
 23 | 
 24 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
 25 |                          scale=True,
 26 |                          output_pixel_vals=True,
 27 |                          colorImg=False):
 28 |     """
 29 |     Transform an array with one flattened image per row, into an array in
 30 |     which images are reshaped and layed out like tiles on a floor.
 31 | 
 32 |     This function is useful for visualizing datasets whose rows are images,
 33 |     and also columns of matrices for transforming those rows
 34 |     (such as the first layer of a neural net).
 35 | 
 36 |     :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
 37 |     be 2-D ndarrays or None;
 38 |     :param X: a 2-D array in which every row is a flattened image.
 39 | 
 40 |     :type img_shape: tuple; (height, width)
 41 |     :param img_shape: the original shape of each image
 42 | 
 43 |     :type tile_shape: tuple; (rows, cols)
 44 |     :param tile_shape: the number of images to tile (rows, cols)
 45 | 
 46 |     :param output_pixel_vals: if output should be pixel values (i.e. int8
 47 |     values) or floats
 48 | 
 49 |     :param scale_rows_to_unit_interval: if the values need to be scaled before
 50 |     being plotted to [0,1] or not
 51 | 
 52 | 
 53 |     :returns: array suitable for viewing as an image.
 54 |     """
 55 |     X = X * 1.0 # converts ints to floats
 56 |     
 57 |     if colorImg:
 58 |         channelSize = X.shape[1]/3
 59 |         X = (X[:,0:channelSize], X[:,channelSize:2*channelSize], X[:,2*channelSize:3*channelSize], None)
 60 |     
 61 |     assert len(img_shape) == 2
 62 |     assert len(tile_shape) == 2
 63 |     assert len(tile_spacing) == 2
 64 | 
 65 |     # The expression below can be re-written in a more C style as
 66 |     # follows :
 67 |     #
 68 |     # out_shape = [0,0]
 69 |     # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
 70 |     #                tile_spacing[0]
 71 |     # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
 72 |     #                tile_spacing[1]
 73 |     out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
 74 |                         in zip(img_shape, tile_shape, tile_spacing)]
 75 | 
 76 |     if isinstance(X, tuple):
 77 |         assert len(X) == 4
 78 |         # Create an output np ndarray to store the image
 79 |         if output_pixel_vals:
 80 |             out_array = np.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
 81 |         else:
 82 |             out_array = np.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
 83 | 
 84 |         #colors default to 0, alpha defaults to 1 (opaque)
 85 |         if output_pixel_vals:
 86 |             channel_defaults = [0, 0, 0, 255]
 87 |         else:
 88 |             channel_defaults = [0., 0., 0., 1.]
 89 | 
 90 |         
 91 |         for i in xrange(4):
 92 |             if X[i] is None:
 93 |                 # if channel is None, fill it with zeros of the correct
 94 |                 # dtype
 95 |                 out_array[:, :, i] = np.zeros(out_shape,
 96 |                         dtype='uint8' if output_pixel_vals else out_array.dtype
 97 |                         ) + channel_defaults[i]
 98 |             else:
 99 |                 # use a recurrent call to compute the channel and store it
100 |                 # in the output
101 |                 xi = X[i]
102 |                 if scale:
103 |                     xi = (X[i] - X[i].min()) / (X[i].max() - X[i].min())
104 |                 out_array[:, :, i] = tile_raster_images(xi, img_shape, tile_shape, tile_spacing, False, output_pixel_vals)
105 |         
106 |     
107 |         return out_array
108 | 
109 |     else:
110 |         # if we are dealing with only one channel
111 |         H, W = img_shape
112 |         Hs, Ws = tile_spacing
113 | 
114 |         # generate a matrix to store the output
115 |         out_array = np.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
116 | 
117 | 
118 |         for tile_row in xrange(tile_shape[0]):
119 |             for tile_col in xrange(tile_shape[1]):
120 |                 if tile_row * tile_shape[1] + tile_col < X.shape[0]:
121 |                     if scale:
122 |                         # if we should scale values to be between 0 and 1
123 |                         # do this by calling the `scale_to_unit_interval`
124 |                         # function
125 |                         tmp = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
126 |                         this_img = scale_to_unit_interval(tmp)
127 |                     else:
128 |                         this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
129 |                     # add the slice to the corresponding position in the
130 |                     # output array
131 |                     out_array[
132 |                         tile_row * (H+Hs): tile_row * (H + Hs) + H,
133 |                         tile_col * (W+Ws): tile_col * (W + Ws) + W
134 |                         ] \
135 |                         = this_img * (255 if output_pixel_vals else 1)
136 |         return out_array
137 | 
138 | # Matrix to image
139 | def mat_to_img(w, dim_input, scale=False, colorImg=False, tile_spacing=(1,1), tile_shape=0, save_path=None):
140 |     if tile_shape == 0:
141 |         rowscols = int(w.shape[1]**0.5)
142 |         tile_shape = (rowscols,rowscols)
143 |     imgs = tile_raster_images(X=w.T, img_shape=dim_input, tile_shape=tile_shape, tile_spacing=tile_spacing, scale=scale, colorImg=colorImg)
144 |     if save_path is not None:
145 |         imsave(save_path, imgs)
146 |     return imgs


--------------------------------------------------------------------------------
/layers/pool_special.py:
--------------------------------------------------------------------------------
  1 | import lasagne
  2 | from lasagne import init
  3 | from lasagne import nonlinearities
  4 | 
  5 | import theano.tensor as T
  6 | import theano
  7 | import numpy as np
  8 | import theano.tensor.extra_ops as Textra
  9 | 
 10 | 
 11 | __all__ = [
 12 |     "UnPoolLayer", # upsampling by setting input to the top-left corner
 13 |     "RepeatUnPoolLayer", # upsampling by repeating input
 14 |     "UnPoolMaskLayer", # upsampling with pooling location
 15 |     "MaxPoolLocationLayer", # get the location of max pooling
 16 | ]
 17 | 
 18 | class UnPoolLayer(lasagne.layers.Layer):
 19 |     '''
 20 |     Layer that upsampling the input
 21 |     
 22 |     Parameters
 23 |     ----------
 24 |     incoming: class `Layer` instance
 25 |         dim of incoming: B,C,0,1
 26 | 
 27 |     factor : tuple of length 2
 28 |         upsample factor
 29 |     ----------
 30 |     '''
 31 |     def __init__(self, incoming, factor, **kwargs):
 32 |         super(UnPoolLayer, self).__init__(incoming, **kwargs)
 33 |         assert len(factor) == 2
 34 |         assert len(self.input_shape) == 4
 35 |         self.factor = factor
 36 |         window = np.zeros(self.factor, dtype=np.float32)
 37 |         window[0, 0] = 1
 38 |         image_shape = self.input_shape[1:]
 39 |         self.mask = theano.shared(np.tile(window.reshape((1,)+self.factor), image_shape))
 40 |         self.mask = T.shape_padleft(self.mask,n_ones=1)
 41 | 
 42 |     def get_output_shape_for(self, input_shape):
 43 |         return input_shape[:2] + (input_shape[2]*self.factor[0], input_shape[3]*self.factor[1])
 44 | 
 45 |     def get_output_for(self, input, **kwargs):
 46 |         return Textra.repeat(Textra.repeat(input,self.factor[0],axis=2),self.factor[1],axis=3)*self.mask
 47 | 
 48 | class RepeatUnPoolLayer(lasagne.layers.Layer):
 49 |     '''
 50 |     Layer that upsampling the input
 51 |         one unit in the input corresponds a square of units in the output
 52 |         all values in the region are same as the corresponding value of input
 53 |     
 54 |     Parameters
 55 |     ----------
 56 |     incoming: class `Layer` instance
 57 |         dim of incoming: B,C,0,1
 58 | 
 59 |     factor : tuple of length 2
 60 |         upsample factor
 61 |     ----------
 62 |     '''
 63 |     def __init__(self, incoming, factor, **kwargs):
 64 |         super(RepeatUnPoolLayer, self).__init__(incoming, **kwargs)
 65 |         assert len(factor) == 2
 66 |         assert len(self.input_shape) == 4
 67 |         self.factor = factor
 68 | 
 69 |     def get_output_shape_for(self, input_shape):
 70 |         return input_shape[:2] + (input_shape[2]*self.factor[0], input_shape[3]*self.factor[1])
 71 | 
 72 |     def get_output_for(self, input, **kwargs):
 73 |         return Textra.repeat(Textra.repeat(input,self.factor[0],axis=2),self.factor[1],axis=3)
 74 | 
 75 | class UnPoolMaskLayer(lasagne.layers.MergeLayer):
 76 |     '''
 77 |     Layer that upsampling the input given the pooling location
 78 |     
 79 |     Parameters
 80 |     ----------
 81 |     incoming, mask : class `Layer` instances
 82 |         dim of incoming: B,C,0,1
 83 |         dim of mask: B,C,0*f1,1*f2
 84 | 
 85 |     factor : tuple of length 2
 86 |         upsample factor
 87 |     ----------
 88 |     '''
 89 |     def __init__(self, incoming, mask, factor, noise_level=0.7, **kwargs):
 90 |         super(UnPoolMaskLayer, self).__init__([incoming, mask], **kwargs)
 91 |         assert len(factor) == 2
 92 |         assert len(self.input_shapes[0]) == 4
 93 |         assert len(self.input_shapes[1]) == 4
 94 |         assert self.input_shapes[0][2]*factor[0] == self.input_shapes[1][2]
 95 |         assert self.input_shapes[0][3]*factor[1] == self.input_shapes[1][3]
 96 |         assert noise_level>=0 and noise_level<=1
 97 |         self.factor = factor
 98 |         self.noise = noise_level
 99 | 
100 |     def get_output_shape_for(self, input_shapes):
101 |         return input_shapes[1]
102 | 
103 |     def get_output_for(self, input, **kwargs):
104 |         data, mask_max = input
105 |         #return Textra.repeat(Textra.repeat(data, self.factor[0], axis=2), self.factor[1], axis=3) * mask_max
106 |         window = np.zeros(self.factor, dtype=np.float32)
107 |         window[0, 0] = 1
108 |         mask_unpool = np.tile(window.reshape((1,) + self.factor), self.input_shapes[0][1:])
109 |         mask_unpool = T.shape_padleft(mask_unpool, n_ones=1)
110 | 
111 |         rs = np.random.RandomState(1234)
112 |         rng = theano.tensor.shared_randomstreams.RandomStreams(rs.randint(999999))
113 |         mask_binomial = rng.binomial(n=1, p=self.noise, size= self.input_shapes[1][1:])
114 |         mask_binomial = T.shape_padleft(T.cast(mask_binomial, dtype='float32'), n_ones=1)
115 | 
116 |         mask =  mask_binomial * mask_unpool + (1 - mask_binomial) * mask_max
117 |         return Textra.repeat(Textra.repeat(data,self.factor[0],axis=2),self.factor[1],axis=3)*mask
118 | 
119 | class MaxPoolLocationLayer(lasagne.layers.Layer):
120 |     '''
121 |     Layer that computes the max-pool location
122 | 
123 |     Parameters
124 |     ----------
125 |     incoming : a class `Layer` instance
126 |         output shape is 4D
127 | 
128 |     factor : tuple of length 2
129 |         downsample, fixed to (2, 2) so far
130 | 
131 |     batch_size : tensor iscalar
132 | 
133 |     References
134 |     ----------
135 |     '''
136 |     def __init__(self, incoming, factor, batch_size, noise_level=0.5, **kwargs):
137 |         super(MaxPoolLocationLayer, self).__init__(incoming, **kwargs)
138 |         assert factor[0] == 2, factor # only for special (2,2) case
139 |         assert factor[1] == 2, factor
140 |         self.factor = factor
141 |         self.batch_size = batch_size
142 |         self.n_channels = self.input_shape[1]
143 |         self.i_s = self.input_shape[-2:]
144 |         self.noise = noise_level
145 | 
146 |     def get_output_shape_for(self, input_shape):
147 |         return input_shape
148 | 
149 |     def _get_output_for(self, input):
150 |         assert input.ndim == 3 # only for 3D
151 |         mask = T.zeros_like(input) # size (None, w, h)
152 |         tmp = T.concatenate([T.shape_padright(input[:, ::2, ::2]), 
153 |             T.shape_padright(input[:, ::2, 1::2]), T.shape_padright(input[:, 1::2, ::2]), 
154 |             T.shape_padright(input[:, 1::2, 1::2])], axis=-1)
155 |         index =  tmp.argmax(axis=-1) # size (None, w/2, h/2)
156 |         i_r = 2*(np.tile(np.arange(self.i_s[0]/2), (self.i_s[1]/2,1))).T
157 |         i_r = index/2 + T.shape_padleft(i_r)
158 |         i_c = 2*(np.tile(np.arange(self.i_s[1]/2), (self.i_s[0]/2,1)))
159 |         i_c = index%2 + T.shape_padleft(i_c)
160 |         i_b = T.tile(T.arange(self.batch_size*self.n_channels),(self.i_s[0]/2*self.i_s[1]/2,1)).T
161 |         mask = T.set_subtensor(mask[i_b.flatten(), i_r.flatten(), i_c.flatten()],1)
162 |         return mask
163 | 
164 |     def get_output_for(self, input, **kwargs):
165 |         assert input.ndim == 4 # only for 4D
166 |         input_3D = input.reshape((self.batch_size*self.n_channels,)+self.i_s)
167 |         mask_max = self._get_output_for(input_3D)
168 |         return mask_max.reshape((self.batch_size,self.n_channels)+self.i_s)
169 | 
170 | 


--------------------------------------------------------------------------------
/datasets.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pickle as pkl
  3 | import cPickle as cPkl
  4 | import gzip
  5 | import tarfile
  6 | import fnmatch
  7 | import os
  8 | import urllib
  9 | from scipy.io import loadmat
 10 | 
 11 | def _unpickle(f):
 12 |     import cPickle
 13 |     fo = open(f, 'rb')
 14 |     d = cPickle.load(fo)
 15 |     fo.close()
 16 |     return d
 17 | 
 18 | def _get_datafolder_path():
 19 |     #full_path = os.path.abspath('.')
 20 |     #path = full_path +'/data'
 21 |     path = '/home/chongxuan/mfs/data'
 22 |     return path
 23 | 
 24 | def _download_svhn(datasets_dir=_get_datafolder_path()+'/svhn/'):    
 25 |     url = 'http://ufldl.stanford.edu/housenumbers/'
 26 |     data_file_list = ['train_32x32.mat', 'test_32x32.mat', 'extra_32x32.mat']
 27 | 
 28 |     if not os.path.exists(datasets_dir):
 29 |         os.makedirs(datasets_dir)
 30 | 
 31 |     for data_file in data_file_list:
 32 |         if not os.path.isfile(os.path.join(datasets_dir,data_file)):
 33 |             urllib.urlretrieve(os.path.join(url,data_file), data_file)
 34 |     
 35 |     batch1_data = []
 36 |     batch1_labels = []
 37 |     batch2_data = []
 38 |     batch2_labels = []
 39 |     from random import shuffle
 40 | 
 41 |     train = loadmat(os.path.join(datasets_dir,data_file_list[0]))
 42 |     x = train['X'].transpose((2, 0, 1, 3)).reshape((3072, -1))
 43 |     y = train['y'].reshape((-1,))
 44 |     for i in np.arange(len(y)):
 45 |         if y[i] == 10:
 46 |             y[i] = 0
 47 |     index = np.arange(len(y))
 48 |     shuffle(index)
 49 |     x = x[:, index]
 50 |     y = y[index]
 51 | 
 52 |     count = np.zeros((10,), 'int32')
 53 |     for i in np.arange(len(y)):
 54 |         if count[y[i]] < 400:
 55 |             count[y[i]] += 1
 56 |             batch2_data.append(x[:, i])
 57 |             batch2_labels.append(y[i])
 58 |         else:
 59 |             batch1_data.append(x[:, i])
 60 |             batch1_labels.append(y[i])
 61 | 
 62 |     print '---train'
 63 |     extra = loadmat(os.path.join(datasets_dir,data_file_list[2]))
 64 |     x = extra['X'].transpose((2, 0, 1, 3)).reshape((3072, -1))
 65 |     y = extra['y'].reshape((-1,))
 66 |     del extra
 67 |     for i in np.arange(len(y)):
 68 |         if y[i] == 10:
 69 |             y[i] = 0
 70 |     index = np.arange(len(y))
 71 |     shuffle(index)
 72 |     x = x[:, index]
 73 |     y = y[index]
 74 | 
 75 |     count = np.zeros((10,), 'int32')
 76 |     for i in np.arange(len(y)):
 77 |         if count[y[i]] < 200:
 78 |             count[y[i]] += 1
 79 |             batch2_data.append(x[:, i])
 80 |             batch2_labels.append(y[i])
 81 |         else:
 82 |             batch1_data.append(x[:, i])
 83 |             batch1_labels.append(y[i])
 84 |     batch1_data = np.asarray(batch1_data)
 85 |     batch2_data = np.asarray(batch2_data)
 86 |     batch1_labels = np.asarray(batch1_labels)
 87 |     batch2_labels = np.asarray(batch2_labels)
 88 |     del x, y
 89 | 
 90 |     print '---extra'
 91 | 
 92 |     test = loadmat(os.path.join(datasets_dir,data_file_list[1]))
 93 |     x = test['X'].transpose((2, 0, 1, 3)).reshape((3072, -1))
 94 |     y = test['y'].reshape((-1,))
 95 |     for i in np.arange(len(y)):
 96 |         if y[i] == 10:
 97 |             y[i] = 0
 98 |     batch3_data = x
 99 |     batch3_labels = []
100 |     for i in np.arange(len(y)):
101 |         batch3_labels.append(y[i])
102 |     batch3_data = np.asarray(batch3_data).T
103 |     batch3_labels = np.asarray(batch3_labels)
104 | 
105 |     print 'Check n x f'
106 |     print batch1_data.shape
107 |     print batch1_labels.shape
108 |     print batch2_data.shape
109 |     print batch2_labels.shape
110 |     print batch3_data.shape
111 |     print batch3_labels.shape
112 | 
113 |     f = file(datasets_dir+"/svhn.bin","wb")
114 |     np.save(f,batch1_data)
115 |     np.save(f,batch1_labels)
116 |     np.save(f,batch2_data)
117 |     np.save(f,batch2_labels)
118 |     np.save(f,batch3_data)
119 |     np.save(f,batch3_labels)
120 |     f.close()
121 | 
122 | def load_svhn(datasets_dir=_get_datafolder_path()+'/svhn/', normalized=True, centered=True):
123 |     data_file = os.path.join(datasets_dir, 'svhn.bin')
124 | 
125 |     if not os.path.exists(datasets_dir):
126 |         os.makedirs(datasets_dir)
127 | 
128 |     if not os.path.isfile(data_file):
129 |         _download_svhn()
130 |     
131 |     f = file(data_file,"rb")
132 |     train_x = np.load(f)
133 |     train_y = np.load(f)
134 |     valid_x = np.load(f)
135 |     valid_y = np.load(f)
136 |     test_x = np.load(f)
137 |     test_y = np.load(f)
138 |     f.close()
139 |     if normalized:
140 |         train_x = train_x/256.0
141 |         valid_x = valid_x/256.0
142 |         test_x = test_x/256.0
143 | 
144 |     avg = None
145 |     if centered:
146 |         avg = train_x.mean(axis=0,keepdims=True)
147 |         train_x = train_x - avg
148 |         test_x = test_x - avg
149 |         valid_x = valid_x - avg
150 |     
151 |     return train_x, train_y, valid_x, valid_y, test_x, test_y, avg
152 | 
153 | def load_cifar10(datasets_dir=_get_datafolder_path()+'/cifar10', num_val=None, normalized=True, centered=True):
154 |     # this code is largely cp from Kyle Kastner:
155 |     #
156 |     # https://gist.github.com/kastnerkyle/f3f67424adda343fef40
157 |     
158 |     url = 'http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
159 |     data_file = os.path.join(datasets_dir, 'cifar-10-python.tar.gz')
160 |     data_dir = os.path.join(datasets_dir, 'cifar-10-batches-py')
161 | 
162 |     if not os.path.exists(datasets_dir):
163 |         os.makedirs(datasets_dir)
164 | 
165 |     if not os.path.isfile(data_file):
166 |         urllib.urlretrieve(url, data_file)
167 |         org_dir = os.getcwd()
168 |         with tarfile.open(data_file) as tar:
169 |             os.chdir(datasets_dir)
170 |             tar.extractall()
171 |         os.chdir(org_dir)
172 | 
173 |     train_files = []
174 |     for filepath in fnmatch.filter(os.listdir(data_dir), 'data*'):
175 |         train_files.append(os.path.join(data_dir, filepath))
176 |     train_files = sorted(train_files, key=lambda x: x.split("_")[-1])
177 | 
178 |     test_file = os.path.join(data_dir, 'test_batch')
179 | 
180 |     x_train, targets_train = [], []
181 |     for f in train_files:
182 |         d = _unpickle(f)
183 |         x_train.append(d['data'])
184 |         targets_train.append(d['labels'])
185 |     x_train = np.array(x_train, dtype='uint8')
186 |     shp = x_train.shape
187 |     x_train = x_train.reshape(shp[0] * shp[1], 3, 32, 32)
188 |     targets_train = np.array(targets_train)
189 |     targets_train = targets_train.ravel()
190 | 
191 |     d = _unpickle(test_file)
192 |     x_test = d['data']
193 |     targets_test = d['labels']
194 |     x_test = np.array(x_test, dtype='uint8')
195 |     x_test = x_test.reshape(-1, 3, 32, 32)
196 |     targets_test = np.array(targets_test)
197 |     targets_test = targets_test.ravel()
198 |     
199 |     if normalized:
200 |         x_train = x_train/256.0
201 |         x_test = x_test/256.0
202 |     if centered:
203 |         avg = x_train.mean(axis=0,keepdims=True)
204 |         x_train = x_train - avg
205 |         x_test = x_test - avg
206 | 
207 |     if num_val is not None:
208 |         perm = np.random.permutation(x_train.shape[0])
209 |         x = x_train[perm]
210 |         y = targets_train[perm]
211 | 
212 |         x_valid = x[:num_val]
213 |         targets_valid = y[:num_val]
214 |         x_train = x[num_val:]
215 |         targets_train = y[num_val:]
216 |         return (x_train, targets_train,
217 |                 x_valid, targets_valid,
218 |                 x_test, targets_test)
219 |     else:
220 |         return x_train, targets_train, x_test, targets_test


--------------------------------------------------------------------------------
/components/objectives.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | objectives
  3 | '''
  4 | import numpy as np
  5 | import theano.tensor as T
  6 | import theano
  7 | import lasagne
  8 | 
  9 | from parmesan.distributions import log_stdnormal, log_normal2, log_bernoulli
 10 | 
 11 | def margin_for_reinforce(predictions, num_labelled, delta=1):
 12 |     num_cls = predictions.shape[1]
 13 |     predictions=predictions[num_labelled:] # predictions U x nc
 14 |     p_max = T.max(predictions, axis=1)
 15 |     p_mean = T.mean(predictions, axis=1)
 16 |     margin = (p_max - p_mean) / (num_cls - 1) * num_cls
 17 |     return margin
 18 | 
 19 | def margin_for_reinforce1(predictions, num_labelled, delta=1):
 20 |     num_cls = predictions.shape[1]
 21 |     predictions=predictions[num_labelled:] # predictions U x nc
 22 |     p_max = T.max(predictions, axis=1)
 23 |     p_mean = T.mean(predictions, axis=1)
 24 |     margin = (p_max - p_mean) / (num_cls - 1) * num_cls
 25 |     return margin
 26 | 
 27 | def lowerbound_for_reinforce(z, z_mu, z_log_var, x_mu, x, num_features, num_labelled, num_classes, epsilon=1e-6):
 28 |     x = x.reshape((-1,num_features))
 29 |     x_mu = x_mu.reshape((-1,num_features))
 30 | 
 31 |     log_qz_given_xy = log_normal2(z, z_mu, z_log_var).sum(axis=1)
 32 |     log_pz = log_stdnormal(z).sum(axis=1)
 33 |     log_py = T.log(1.0/num_classes)
 34 |     log_px_given_zy = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=1)
 35 |     ll_xy = log_px_given_zy + log_pz + log_py - log_qz_given_xy
 36 |     return ll_xy[num_labelled:]
 37 | 
 38 | def multiclass_s3vm_loss(predictions, targets, num_labelled, weight_decay, norm_type=2, form ='mean_class', alpha_hinge=1., alpha_hat=1., alpha_reg=1., alpha_decay=1., delta=1., entropy_term=False):
 39 |     '''
 40 |     predictions: 
 41 |         size L x nc
 42 |              U x nc
 43 |     targets: 
 44 |         size L x nc
 45 | 
 46 |     output:
 47 |         weighted sum of hinge loss, hat loss, balance constraint and weight decay
 48 |     '''
 49 |     num_cls = predictions.shape[1]
 50 |     if targets.ndim == predictions.ndim - 1:
 51 |         targets = theano.tensor.extra_ops.to_one_hot(targets, num_cls)
 52 |     elif targets.ndim != predictions.ndim:
 53 |         raise TypeError('rank mismatch between targets and predictions')
 54 | 
 55 |     hinge_loss = multiclass_hinge_loss_(predictions[:num_labelled], targets, delta)
 56 |     hat_loss = multiclass_hat_loss(predictions[num_labelled:], delta)
 57 |     regularization = balance_constraint(predictions, targets, num_labelled, norm_type, form)
 58 |     if not entropy_term:
 59 |         return alpha_hinge*hinge_loss.mean() + alpha_hat*hat_loss.mean() + alpha_reg*regularization + alpha_decay*weight_decay
 60 |     else:
 61 |         # given an unlabeled data, when treat hat loss as the entropy term derived from a lowerbound, it should conflict to current prediction, which is quite strange but true ... the entropy term enforce the discriminator to predict unlabeled data uniformly as a regularization
 62 |         # max entropy regularization provides a tighter lowerbound but hurt the semi-supervised learning performance as it conflicts to the hat loss ...
 63 |         return alpha_hinge*hinge_loss.mean() - alpha_hat*hat_loss.mean() + alpha_reg*regularization + alpha_decay*weight_decay
 64 | 
 65 | def multiclass_hinge_loss_(predictions, targets, delta=1):
 66 |     return lasagne.objectives.multiclass_hinge_loss(predictions, targets, delta)
 67 | 
 68 | def multiclass_hinge_loss(predictions, targets, weight_decay, alpha_decay=1., delta=1):
 69 |     return multiclass_hinge_loss_(predictions, targets, delta).mean() + alpha_decay*weight_decay
 70 | 
 71 | def multiclass_hat_loss(predictions, delta=1):
 72 |     targets = T.argmax(predictions, axis=1)
 73 |     return multiclass_hinge_loss(predictions, targets, delta)
 74 | 
 75 | def balance_constraint(predictions, targets, num_labelled, norm_type=2, form='mean_class'):
 76 |     '''
 77 |     balance constraint
 78 |     ------
 79 |     norm_type: type of norm 
 80 |             l2 or l1
 81 |     form: form of regularization
 82 |             mean_class: average mean activation of u and l data should be the same over each class
 83 |             mean_all: average mean activation of u and l data should be the same over all data
 84 |             ratio: 
 85 | 
 86 |     '''
 87 |     p_l = predictions[:num_labelled]
 88 |     p_u = predictions[num_labelled:]
 89 |     t_l = targets
 90 |     t_u = T.argmax(p_u, axis=1)
 91 |     num_cls = predictions.shape[1]
 92 |     t_u = theano.tensor.extra_ops.to_one_hot(t_u, num_cls)
 93 |     if form == 'mean_class':
 94 |         res = (p_l*t_l).mean(axis=0) - (p_u*t_u).mean(axis=0)
 95 |     elif form == 'mean_all':
 96 |         res = p_l.mean(axis=0) - p_u.mean(axis=0)
 97 |     elif form == 'ratio':
 98 |         pass
 99 | 
100 |     # res should be a vector with length number_class
101 |     return res.norm(norm_type)
102 | 
103 | def latent_gaussian_x_gaussian(z, z_mu, z_log_var, x_mu, x_log_var, x, latent_size, num_features, eq_samples, iw_samples, epsilon=1e-6):
104 |     # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
105 |     z = z.reshape((-1, eq_samples, iw_samples, latent_size))
106 |     x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
107 |     x_log_var = x_log_var.reshape((-1, eq_samples, iw_samples, num_features))
108 | 
109 |     # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
110 |     x = x.reshape((-1,num_features))
111 |     x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
112 |     z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
113 |     z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)
114 | 
115 |     # calculate LL components, note that the log_xyz() functions return log prob. for indepenedent components separately 
116 |     # so we sum over feature/latent dimensions for multivariate pdfs
117 |     log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
118 |     log_pz = log_stdnormal(z).sum(axis=3)
119 |     #log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)
120 |     log_px_given_z = log_normal2(x, x_mu, x_log_var).sum(axis=3)
121 | 
122 |     #all log_*** should have dimension (batch_size, eq_samples, iw_samples)
123 |     # Calculate the LL using log-sum-exp to avoid underflow
124 |     a = log_pz + log_px_given_z - log_qz_given_x    # size: (batch_size, eq_samples, iw_samples)
125 |     a_max = T.max(a, axis=2, keepdims=True)         # size: (batch_size, eq_samples, 1)
126 | 
127 |     LL = T.mean(a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2) ) )
128 | 
129 |     return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
130 | 
131 | def latent_gaussian_x_bernoulli(z, z_mu, z_log_var, x_mu, x, latent_size, num_features, eq_samples, iw_samples, epsilon=1e-6):
132 |     """
133 |     Latent z       : gaussian with standard normal prior
134 |     decoder output : bernoulli
135 | 
136 |     When the output is bernoulli then the output from the decoder
137 |     should be sigmoid. The sizes of the inputs are
138 |     z: (batch_size*eq_samples*iw_samples, num_latent)
139 |     z_mu: (batch_size, num_latent)
140 |     z_log_var: (batch_size, num_latent)
141 |     x_mu: (batch_size*eq_samples*iw_samples, num_features)
142 |     x: (batch_size, num_features)
143 | 
144 |     Reference: Burda et al. 2015 "Importance Weighted Autoencoders"
145 |     """
146 | 
147 |     # reshape the variables so batch_size, eq_samples and iw_samples are separate dimensions
148 |     z = z.reshape((-1, eq_samples, iw_samples, latent_size))
149 |     x_mu = x_mu.reshape((-1, eq_samples, iw_samples, num_features))
150 | 
151 |     # dimshuffle x, z_mu and z_log_var since we need to broadcast them when calculating the pdfs
152 |     x = x.reshape((-1,num_features))
153 |     x = x.dimshuffle(0, 'x', 'x', 1)                    # size: (batch_size, eq_samples, iw_samples, num_features)
154 |     z_mu = z_mu.dimshuffle(0, 'x', 'x', 1)              # size: (batch_size, eq_samples, iw_samples, num_latent)
155 |     z_log_var = z_log_var.dimshuffle(0, 'x', 'x', 1)    # size: (batch_size, eq_samples, iw_samples, num_latent)
156 | 
157 |     # calculate LL components, note that the log_xyz() functions return log prob. for indepenedent components separately 
158 |     # so we sum over feature/latent dimensions for multivariate pdfs
159 |     log_qz_given_x = log_normal2(z, z_mu, z_log_var).sum(axis=3)
160 |     log_pz = log_stdnormal(z).sum(axis=3)
161 |     log_px_given_z = log_bernoulli(x, T.clip(x_mu, epsilon, 1 - epsilon)).sum(axis=3)
162 | 
163 |     #all log_*** should have dimension (batch_size, eq_samples, iw_samples)
164 |     # Calculate the LL using log-sum-exp to avoid underflow
165 |     a = log_pz + log_px_given_z - log_qz_given_x    # size: (batch_size, eq_samples, iw_samples)
166 |     a_max = T.max(a, axis=2, keepdims=True)         # size: (batch_size, eq_samples, 1)
167 | 
168 |     # LL is calculated using Eq (8) in Burda et al.
169 |     # Working from inside out of the calculation below:
170 |     # T.exp(a-a_max): (batch_size, eq_samples, iw_samples)
171 |     # -> subtract a_max to avoid overflow. a_max is specific for  each set of
172 |     # importance samples and is broadcasted over the last dimension.
173 |     #
174 |     # T.log( T.mean(T.exp(a-a_max), axis=2) ): (batch_size, eq_samples)
175 |     # -> This is the log of the sum over the importance weighted samples
176 |     #
177 |     # The outer T.mean() computes the mean over eq_samples and batch_size
178 |     #
179 |     # Lastly we add T.mean(a_max) to correct for the log-sum-exp trick
180 |     LL = T.mean(a_max) + T.mean( T.log( T.mean(T.exp(a-a_max), axis=2) ) )
181 | 
182 |     return LL, T.mean(log_qz_given_x), T.mean(log_pz), T.mean(log_px_given_z)
183 | 


--------------------------------------------------------------------------------
/cdgm_x2y_xy2z_zy2x_sl.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This code implements max-margin conditional deep generative model which incorporates the side information in generative modelling and uses a discriminative classifier to infer the latent labels
  3 | for supervised learning
  4 | '''
  5 | 
  6 | import gzip, os, cPickle, time, math, argparse, shutil, sys
  7 | 
  8 | import numpy as np
  9 | import theano.tensor as T
 10 | import theano
 11 | import lasagne
 12 | from parmesan.datasets import load_mnist_realval, load_mnist_binarized, load_frey_faces
 13 | from datasets import load_cifar10, load_svhn
 14 | from parmesan.layers import SampleLayer
 15 | 
 16 | from layers.merge import ConvConcatLayer, MLPConcatLayer
 17 | from utils.others import get_nonlin_list, get_pad_list, bernoullisample, build_log_file, printarray_2D, array2file_2D
 18 | from components.shortcuts import convlayer, fractionalstridedlayer, unpoolconvlayer, mlplayer
 19 | from components.objectives import latent_gaussian_x_gaussian, latent_gaussian_x_bernoulli
 20 | from components.objectives import multiclass_s3vm_loss, multiclass_hinge_loss
 21 | import  utils.paramgraphics as paramgraphics
 22 | 
 23 | '''
 24 | parameters
 25 | '''
 26 | # global
 27 | theano.config.floatX = 'float32'
 28 | filename_script = os.path.basename(os.path.realpath(__file__))
 29 | parser = argparse.ArgumentParser()
 30 | parser.add_argument("-dataset", type=str, default="mnist_real")
 31 | parser.add_argument("-outfolder", type=str, default=os.path.join("results-ssl", os.path.splitext(filename_script)[0]))
 32 | parser.add_argument("-preprocess", type=str, default="none")
 33 | parser.add_argument("-subset_flag", type=str, default ='false')
 34 | # architecture
 35 | parser.add_argument("-nz", type=int, default=100)
 36 | parser.add_argument("-batch_norm_dgm", type=str, default='false')
 37 | parser.add_argument("-top_mlp", type=str, default='false')
 38 | parser.add_argument("-mlp_size", type=int, default=256)
 39 | parser.add_argument("-batch_norm_classifier", type=str, default='false')
 40 | # classifier
 41 | parser.add_argument("-batch_size", type=int, default=200)
 42 | parser.add_argument("-delta", type=float, default=1.)
 43 | parser.add_argument("-alpha_decay", type=float, default=1e-4)
 44 | parser.add_argument("-alpha", type=float, default=.1)
 45 | parser.add_argument("-norm_type", type=int, default=2)
 46 | parser.add_argument("-form", type=str, default='mean_class')
 47 | # feature extractor
 48 | parser.add_argument("-nlayers_cla", type=int, default=3)
 49 | parser.add_argument("-nk_cla", type=str, default='32,64,128')
 50 | parser.add_argument("-dk_cla", type=str, default='4,5,3')
 51 | parser.add_argument("-pad_cla", type=str, default='valid,valid,valid')
 52 | parser.add_argument("-str_cla", type=str, default='2,2,2')
 53 | parser.add_argument("-ps_cla", type=str, default='1,1,1')
 54 | parser.add_argument("-nonlin_cla", type=str, default='rectify,rectify,rectify')
 55 | parser.add_argument("-dr_cla", type=str, default='0,0,0')
 56 | # encoder
 57 | parser.add_argument("-nlayers_enc", type=int, default=3)
 58 | parser.add_argument("-nk_enc", type=str, default='32,64,128')
 59 | parser.add_argument("-dk_enc", type=str, default='4,5,3')
 60 | parser.add_argument("-pad_enc", type=str, default='valid,valid,valid')
 61 | parser.add_argument("-str_enc", type=str, default='2,2,2')
 62 | parser.add_argument("-ps_enc", type=str, default='1,1,1')
 63 | parser.add_argument("-nonlin_enc", type=str, default='rectify,rectify,rectify')
 64 | parser.add_argument("-dr_enc", type=str, default='0,0,0')
 65 | # decoder
 66 | parser.add_argument("-nlayers_dec", type=int, default=4)
 67 | parser.add_argument("-nk_dec", type=str, default='128,64,32,1')
 68 | parser.add_argument("-dk_dec", type=str, default='3,5,4,5')
 69 | parser.add_argument("-pad_dec", type=str, default='valid,valid,valid,same')
 70 | parser.add_argument("-str_dec", type=str, default='2,2,2,1')
 71 | parser.add_argument("-up_method", type=str, default='frac_strided,frac_strided,frac_strided,none')
 72 | parser.add_argument("-ps_dec", type=str, default='1,1,1,1')
 73 | parser.add_argument("-nonlin_dec", type=str, default='rectify,rectify,rectify,sigmoid')
 74 | parser.add_argument("-dr_dec", type=str, default='0,0,0,0')
 75 | # optimization
 76 | parser.add_argument("-flag", type=str, default='validation') # validation for anneal learning rate
 77 | parser.add_argument("-lr", type=float, default=0.0003)
 78 | parser.add_argument("-nepochs", type=int, default=200)
 79 | parser.add_argument("-anneal_lr_epoch", type=int, default=100)
 80 | parser.add_argument("-anneal_lr_factor", type=float, default=.99)
 81 | parser.add_argument("-every_anneal", type=int, default=1)
 82 | clip_grad = 1
 83 | max_norm = 5
 84 | # name
 85 | parser.add_argument("-name", type=str, default='')
 86 | # inference
 87 | parser.add_argument("-eq_samples", type=int,
 88 |         help="number of samples for the expectation over q(z|x)", default=1)
 89 | parser.add_argument("-iw_samples", type=int,
 90 |         help="number of importance weighted samples", default=1)
 91 | 
 92 | # random seeds for reproducibility
 93 | np.random.seed(1234)
 94 | from theano.tensor.shared_randomstreams import RandomStreams
 95 | srng = RandomStreams(seed=1234)
 96 | 
 97 | # get parameters
 98 | # global
 99 | args = parser.parse_args()
100 | dataset = args.dataset
101 | subset_flag = args.subset_flag == 'true' or args.subset_flag == 'True'
102 | eval_epoch = 1
103 | # architecture
104 | nz = args.nz
105 | bn_dgm = args.batch_norm_dgm == 'true' or args.batch_norm_dgm == 'True'
106 | top_mlp = args.top_mlp == 'true' or args.top_mlp == 'True'
107 | mlp_size = args.mlp_size
108 | bn_cla = args.batch_norm_classifier == 'true' or args.batch_norm_classifier == 'True'
109 | # classifier
110 | batch_size = args.batch_size
111 | delta = args.delta
112 | alpha_decay = args.alpha_decay
113 | alpha = args.alpha
114 | norm_type = args.norm_type
115 | form = args.form
116 | # feature extractor
117 | nlayers_cla = args.nlayers_cla
118 | nk_cla = map(int, args.nk_cla.split(','))
119 | dk_cla = map(int, args.dk_cla.split(','))
120 | pad_cla = map(str, args.pad_cla.split(','))
121 | str_cla = map(int, args.str_cla.split(','))
122 | ps_cla = map(int, args.ps_cla.split(','))
123 | dr_cla = map(float, args.dr_cla.split(','))
124 | nonlin_cla = get_nonlin_list(map(str, args.nonlin_cla.split(',')))
125 | # encoder
126 | nlayers_enc = args.nlayers_enc
127 | nk_enc = map(int, args.nk_enc.split(','))
128 | dk_enc = map(int, args.dk_enc.split(','))
129 | pad_enc = get_pad_list(map(str, args.pad_enc.split(',')))
130 | str_enc = map(int, args.str_enc.split(','))
131 | ps_enc = map(int, args.ps_enc.split(','))
132 | dr_enc = map(float, args.dr_enc.split(','))
133 | nonlin_enc = get_nonlin_list(map(str, args.nonlin_enc.split(',')))
134 | # decoder
135 | nlayers_dec = args.nlayers_dec
136 | nk_dec = map(int, args.nk_dec.split(','))
137 | dk_dec = map(int, args.dk_dec.split(','))
138 | pad_dec = get_pad_list(map(str, args.pad_dec.split(',')))
139 | str_dec = map(int, args.str_dec.split(','))
140 | ps_dec = map(int, args.ps_dec.split(','))
141 | dr_dec = map(float, args.dr_dec.split(','))
142 | nonlin_dec = get_nonlin_list(map(str, args.nonlin_dec.split(',')))
143 | up_method = map(str, args.up_method.split(','))
144 | # optimization
145 | flag = args.flag
146 | lr = args.lr
147 | num_epochs = args.nepochs
148 | anneal_lr_epoch = args.anneal_lr_epoch
149 | anneal_lr_factor = args.anneal_lr_factor
150 | every_anneal = args.every_anneal
151 | # inference
152 | iw_samples = args.iw_samples
153 | eq_samples = args.eq_samples
154 | # log file
155 | logfile, res_out = build_log_file(args, filename_script)
156 | shutil.copy(os.path.realpath(__file__), os.path.join(res_out, filename_script))
157 | 
158 | '''
159 | datasets
160 | '''
161 | if dataset == 'mnist_real':
162 |     colorImg = False
163 |     dim_input = (28,28)
164 |     in_channels = 1
165 |     num_classes = 10
166 |     generation_scale = False
167 |     num_generation = 100
168 |     vis_epoch = 100
169 |     distribution = 'bernoulli'
170 |     num_features = in_channels*dim_input[0]*dim_input[1]
171 |     print "Using real-valued mnist dataset"
172 |     train_x, train_t, valid_x, valid_t, test_x, test_t = load_mnist_realval()
173 |     if flag == 'validation':
174 |         test_x = valid_x
175 |         test_t = valid_t
176 |     else:
177 |         train_x = np.concatenate([train_x,valid_x])
178 |         train_t = np.hstack((train_t, valid_t))
179 |     train_x_size = train_t.shape[0]
180 |     train_t = np.int32(train_t)
181 |     test_t = np.int32(test_t)
182 |     train_x = train_x.astype(theano.config.floatX)
183 |     test_x = test_x.astype(theano.config.floatX)
184 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
185 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
186 |         
187 | elif dataset == 'cifar10':
188 |     colorImg = True
189 |     dim_input = (32,32)
190 |     in_channels = 3
191 |     num_classes = 10
192 |     generation_scale = False
193 |     num_generation = 100
194 |     vis_epoch = 100
195 |     distribution = 'bernoulli'
196 |     num_features = in_channels*dim_input[0]*dim_input[1]
197 |     print "Using cifar10 dataset"
198 |     train_x, train_t, valid_x, valid_t, test_x, test_t = load_cifar10(num_val=5000, normalized=True, centered=True)
199 |     if flag == 'validation':
200 |         test_x = valid_x
201 |         test_t = valid_t
202 |     else:
203 |         train_x = np.concatenate([train_x,valid_x])
204 |         train_t = np.hstack((train_t, valid_t))
205 |     train_x_size = train_t.shape[0]
206 |     train_t = np.int32(train_t)
207 |     test_t = np.int32(test_t)
208 |     train_x = train_x.astype(theano.config.floatX)
209 |     test_x = test_x.astype(theano.config.floatX)
210 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
211 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
212 | 
213 | elif dataset == 'svhn':
214 |     colorImg = True
215 |     dim_input = (32,32)
216 |     in_channels = 3
217 |     num_classes = 10
218 |     generation_scale = False
219 |     num_generation = 100
220 |     vis_epoch = 10
221 |     distribution = 'bernoulli'
222 |     num_features = in_channels*dim_input[0]*dim_input[1]
223 |     print "Using svhn dataset"
224 |     train_x, train_t, valid_x, valid_t, test_x, test_t = load_svhn(normalized=True, centered=False)
225 |     if flag == 'validation':
226 |         test_x = valid_x
227 |         test_t = valid_t
228 |     else:
229 |         train_x = np.concatenate([train_x,valid_x])
230 |         train_t = np.hstack((train_t, valid_t))
231 |     train_x_size = train_t.shape[0]
232 |     train_t = np.int32(train_t)
233 |     test_t = np.int32(test_t)
234 |     train_x = train_x.astype(theano.config.floatX)
235 |     test_x = test_x.astype(theano.config.floatX)
236 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
237 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
238 | 
239 | # preprocess
240 | 
241 | preprocesses_dataset = lambda dataset: dataset
242 | sh_x_train = theano.shared(preprocesses_dataset(train_x), borrow=True)
243 | sh_t_train = theano.shared(train_t, borrow=True)
244 | sh_x_test = theano.shared(preprocesses_dataset(test_x), borrow=True)
245 | sh_t_test = theano.shared(test_t, borrow=True)
246 | 
247 | '''
248 | building block
249 | '''
250 | # shortcuts
251 | encodelayer = convlayer
252 | 
253 | # decoder layer
254 | def decodelayer(l,up_method,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name):
255 |     # upsampling
256 |     if up_method == 'unpool':
257 |         h_g = unpoolconvlayer(l,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name,'unpool',None)
258 |     elif up_method == 'frac_strided':
259 |         h_g = fractionalstridedlayer(l,bn,dr,n_kerns,d_kerns,nonlinearity,pad,stride,name)
260 |     elif up_method == 'none':
261 |         h_g, _ = convlayer(l,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name)
262 |     else:
263 |         raise Exception('Unknown upsampling method')
264 |     return h_g
265 | 
266 | 
267 | '''
268 | model
269 | '''
270 | # symbolic variables
271 | sym_iw_samples = T.iscalar('iw_samples')
272 | sym_eq_samples = T.iscalar('eq_samples')
273 | sym_lr = T.scalar('lr')
274 | sym_x = T.tensor4('x')
275 | sym_x_cla = T.tensor4('x_cla')
276 | sym_y = T.ivector('y')
277 | sym_index = T.iscalar('index')
278 | sym_batch_size = T.iscalar('batch_size')
279 | batch_slice = slice(sym_index * sym_batch_size, (sym_index + 1) * sym_batch_size)
280 | 
281 | # x2y
282 | l_in_x_cla = lasagne.layers.InputLayer((None, in_channels)+dim_input)
283 | l_cla = [l_in_x_cla,]
284 | print lasagne.layers.get_output_shape(l_cla[-1])
285 | # conv layers
286 | for i in xrange(nlayers_cla):
287 |     l, _= convlayer(l_cla[-1],bn_cla,dr_cla[i],ps_cla[i],nk_cla[i],dk_cla[i],nonlin_cla[i],pad_cla[i],str_cla[i],'CLA-'+str(i+1))
288 |     l_cla.append(l)
289 |     print lasagne.layers.get_output_shape(l_cla[-1])
290 | 
291 | # feature and classifier
292 | if top_mlp:
293 |     l_cla.append(lasagne.layers.FlattenLayer(l_cla[-1]))
294 |     feature = mlplayer(l_cla[-1],bn_cla,0.5,mlp_size,lasagne.nonlinearities.rectify,name='MLP-CLA')
295 | else:
296 |     feature = lasagne.layers.GlobalPoolLayer(l_cla[-1])
297 | classifier = lasagne.layers.DenseLayer(feature, num_units=num_classes, nonlinearity=lasagne.nonlinearities.identity, W=lasagne.init.Normal(1e-2, 0), name="classifier")
298 | 
299 | # encoder xy2z
300 | l_in_x = lasagne.layers.InputLayer((None, in_channels)+dim_input)
301 | l_in_y = lasagne.layers.InputLayer((None,))
302 | l_enc = [l_in_x,]
303 | for i in xrange(nlayers_enc):
304 |     l_enc.append(ConvConcatLayer([l_enc[-1], l_in_y], num_classes))
305 |     l, _ = encodelayer(l_enc[-1],bn_dgm,dr_enc[i],ps_enc[i],nk_enc[i],dk_enc[i],nonlin_enc[i],pad_enc[i],str_enc[i],'ENC-'+str(i+1),False,0)
306 |     l_enc.append(l)
307 |     print lasagne.layers.get_output_shape(l_enc[-1])
308 | 
309 | # reshape
310 | after_conv_shape = lasagne.layers.get_output_shape(l_enc[-1])
311 | after_conv_size = int(np.prod(after_conv_shape[1:]))
312 | l_enc.append(lasagne.layers.FlattenLayer(l_enc[-1]))
313 | print lasagne.layers.get_output_shape(l_enc[-1])
314 | 
315 | # compute parameters and sample z
316 | l_mu = mlplayer(l_enc[-1],False,0,nz,lasagne.nonlinearities.identity,'ENC-MU')
317 | l_log_var = mlplayer(l_enc[-1],False,0,nz,lasagne.nonlinearities.identity,'ENC-LOG_VAR')
318 | l_z = SampleLayer(mean=l_mu,log_var=l_log_var,eq_samples=sym_eq_samples,iw_samples=sym_iw_samples)
319 | 
320 | # decoder zy2x
321 | l_dec = [l_z,]
322 | print lasagne.layers.get_output_shape(l_dec[-1])
323 | 
324 | # reshape
325 | l_dec.append(mlplayer(l_dec[-1],bn_dgm,0,after_conv_size,lasagne.nonlinearities.rectify, 'DEC_l_Z'))
326 | print lasagne.layers.get_output_shape(l_dec[-1])
327 | l_dec.append(lasagne.layers.ReshapeLayer(l_dec[-1], shape=(-1,)+after_conv_shape[1:]))
328 | print lasagne.layers.get_output_shape(l_dec[-1])
329 | for i in (xrange(nlayers_dec-1)):
330 |     l_dec.append(ConvConcatLayer([l_dec[-1], l_in_y], num_classes))
331 |     l = decodelayer(l_dec[-1],up_method[i],False,dr_dec[i],ps_dec[i],nk_dec[i],dk_dec[i],nonlin_dec[i],pad_dec[i],str_dec[i],'DEC-'+str(i+1))
332 |     l_dec.append(l)
333 |     print lasagne.layers.get_output_shape(l_dec[-1])
334 | 
335 | # mu and var
336 | if distribution == 'gaussian':
337 |     l_dec_x_mu = decodelayer(l_dec[-1],up_method[-1],False,dr_dec[-1],ps_dec[-1],nk_dec[-1],dk_dec[-1],lasagne.nonlinearities.identity,pad_dec[-1],str_dec[-1],'DEC-MU')
338 |     l_dec_x_log_var = decodelayer(l_dec[-1],up_method[-1],False,dr_dec[-1],ps_dec[-1],nk_dec[-1],dk_dec[-1],lasagne.nonlinearities.identity,pad_dec[-1],str_dec[-1],'DEC-LOG_VAR')
339 | elif distribution == 'bernoulli':
340 |     l_dec_x_mu = decodelayer(l_dec[-1],up_method[-1],False,dr_dec[-1],ps_dec[-1],nk_dec[-1],dk_dec[-1],lasagne.nonlinearities.sigmoid,pad_dec[-1],str_dec[-1],'DEC-MU')
341 | print lasagne.layers.get_output_shape(l_dec_x_mu)
342 | 
343 | # predictions and accuracies 
344 | predictions_train = lasagne.layers.get_output(classifier, sym_x_cla, deterministic=False)
345 | predictions_eval = lasagne.layers.get_output(classifier, sym_x_cla, deterministic=True)
346 | accurracy_train = lasagne.objectives.categorical_accuracy(predictions_train, sym_y)
347 | accurracy_eval = lasagne.objectives.categorical_accuracy(predictions_eval, sym_y)
348 | 
349 | # weight decays
350 | weight_decay_classifier = lasagne.regularization.regularize_layer_params_weighted({classifier:1}, lasagne.regularization.l2)
351 | 
352 | 
353 | '''
354 | learning
355 | '''
356 | # discriminative objective
357 | classifier_cost_train = multiclass_hinge_loss(predictions=predictions_train, targets=sym_y, weight_decay=weight_decay_classifier, alpha_decay=alpha_decay)
358 | classifier_cost_eval = multiclass_hinge_loss(predictions=predictions_eval, targets=sym_y, weight_decay=weight_decay_classifier, alpha_decay=alpha_decay)
359 | 
360 | cost_cla = classifier_cost_train
361 | 
362 | # generative objective
363 | predictions_train_hard = predictions_train.argmax(axis=1)
364 | predictions_eval_hard = predictions_eval.argmax(axis=1)
365 | 
366 | if distribution == 'bernoulli':
367 |     z_train, z_mu_train, z_log_var_train, x_mu_train = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu], {l_in_x:sym_x,l_in_y:predictions_train_hard}, deterministic=False)
368 |     z_eval, z_mu_eval, z_log_var_eval, x_mu_eval = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu], {l_in_x:sym_x, l_in_y:predictions_eval_hard}, deterministic=True)
369 | 
370 |     # lower bounds
371 |     LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train = latent_gaussian_x_bernoulli(z_train, z_mu_train, z_log_var_train, x_mu_train, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
372 |     LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval = latent_gaussian_x_bernoulli(z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
373 | 
374 | elif distribution == 'gaussian':
375 |     z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu, l_dec_x_log_var], {l_in_x:sym_x, l_in_y:predictions_train_hard}, deterministic=False)
376 |     z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu, l_dec_x_log_var], {l_in_x:sym_x, l_in_y:predictions_eval_hard}, deterministic=True)
377 | 
378 |     LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train = latent_gaussian_x_gaussian(z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
379 |     LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval = latent_gaussian_x_gaussian(z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
380 | 
381 | cost_gen = -LL_train
382 | cost = cost_gen + alpha*cost_cla
383 | 
384 | # count parameters
385 | if distribution == 'bernoulli':
386 |     params = lasagne.layers.get_all_params([classifier, l_dec_x_mu], trainable=True)
387 |     for p in params:
388 |         print p, p.get_value().shape
389 |     params_count = lasagne.layers.count_params([classifier,l_dec_x_mu], trainable=True)
390 | elif distribution == 'gaussian':
391 |     params = lasagne.layers.get_all_params([classifier,l_dec_x_mu, l_dec_x_log_var], trainable=True)
392 |     for p in params:
393 |         print p, p.get_value().shape
394 |     params_count = lasagne.layers.count_params([classifier,l_dec_x_mu, l_dec_x_log_var], trainable=True)
395 | print 'Number of parameters:', params_count
396 | 
397 | # functions
398 | grads = T.grad(cost, params)
399 | # mgrads = lasagne.updates.total_norm_constraint(grads,max_norm=max_norm)
400 | # cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
401 | updates = lasagne.updates.adam(grads, params, beta1=0.9, beta2=0.999, epsilon=1e-8, learning_rate=sym_lr)
402 | 
403 | train_model = theano.function([sym_index, sym_batch_size, sym_lr, sym_eq_samples, sym_iw_samples], [LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train, classifier_cost_train, accurracy_train], givens={sym_x_cla: sh_x_train[batch_slice], sym_x: sh_x_train[batch_slice], sym_y: sh_t_train[batch_slice]}, updates=updates)
404 | test_model = theano.function([sym_index, sym_batch_size, sym_eq_samples, sym_iw_samples], [LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval, classifier_cost_eval, accurracy_eval], givens={sym_x_cla: sh_x_test[batch_slice], sym_x: sh_x_test[batch_slice], sym_y: sh_t_test[batch_slice]})
405 | 
406 | 
407 | # random generation for visualization
408 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
409 | srng_ran = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
410 | srng_ran_share = theano.tensor.shared_randomstreams.RandomStreams(1234)
411 | sym_ran_y = T.ivector('ran_y')
412 | 
413 | ran_z = T.tile(srng_ran.normal((10,nz)), (10, 1))
414 | if distribution == 'bernoulli':
415 |     random_x_mean = lasagne.layers.get_output(l_dec_x_mu, {l_z:ran_z, l_in_y:sym_ran_y}, deterministic=True)
416 |     random_x = srng_ran_share.binomial(n=1, p=random_x_mean, dtype=theano.config.floatX)
417 | elif distribution == 'gaussian':
418 |     random_x_mean, random_x_log_var = lasagne.layers.get_output([l_dec_x_mu, l_dec_x_log_var], {l_z:ran_z}, deterministic=True)
419 |     random_x = srng_ran_share.normal(avg=random_x_mean, std=T.exp(0.5*random_x_log_var))
420 | generate = theano.function(inputs=[sym_ran_y], outputs=[random_x_mean, random_x])
421 | 
422 | 
423 | '''
424 | run
425 | '''
426 | # Training and Testing functions
427 | def train_epoch(lr, eq_samples, iw_samples, batch_size):
428 |     costs,log_qz_given_xy,log_pz,log_px_given_zy, loss, accurracy = [],[],[],[],[],[]
429 |     n_train_batches =  train_x.shape[0] / (batch_size)
430 | 
431 |     for i in range(n_train_batches):
432 |         costs_batch, log_qz_given_xy_batch,log_pz_batch,log_px_given_zy_batch, loss_batch, accurracy_batch = train_model(i, batch_size, lr, eq_samples, iw_samples)
433 |         costs += [costs_batch]
434 |         log_qz_given_xy += [log_qz_given_xy_batch]
435 |         log_pz += [log_pz_batch]
436 |         log_px_given_zy += [log_px_given_zy_batch]
437 |         loss += [loss_batch]
438 |         accurracy += [accurracy_batch]
439 |     return np.mean(costs), np.mean(log_qz_given_xy), np.mean(log_pz), np.mean(log_px_given_zy), np.mean(loss), np.mean(accurracy)
440 | 
441 | def test_epoch(eq_samples, iw_samples, batch_size):
442 |     n_test_batches = test_x.shape[0] / batch_size
443 |     costs,log_qz_given_xy,log_pz,log_px_given_zy,loss,accurracy = [],[],[],[],[],[]
444 |     for i in range(n_test_batches):
445 |         costs_batch, log_qz_given_xy_batch,log_pz_batch,log_px_given_zy_batch, loss_batch, accurracy_batch = test_model(i, batch_size, eq_samples, iw_samples)
446 |         costs += [costs_batch]
447 |         log_qz_given_xy += [log_qz_given_xy_batch]
448 |         log_pz += [log_pz_batch]
449 |         log_px_given_zy += [log_px_given_zy_batch]
450 |         loss += [loss_batch]
451 |         accurracy += [accurracy_batch]
452 |     return np.mean(costs), np.mean(log_qz_given_xy), np.mean(log_pz), np.mean(log_px_given_zy), np.mean(loss), np.mean(accurracy)
453 | 
454 | 
455 | print "Training"
456 | 
457 | # TRAIN LOOP
458 | LL_train, log_qz_given_x_train, log_pz_train, log_px_given_z_train, loss_train, acc_train = [],[],[],[],[],[]
459 | LL_test, log_qz_given_x_test, log_pz_test, log_px_given_z_test, loss_test, acc_test = [],[],[],[],[],[]
460 | 
461 | for epoch in range(1, 1+num_epochs):
462 |     start = time.time()
463 | 
464 |     # randomly permute data and labels
465 |     p = np.random.permutation(train_x.shape[0]) 
466 |     sh_x_train.set_value(preprocesses_dataset(train_x[p]))
467 |     sh_t_train.set_value(train_t[p])
468 | 
469 |     train_out = train_epoch(lr, eq_samples, iw_samples, batch_size)
470 | 
471 |     if np.isnan(train_out[0]):
472 |         ValueError("NAN in train LL!")
473 | 
474 |     if epoch >= anneal_lr_epoch and epoch % every_anneal == 0:
475 |         #annealing learning rate
476 |         lr = lr*anneal_lr_factor
477 | 
478 |     if epoch % eval_epoch == 0:
479 |         t = time.time() - start
480 |         LL_train += [train_out[0]]
481 |         log_qz_given_x_train += [train_out[1]]
482 |         log_pz_train += [train_out[2]]
483 |         log_px_given_z_train += [train_out[3]]
484 |         loss_train +=[train_out[4]]
485 |         acc_train += [train_out[5]]
486 | 
487 |         print "calculating LL eq=1, iw=1"
488 |         test_out = test_epoch(eq_samples, iw_samples, batch_size=500)
489 |         LL_test += [test_out[0]]
490 |         log_qz_given_x_test += [test_out[1]]
491 |         log_pz_test += [test_out[2]]
492 |         log_px_given_z_test += [test_out[3]]
493 |         loss_test += [test_out[4]]
494 |         acc_test += [test_out[5]]
495 | 
496 | 
497 |         line = "*Epoch=%d\tTime=%.2f\tLR=%.5f\n" %(epoch, t, lr) + \
498 |                "  TRAIN:\tGen_loss=%.5f\tlogq(z|x)=%.5f\tlogp(z)=%.5f\tlogp(x|z)=%.5f\tdis_loss=%.5f\tlabel_error=%.5f\n" %(LL_train[-1], log_qz_given_x_train[-1], log_pz_train[-1], log_px_given_z_train[-1], loss_train[-1], 1-acc_train[-1]) + \
499 |                "  EVAL-L1:\tGen_loss=%.5f\tlogq(z|x)=%.5f\tlogp(z)=%.5f\tlogp(x|z)=%.5f\tdis_loss=%.5f\terror=%.5f\n" %(LL_test[-1], log_qz_given_x_test[-1], log_pz_test[-1], log_px_given_z_test[-1], loss_test[-1], 1-acc_test[-1])
500 |         print line
501 |         with open(logfile,'a') as f:
502 |             f.write(line + "\n")
503 | 
504 |     # random generation for visualization
505 |     if epoch % vis_epoch == 0:
506 |         tail='-'+str(epoch)+'.png'
507 |         ran_y = np.int32(np.repeat(np.arange(10), 10))
508 |         _x_mean, _x = generate(ran_y)
509 |         _x_mean = _x_mean.reshape((100,-1))
510 |         _x = _x.reshape((100,-1))
511 |         image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg, scale=generation_scale, 
512 |             save_path=os.path.join(res_out, 'mean'+tail))
513 | 
514 |     #save model
515 |     model_out = os.path.join(res_out, 'model')
516 |     if epoch % (vis_epoch*10) == 0:
517 |         if distribution == 'bernoulli':
518 |             all_params=lasagne.layers.get_all_params([classifier, l_dec_x_mu])
519 |         elif distribution == 'gaussian':
520 |             all_params=lasagne.layers.get_all_params([classifier, l_dec_x_mu, l_dec_x_log_var])
521 |         f = gzip.open(model_out + 'epoch%i'%(epoch), 'wb')
522 |         cPickle.dump(all_params, f, protocol=cPickle.HIGHEST_PROTOCOL)
523 |         f.close()


--------------------------------------------------------------------------------
/cdgm_x2y_xy2z_zy2x.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This code implements max-margin deep conditional generative model which incorporates the side information in generative modelling and uses a semi-supervised classifier to infer the latent labels
  3 | '''
  4 | 
  5 | import gzip, os, cPickle, time, math, argparse, shutil, sys
  6 | 
  7 | import numpy as np
  8 | import theano.tensor as T
  9 | import theano
 10 | import lasagne
 11 | from theano.tensor.extra_ops import to_one_hot
 12 | from parmesan.datasets import load_mnist_realval, load_mnist_binarized, load_frey_faces, load_norb_small
 13 | from datasets import load_cifar10, load_svhn
 14 | from datasets_norb import load_numpy_subclasses
 15 | from parmesan.layers import SampleLayer
 16 | 
 17 | from layers.merge import ConvConcatLayer, MLPConcatLayer
 18 | from utils.others import get_nonlin_list, get_pad_list, bernoullisample, build_log_file, printarray_2D, array2file_2D
 19 | from components.shortcuts import convlayer, fractionalstridedlayer, unpoolconvlayer, mlplayer
 20 | from components.objectives import latent_gaussian_x_gaussian, latent_gaussian_x_bernoulli
 21 | from components.objectives import multiclass_s3vm_loss, multiclass_hinge_loss
 22 | from utils.create_ssl_data import create_ssl_data, create_ssl_data_subset
 23 | import  utils.paramgraphics as paramgraphics
 24 | 
 25 | '''
 26 | parameters
 27 | '''
 28 | # global
 29 | theano.config.floatX = 'float32'
 30 | filename_script = os.path.basename(os.path.realpath(__file__))
 31 | parser = argparse.ArgumentParser()
 32 | parser.add_argument("-dataset", type=str, default="mnist_real")
 33 | parser.add_argument("-outfolder", type=str, default=os.path.join("results-ssl", os.path.splitext(filename_script)[0]))
 34 | parser.add_argument("-preprocess", type=str, default="none")
 35 | parser.add_argument("-subset_flag", type=str, default ='false')
 36 | # architecture
 37 | parser.add_argument("-nz", type=int, default=100)
 38 | parser.add_argument("-batch_norm_dgm", type=str, default='false')
 39 | parser.add_argument("-top_mlp", type=str, default='false')
 40 | parser.add_argument("-mlp_size", type=int, default=256)
 41 | parser.add_argument("-batch_norm_classifier", type=str, default='false')
 42 | # classifier
 43 | parser.add_argument("-num_labelled", type=int, default=100)
 44 | parser.add_argument("-num_labelled_per_batch", type=int, default=100)
 45 | parser.add_argument("-batch_size", type=int, default=200)
 46 | parser.add_argument("-delta", type=float, default=1.)
 47 | parser.add_argument("-alpha_decay", type=float, default=1e-4)
 48 | parser.add_argument("-alpha_hinge", type=float, default=1.)
 49 | parser.add_argument("-alpha_hat", type=float, default=.3)
 50 | parser.add_argument("-alpha_reg", type=float, default=0)
 51 | parser.add_argument("-alpha", type=float, default=.1)
 52 | parser.add_argument("-alpha_straight_through", type=float, default=1e-4)
 53 | parser.add_argument("-norm_type", type=int, default=2)
 54 | parser.add_argument("-form", type=str, default='mean_class')
 55 | # feature extractor
 56 | parser.add_argument("-nlayers_cla", type=int, default=3)
 57 | parser.add_argument("-nk_cla", type=str, default='32,64,128')
 58 | parser.add_argument("-dk_cla", type=str, default='4,5,3')
 59 | parser.add_argument("-pad_cla", type=str, default='valid,valid,valid')
 60 | parser.add_argument("-str_cla", type=str, default='2,2,2')
 61 | parser.add_argument("-ps_cla", type=str, default='1,1,1')
 62 | parser.add_argument("-nonlin_cla", type=str, default='rectify,rectify,rectify')
 63 | parser.add_argument("-dr_cla", type=str, default='0,0,0')
 64 | # encoder
 65 | parser.add_argument("-nlayers_enc", type=int, default=3)
 66 | parser.add_argument("-nk_enc", type=str, default='32,64,128')
 67 | parser.add_argument("-dk_enc", type=str, default='4,5,3')
 68 | parser.add_argument("-pad_enc", type=str, default='valid,valid,valid')
 69 | parser.add_argument("-str_enc", type=str, default='2,2,2')
 70 | parser.add_argument("-ps_enc", type=str, default='1,1,1')
 71 | parser.add_argument("-nonlin_enc", type=str, default='rectify,rectify,rectify')
 72 | parser.add_argument("-dr_enc", type=str, default='0,0,0')
 73 | # decoder
 74 | parser.add_argument("-nlayers_dec", type=int, default=4)
 75 | parser.add_argument("-nk_dec", type=str, default='128,64,32,1')
 76 | parser.add_argument("-dk_dec", type=str, default='3,5,4,5')
 77 | parser.add_argument("-pad_dec", type=str, default='valid,valid,valid,same')
 78 | parser.add_argument("-str_dec", type=str, default='2,2,2,1')
 79 | parser.add_argument("-up_method", type=str, default='frac_strided,frac_strided,frac_strided,none')
 80 | parser.add_argument("-ps_dec", type=str, default='1,1,1,1')
 81 | parser.add_argument("-nonlin_dec", type=str, default='rectify,rectify,rectify,sigmoid')
 82 | parser.add_argument("-dr_dec", type=str, default='0,0,0,0')
 83 | # optimization
 84 | parser.add_argument("-flag", type=str, default='validation') # validation for anneal learning rate
 85 | parser.add_argument("-ssl_data_seed", type=int, default=0) # random seed for ssl data generation
 86 | parser.add_argument("-lr", type=float, default=0.0003)
 87 | parser.add_argument("-nepochs", type=int, default=200)
 88 | parser.add_argument("-anneal_lr_epoch", type=int, default=100)
 89 | parser.add_argument("-anneal_lr_factor", type=float, default=.99)
 90 | parser.add_argument("-every_anneal", type=int, default=1)
 91 | clip_grad = 1
 92 | max_norm = 5
 93 | # name
 94 | parser.add_argument("-name", type=str, default='')
 95 | # inference
 96 | parser.add_argument("-eq_samples", type=int,
 97 |         help="number of samples for the expectation over q(z|x)", default=1)
 98 | parser.add_argument("-iw_samples", type=int,
 99 |         help="number of importance weighted samples", default=1)
100 | 
101 | # random seeds for reproducibility
102 | np.random.seed(1234)
103 | from theano.tensor.shared_randomstreams import RandomStreams
104 | srng = RandomStreams(seed=1234)
105 | 
106 | # get parameters
107 | # global
108 | args = parser.parse_args()
109 | dataset = args.dataset
110 | subset_flag = args.subset_flag == 'true' or args.subset_flag == 'True'
111 | eval_epoch = 1
112 | # architecture
113 | nz = args.nz
114 | bn_dgm = args.batch_norm_dgm == 'true' or args.batch_norm_dgm == 'True'
115 | top_mlp = args.top_mlp == 'true' or args.top_mlp == 'True'
116 | mlp_size = args.mlp_size
117 | bn_cla = args.batch_norm_classifier == 'true' or args.batch_norm_classifier == 'True'
118 | # classifier
119 | num_labelled = args.num_labelled
120 | batch_size = args.batch_size
121 | num_labelled_per_batch = args.num_labelled_per_batch
122 | assert num_labelled % num_labelled_per_batch == 0
123 | delta = args.delta
124 | alpha_straight_through = args.alpha_straight_through
125 | alpha_decay = args.alpha_decay
126 | alpha_hinge = args.alpha_hinge
127 | alpha_reg = args.alpha_reg
128 | alpha_hat = args.alpha_hat
129 | alpha = args.alpha
130 | norm_type = args.norm_type
131 | form = args.form
132 | # feature extractor
133 | nlayers_cla = args.nlayers_cla
134 | nk_cla = map(int, args.nk_cla.split(','))
135 | dk_cla = map(int, args.dk_cla.split(','))
136 | pad_cla = map(str, args.pad_cla.split(','))
137 | str_cla = map(int, args.str_cla.split(','))
138 | ps_cla = map(int, args.ps_cla.split(','))
139 | dr_cla = map(float, args.dr_cla.split(','))
140 | nonlin_cla = get_nonlin_list(map(str, args.nonlin_cla.split(',')))
141 | # encoder
142 | nlayers_enc = args.nlayers_enc
143 | nk_enc = map(int, args.nk_enc.split(','))
144 | dk_enc = map(int, args.dk_enc.split(','))
145 | pad_enc = get_pad_list(map(str, args.pad_enc.split(',')))
146 | str_enc = map(int, args.str_enc.split(','))
147 | ps_enc = map(int, args.ps_enc.split(','))
148 | dr_enc = map(float, args.dr_enc.split(','))
149 | nonlin_enc = get_nonlin_list(map(str, args.nonlin_enc.split(',')))
150 | # decoder
151 | nlayers_dec = args.nlayers_dec
152 | nk_dec = map(int, args.nk_dec.split(','))
153 | dk_dec = map(int, args.dk_dec.split(','))
154 | pad_dec = get_pad_list(map(str, args.pad_dec.split(',')))
155 | str_dec = map(int, args.str_dec.split(','))
156 | ps_dec = map(int, args.ps_dec.split(','))
157 | dr_dec = map(float, args.dr_dec.split(','))
158 | nonlin_dec = get_nonlin_list(map(str, args.nonlin_dec.split(',')))
159 | up_method = map(str, args.up_method.split(','))
160 | # optimization
161 | flag = args.flag
162 | ssl_data_seed = args.ssl_data_seed
163 | if ssl_data_seed == -1:
164 |     ssl_data_seed = int(time.time())
165 | lr = args.lr
166 | num_epochs = args.nepochs
167 | anneal_lr_epoch = args.anneal_lr_epoch
168 | anneal_lr_factor = args.anneal_lr_factor
169 | every_anneal = args.every_anneal
170 | # inference
171 | iw_samples = args.iw_samples
172 | eq_samples = args.eq_samples
173 | # log file
174 | logfile, res_out = build_log_file(args, filename_script, extra=str(args.ssl_data_seed))
175 | shutil.copy(os.path.realpath(__file__), os.path.join(res_out, filename_script))
176 | 
177 | '''
178 | datasets
179 | '''
180 | if dataset == 'mnist_real':
181 |     colorImg = False
182 |     dim_input = (28,28)
183 |     in_channels = 1
184 |     num_classes = 10
185 |     generation_scale = False
186 |     num_generation = num_classes*num_classes
187 |     vis_epoch = 100
188 |     distribution = 'bernoulli'
189 |     num_features = in_channels*dim_input[0]*dim_input[1]
190 |     print "Using real-valued mnist dataset"
191 |     train_x, train_t, valid_x, valid_t, test_x, test_t = load_mnist_realval()
192 |     if flag == 'validation':
193 |         test_x = valid_x
194 |         test_t = valid_t
195 |     else:
196 |         train_x = np.concatenate([train_x,valid_x])
197 |         train_t = np.hstack((train_t, valid_t))
198 |     train_x_size = train_t.shape[0]
199 |     train_t = np.int32(train_t)
200 |     test_t = np.int32(test_t)
201 |     train_x = train_x.astype(theano.config.floatX)
202 |     test_x = test_x.astype(theano.config.floatX)
203 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
204 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
205 |     # prepare data for semi-supervised learning
206 |     if subset_flag:
207 |         # instead of sampling from 60000 data, sample 100 data for 10 times to make sure that the labelled data with smaller size is a subset of that with larger size. 
208 |         x_labelled, y_labelled, x_unlabelled, _ = create_ssl_data_subset(train_x, train_t, num_classes, num_labelled, 100, ssl_data_seed)
209 |     else:
210 |         x_labelled, y_labelled, x_unlabelled, _ = create_ssl_data(train_x, train_t, num_classes, num_labelled, ssl_data_seed)
211 |     y_labelled = np.int32(y_labelled)   
212 | elif dataset == 'cifar10':
213 |     colorImg = True
214 |     dim_input = (32,32)
215 |     in_channels = 3
216 |     num_classes = 10
217 |     generation_scale = False
218 |     num_generation = num_classes*num_classes
219 |     vis_epoch = 100
220 |     distribution = 'bernoulli'
221 |     num_features = in_channels*dim_input[0]*dim_input[1]
222 |     print "Using cifar10 dataset"
223 |     train_x, train_t, valid_x, valid_t, test_x, test_t = load_cifar10(num_val=5000, normalized=True, centered=True)
224 |     if flag == 'validation':
225 |         test_x = valid_x
226 |         test_t = valid_t
227 |     else:
228 |         train_x = np.concatenate([train_x,valid_x])
229 |         train_t = np.hstack((train_t, valid_t))
230 |     train_x_size = train_t.shape[0]
231 |     train_t = np.int32(train_t)
232 |     test_t = np.int32(test_t)
233 |     train_x = train_x.astype(theano.config.floatX)
234 |     test_x = test_x.astype(theano.config.floatX)
235 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
236 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
237 |     # prepare data for semi-supervised learning
238 |     x_labelled, y_labelled, x_unlabelled, _ = create_ssl_data(train_x, train_t, num_classes, num_labelled, ssl_data_seed)
239 |     y_labelled = np.int32(y_labelled)
240 | elif dataset == 'svhn':
241 |     colorImg = True
242 |     dim_input = (32,32)
243 |     in_channels = 3
244 |     num_classes = 10
245 |     generation_scale = False
246 |     num_generation = num_classes*num_classes
247 |     vis_epoch = 10
248 |     distribution = 'bernoulli'
249 |     num_features = in_channels*dim_input[0]*dim_input[1]
250 |     print "Using svhn dataset"
251 |     train_x, train_t, valid_x, valid_t, test_x, test_t, avg = load_svhn(normalized=True, centered=False)
252 |     if flag == 'validation':
253 |         test_x = valid_x
254 |         test_t = valid_t
255 |     else:
256 |         train_x = np.concatenate([train_x,valid_x])
257 |         train_t = np.hstack((train_t, valid_t))
258 |     train_x_size = train_t.shape[0]
259 |     train_t = np.int32(train_t)
260 |     test_t = np.int32(test_t)
261 |     train_x = train_x.astype(theano.config.floatX)
262 |     test_x = test_x.astype(theano.config.floatX)
263 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
264 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
265 |     # prepare data for semi-supervised learning
266 |     x_labelled, y_labelled, x_unlabelled, _ = create_ssl_data(train_x, train_t, num_classes, num_labelled, ssl_data_seed)
267 |     y_labelled = np.int32(y_labelled)
268 | elif dataset == 'norb':
269 |     colorImg = False
270 |     dim_input = (32,32)
271 |     in_channels = 1
272 |     num_classes = 5
273 |     generation_scale = False
274 |     num_generation = num_classes*num_classes
275 |     vis_epoch = 100
276 |     distribution = 'bernoulli'
277 |     num_features = in_channels*dim_input[0]*dim_input[1]
278 |     print "Using small norb dataset"
279 |     x, t = load_numpy_subclasses(size=dim_input[0], normalize=True, centered=False)
280 |     x = np.transpose(x)
281 |     t = t.flatten()
282 |     train_x = x[:24300]
283 |     test_x = x[24300*2:24300*3]
284 |     train_t = t[:24300]
285 |     test_t = t[24300*2:24300*3]
286 |     if flag == 'validation':
287 |         test_x = train_x[:1000]
288 |         test_t = train_t[:1000]
289 |         train_x = train_x[1000:]
290 |         train_t = train_t[1000:]
291 |     train_x_size = train_t.shape[0]
292 |     train_t = np.int32(train_t)
293 |     test_t = np.int32(test_t)
294 |     train_x = train_x.astype(theano.config.floatX)
295 |     test_x = test_x.astype(theano.config.floatX)
296 |     train_x = train_x.reshape((-1, in_channels)+dim_input)
297 |     test_x = test_x.reshape((-1, in_channels)+dim_input)
298 |     # prepare data for semi-supervised learning
299 |     x_labelled, y_labelled, x_unlabelled, _ = create_ssl_data(train_x, train_t, num_classes, num_labelled, ssl_data_seed)
300 |     y_labelled = np.int32(y_labelled)
301 | 
302 | # preprocess
303 | if args.preprocess == 'none':
304 |     preprocesses_dataset = None
305 | elif args.preprocess == 'bernoullisample':
306 |     preprocesses_dataset = bernoullisample
307 | elif args.preprocess == 'dequantify':
308 |     pass
309 | 
310 | # shared variables for semi-supervised learning
311 | sh_x_train_labelled = theano.shared(x_labelled, borrow=True)
312 | sh_x_train_unlabelled = theano.shared(x_unlabelled, borrow=True)
313 | sh_t_train_labelled = theano.shared(y_labelled, borrow=True)
314 | sh_x_test = theano.shared(test_x, borrow=True)
315 | sh_t_test = theano.shared(test_t, borrow=True)
316 | if preprocesses_dataset is not None:
317 |     sh_x_train_labelled_preprocessed = theano.shared(preprocesses_dataset(x_labelled), borrow=True)
318 |     sh_x_train_unlabelled_preprocessed = theano.shared(preprocesses_dataset(x_unlabelled), borrow=True)
319 |     sh_x_test_preprocessed = theano.shared(preprocesses_dataset(test_x), borrow=True)
320 | 
321 | # visualize labeled data
322 | if True:
323 |     print 'size of training data ', x_labelled.shape, y_labelled.shape, x_unlabelled.shape
324 |     _x_mean = x_labelled.reshape((num_labelled,-1))
325 |     _x_mean = _x_mean[:num_generation]
326 |     y_order = np.argsort(y_labelled[:num_generation])
327 |     _x_mean = _x_mean[y_order]
328 |     image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg, scale=generation_scale, 
329 |         save_path=os.path.join(res_out, 'labeled_data'+str(ssl_data_seed)+'.png'))
330 | 
331 | '''
332 | building block
333 | '''
334 | # shortcuts
335 | encodelayer = convlayer
336 | 
337 | # decoder layer
338 | def decodelayer(l,up_method,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name):
339 |     # upsampling
340 |     if up_method == 'unpool':
341 |         h_g = unpoolconvlayer(l,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name,'unpool',None)
342 |     elif up_method == 'frac_strided':
343 |         h_g = fractionalstridedlayer(l,bn,dr,n_kerns,d_kerns,nonlinearity,pad,stride,name)
344 |     elif up_method == 'none':
345 |         h_g, _ = convlayer(l,bn,dr,ps,n_kerns,d_kerns,nonlinearity,pad,stride,name)
346 |     else:
347 |         raise Exception('Unknown upsampling method')
348 |     return h_g
349 | 
350 | 
351 | '''
352 | model
353 | '''
354 | # symbolic variables
355 | sym_iw_samples = T.iscalar('iw_samples')
356 | sym_eq_samples = T.iscalar('eq_samples')
357 | sym_lr = T.scalar('lr')
358 | sym_x = T.tensor4('x')
359 | sym_x_cla = T.tensor4('x_cla')
360 | sym_y = T.ivector('y')
361 | sym_index = T.iscalar('index')
362 | sym_batch_size = T.iscalar('batch_size')
363 | batch_slice = slice(sym_index * sym_batch_size, (sym_index + 1) * sym_batch_size)
364 | sym_index_l = T.iscalar('index_l')
365 | sym_index_u = T.iscalar('index_u')
366 | sym_batch_size_l = T.iscalar('batch_size_l')
367 | sym_batch_size_u = T.iscalar('batch_size_u')
368 | batch_slice_l = slice(sym_index_l * sym_batch_size_l, (sym_index_l + 1) * sym_batch_size_l)
369 | batch_slice_u = slice(sym_index_u * sym_batch_size_u, (sym_index_u + 1) * sym_batch_size_u)
370 | 
371 | # x2y
372 | l_in_x_cla = lasagne.layers.InputLayer((None, in_channels)+dim_input)
373 | l_cla = [l_in_x_cla,]
374 | print lasagne.layers.get_output_shape(l_cla[-1])
375 | # conv layers
376 | for i in xrange(nlayers_cla):
377 |     l, _= convlayer(l_cla[-1],bn_cla,dr_cla[i],ps_cla[i],nk_cla[i],dk_cla[i],nonlin_cla[i],pad_cla[i],str_cla[i],'CLA-'+str(i+1))
378 |     l_cla.append(l)
379 |     print lasagne.layers.get_output_shape(l_cla[-1])
380 | 
381 | # feature and classifier
382 | if top_mlp:
383 |     l_cla.append(lasagne.layers.FlattenLayer(l_cla[-1]))
384 |     feature = mlplayer(l_cla[-1],bn_cla,0.5,mlp_size,lasagne.nonlinearities.rectify,name='MLP-CLA')
385 | else:
386 |     feature = lasagne.layers.GlobalPoolLayer(l_cla[-1])
387 | classifier = lasagne.layers.DenseLayer(feature, num_units=num_classes, nonlinearity=lasagne.nonlinearities.identity, W=lasagne.init.Normal(1e-2, 0), name="classifier")
388 | 
389 | # encoder xy2z
390 | l_in_x = lasagne.layers.InputLayer((None, in_channels)+dim_input)
391 | l_in_y = lasagne.layers.InputLayer((None,))
392 | l_enc = [l_in_x,]
393 | for i in xrange(nlayers_enc):
394 |     l_enc.append(ConvConcatLayer([l_enc[-1], l_in_y], num_classes))
395 |     l, _ = encodelayer(l_enc[-1],bn_dgm,dr_enc[i],ps_enc[i],nk_enc[i],dk_enc[i],nonlin_enc[i],pad_enc[i],str_enc[i],'ENC-'+str(i+1),False,0)
396 |     l_enc.append(l)
397 |     print lasagne.layers.get_output_shape(l_enc[-1])
398 | 
399 | # reshape
400 | after_conv_shape = lasagne.layers.get_output_shape(l_enc[-1])
401 | after_conv_size = int(np.prod(after_conv_shape[1:]))
402 | l_enc.append(lasagne.layers.FlattenLayer(l_enc[-1]))
403 | print lasagne.layers.get_output_shape(l_enc[-1])
404 | 
405 | # compute parameters and sample z
406 | l_mu = mlplayer(l_enc[-1],False,0,nz,lasagne.nonlinearities.identity,'ENC-MU')
407 | l_log_var = mlplayer(l_enc[-1],False,0,nz,lasagne.nonlinearities.identity,'ENC-LOG_VAR')
408 | l_z = SampleLayer(mean=l_mu,log_var=l_log_var,eq_samples=sym_eq_samples,iw_samples=sym_iw_samples)
409 | 
410 | # decoder zy2x
411 | l_dec = [l_z,]
412 | print lasagne.layers.get_output_shape(l_dec[-1])
413 | 
414 | # reshape
415 | l_dec.append(mlplayer(l_dec[-1],bn_dgm,0,after_conv_size,lasagne.nonlinearities.rectify, 'DEC_l_Z'))
416 | print lasagne.layers.get_output_shape(l_dec[-1])
417 | l_dec.append(lasagne.layers.ReshapeLayer(l_dec[-1], shape=(-1,)+after_conv_shape[1:]))
418 | print lasagne.layers.get_output_shape(l_dec[-1])
419 | for i in (xrange(nlayers_dec-1)):
420 |     l_dec.append(ConvConcatLayer([l_dec[-1], l_in_y], num_classes))
421 |     l = decodelayer(l_dec[-1],up_method[i],bn_dgm,dr_dec[i],ps_dec[i],nk_dec[i],dk_dec[i],nonlin_dec[i],pad_dec[i],str_dec[i],'DEC-'+str(i+1))
422 |     l_dec.append(l)
423 |     print lasagne.layers.get_output_shape(l_dec[-1])
424 | 
425 | # mu and var
426 | if distribution == 'gaussian':
427 |     l_dec_x_mu = decodelayer(l_dec[-1],up_method[-1],bn_dgm,dr_dec[-1],ps_dec[-1],nk_dec[-1],dk_dec[-1],lasagne.nonlinearities.sigmoid,pad_dec[-1],str_dec[-1],'DEC-MU')
428 |     l_dec_x_log_var = decodelayer(l_dec[-1],up_method[-1],bn_dgm,dr_dec[-1],ps_dec[-1],nk_dec[-1],dk_dec[-1],lasagne.nonlinearities.identity,pad_dec[-1],str_dec[-1],'DEC-LOG_VAR')
429 | elif distribution == 'bernoulli':
430 |     l_dec_x_mu = decodelayer(l_dec[-1],up_method[-1],bn_dgm,dr_dec[-1],ps_dec[-1],nk_dec[-1],dk_dec[-1],lasagne.nonlinearities.sigmoid,pad_dec[-1],str_dec[-1],'DEC-MU')
431 | print lasagne.layers.get_output_shape(l_dec_x_mu)
432 | 
433 | # predictions and accuracies 
434 | predictions_train = lasagne.layers.get_output(classifier, sym_x_cla, deterministic=False)
435 | predictions_eval = lasagne.layers.get_output(classifier, sym_x_cla, deterministic=True)
436 | accurracy_train_labeled = lasagne.objectives.categorical_accuracy(predictions_train[:sym_batch_size_l], sym_y)
437 | accurracy_eval = lasagne.objectives.categorical_accuracy(predictions_eval, sym_y)
438 | 
439 | # weight decays
440 | weight_decay_classifier = lasagne.regularization.regularize_layer_params_weighted({classifier:1}, lasagne.regularization.l2)
441 | 
442 | '''
443 | learning
444 | '''
445 | # discriminative objective
446 | #classifier_cost_train = multiclass_hinge_loss(predictions=predictions_train[:num_labelled], targets=sym_y[:num_labelled], weight_decay=weight_decay_classifier, alpha_decay=alpha_decay)
447 | classifier_cost_train = multiclass_s3vm_loss(predictions=predictions_train, targets=sym_y, weight_decay=weight_decay_classifier, norm_type=norm_type, form=form, num_labelled=sym_batch_size_l, alpha_decay=alpha_decay, alpha_reg=alpha_reg, alpha_hat=alpha_hat, alpha_hinge=alpha_hinge, delta=delta)
448 | classifier_cost_eval = multiclass_hinge_loss(predictions=predictions_eval, targets=sym_y, weight_decay=weight_decay_classifier, alpha_decay=alpha_decay) # no hat loss for testing
449 | 
450 | cost_cla = classifier_cost_train
451 | 
452 | # generative objective
453 | predictions_train_hard = predictions_train.argmax(axis=1)
454 | predictions_eval_hard = predictions_eval.argmax(axis=1)
455 | 
456 | sym_l_in_y_train = to_one_hot(T.concatenate([sym_y,predictions_train_hard[sym_batch_size_l:]], axis=0), num_classes)
457 | if distribution == 'bernoulli':
458 |     z_train, z_mu_train, z_log_var_train, x_mu_train = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu], {l_in_x:sym_x, l_in_y:sym_l_in_y_train}, deterministic=False)
459 |     z_eval, z_mu_eval, z_log_var_eval, x_mu_eval = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu], {l_in_x:sym_x, l_in_y:to_one_hot(predictions_eval_hard, num_classes)}, deterministic=True)
460 | 
461 |     # lower bounds
462 |     LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train = latent_gaussian_x_bernoulli(z_train, z_mu_train, z_log_var_train, x_mu_train, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
463 |     LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval = latent_gaussian_x_bernoulli(z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
464 | 
465 | elif distribution == 'gaussian':
466 |     z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu, l_dec_x_log_var], {l_in_x:sym_x, l_in_y:sym_l_in_y_train}, deterministic=False)
467 |     z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval = lasagne.layers.get_output([l_z, l_mu, l_log_var, l_dec_x_mu, l_dec_x_log_var], {l_in_x:sym_x,l_in_y:to_one_hot(predictions_eval_hard, num_classes)}, deterministic=True)
468 | 
469 |     LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train = latent_gaussian_x_gaussian(z_train, z_mu_train, z_log_var_train, x_mu_train, x_log_var_train, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
470 |     LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval = latent_gaussian_x_gaussian(z_eval, z_mu_eval, z_log_var_eval, x_mu_eval, x_log_var_eval, sym_x, latent_size=nz, num_features=num_features, eq_samples=sym_eq_samples, iw_samples=sym_iw_samples)
471 | 
472 | cost_gen = -LL_train
473 | cost = cost_gen + alpha*cost_cla
474 | 
475 | # count parameters
476 | if distribution == 'bernoulli':
477 |     params = lasagne.layers.get_all_params([classifier, l_dec_x_mu], trainable=True)
478 |     for p in params:
479 |         print p, p.get_value().shape
480 |     params_count = lasagne.layers.count_params([classifier,l_dec_x_mu], trainable=True)
481 | elif distribution == 'gaussian':
482 |     params = lasagne.layers.get_all_params([classifier,l_dec_x_mu, l_dec_x_log_var], trainable=True)
483 |     for p in params:
484 |         print p, p.get_value().shape
485 |     params_count = lasagne.layers.count_params([classifier,l_dec_x_mu, l_dec_x_log_var], trainable=True)
486 | print 'Number of parameters:', params_count
487 | 
488 | # gradients
489 | grads = T.grad(cost, params)
490 | 
491 | '''
492 |     Straight Through Estimator
493 |     forward pass: logits -> y=argmax -> f
494 |     backward pass: f/y * p/theta
495 | '''
496 | cla_params = lasagne.layers.get_all_params(classifier, trainable=True)
497 | grad_one_hot_y = T.grad(-LL_train, sym_l_in_y_train)
498 | cla_loss_gen = (grad_one_hot_y*lasagne.nonlinearities.softmax(predictions_train)).sum()
499 | cla_grads_gen = T.grad(cla_loss_gen,cla_params)
500 | 
501 | for i in xrange(len(cla_grads_gen)):
502 |     grads[i] += alpha_straight_through*cla_grads_gen[i]
503 | 
504 | # mgrads = lasagne.updates.total_norm_constraint(grads,max_norm=max_norm)
505 | # cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
506 | 
507 | # functions
508 | updates = lasagne.updates.adam(grads, params, beta1=0.9, beta2=0.999, epsilon=1e-8, learning_rate=sym_lr)
509 | 
510 | if preprocesses_dataset is not None:
511 |     train_model = theano.function([sym_index_l, sym_index_u, sym_batch_size_l, sym_batch_size_u, sym_lr, sym_eq_samples, sym_iw_samples], [LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train, classifier_cost_train, accurracy_train_labeled], givens={sym_x_cla:T.concatenate([sh_x_train_labelled[batch_slice_l],sh_x_train_unlabelled[batch_slice_u]], axis=0), sym_x: T.concatenate([sh_x_train_labelled_preprocessed[batch_slice_l],sh_x_train_unlabelled_preprocessed[batch_slice_u]], axis=0), sym_y:sh_t_train_labelled[batch_slice_l]}, updates=updates)
512 |     test_model = theano.function([sym_index, sym_batch_size, sym_eq_samples, sym_iw_samples], [LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval, classifier_cost_eval, accurracy_eval], givens={sym_x_cla: sh_x_test[batch_slice], sym_x: sh_x_test_preprocessed[batch_slice], sym_y: sh_t_test[batch_slice]})
513 | else:
514 |     train_model = theano.function([sym_index_l, sym_index_u, sym_batch_size_l, sym_batch_size_u, sym_lr, sym_eq_samples, sym_iw_samples], [LL_train, log_qz_given_xy_train, log_pz_train, log_px_given_zy_train, classifier_cost_train,accurracy_train_labeled], givens={sym_x_cla:T.concatenate([sh_x_train_labelled[batch_slice_l],sh_x_train_unlabelled[batch_slice_u]], axis=0), sym_x: T.concatenate([sh_x_train_labelled[batch_slice_l],sh_x_train_unlabelled[batch_slice_u]], axis=0), sym_y: sh_t_train_labelled[batch_slice_l]}, updates=updates)
515 |     test_model = theano.function([sym_index, sym_batch_size, sym_eq_samples, sym_iw_samples], [LL_eval, log_qz_given_xy_eval, log_pz_eval, log_px_given_zy_eval, classifier_cost_eval, accurracy_eval], givens={sym_x_cla: sh_x_test[batch_slice], sym_x: sh_x_test[batch_slice], sym_y: sh_t_test[batch_slice]})
516 | 
517 | # random generation for visualization
518 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
519 | srng_ran = RandomStreams(lasagne.random.get_rng().randint(1, 2147462579))
520 | srng_ran_share = theano.tensor.shared_randomstreams.RandomStreams(1234)
521 | sym_ran_y = T.ivector('ran_y')
522 | 
523 | ran_z = T.tile(srng_ran.normal((num_classes,nz)), (num_classes, 1))
524 | if distribution == 'bernoulli':
525 |     random_x_mean = lasagne.layers.get_output(l_dec_x_mu, {l_z:ran_z, l_in_y:to_one_hot(sym_ran_y, num_classes)}, deterministic=True)
526 |     random_x = srng_ran_share.binomial(n=1, p=random_x_mean, dtype=theano.config.floatX)
527 | elif distribution == 'gaussian':
528 |     random_x_mean, random_x_log_var = lasagne.layers.get_output([l_dec_x_mu, l_dec_x_log_var], {l_z:ran_z, l_in_y:to_one_hot(sym_ran_y, num_classes)}, deterministic=True)
529 |     random_x = srng_ran_share.normal(avg=random_x_mean, std=T.exp(0.5*random_x_log_var))
530 | generate = theano.function(inputs=[sym_ran_y], outputs=[random_x_mean, random_x])
531 | 
532 | 
533 | '''
534 | run
535 | '''
536 | # Training and Testing functions
537 | def train_epoch(lr, eq_samples, iw_samples, batch_size):
538 |     costs,log_qz_given_xy,log_pz,log_px_given_zy, loss, accurracy, accurracy_labeled = [],[],[],[],[],[],[]
539 |     n_train_batches_labelled = x_labelled.shape[0] / num_labelled_per_batch
540 |     n_train_batches_unlabelled = x_unlabelled.shape[0] / (batch_size - num_labelled_per_batch)
541 | 
542 |     for i in range(n_train_batches_unlabelled):
543 |         costs_batch, log_qz_given_xy_batch,log_pz_batch,log_px_given_zy_batch, loss_batch, accurracy_labeled_batch = train_model(i % n_train_batches_labelled, i, num_labelled_per_batch, batch_size-num_labelled_per_batch, lr, eq_samples, iw_samples)
544 |         costs += [costs_batch]
545 |         log_qz_given_xy += [log_qz_given_xy_batch]
546 |         log_pz += [log_pz_batch]
547 |         log_px_given_zy += [log_px_given_zy_batch]
548 |         loss += [loss_batch]
549 |         accurracy_labeled += [accurracy_labeled_batch]
550 |     return np.mean(costs), np.mean(log_qz_given_xy), np.mean(log_pz), np.mean(log_px_given_zy), np.mean(loss), np.mean(accurracy_labeled)
551 | 
552 | def test_epoch(eq_samples, iw_samples, batch_size):
553 |     n_test_batches = test_x.shape[0] / batch_size
554 |     costs,log_qz_given_xy,log_pz,log_px_given_zy,loss, accurracy = [],[],[],[],[],[]
555 |     for i in range(n_test_batches):
556 |         costs_batch, log_qz_given_xy_batch,log_pz_batch,log_px_given_zy_batch, loss_batch, accurracy_batch = test_model(i, batch_size, eq_samples, iw_samples)
557 |         costs += [costs_batch]
558 |         log_qz_given_xy += [log_qz_given_xy_batch]
559 |         log_pz += [log_pz_batch]
560 |         log_px_given_zy += [log_px_given_zy_batch]
561 |         loss += [loss_batch]
562 |         accurracy += [accurracy_batch]
563 |     return np.mean(costs), np.mean(log_qz_given_xy), np.mean(log_pz), np.mean(log_px_given_zy), np.mean(loss), np.mean(accurracy)
564 | 
565 | 
566 | print "Training"
567 | 
568 | # TRAIN LOOP
569 | LL_train, log_qz_given_x_train, log_pz_train, log_px_given_z_train, loss_train, acc_labeled_train = [],[],[],[],[],[]
570 | LL_test, log_qz_given_x_test, log_pz_test, log_px_given_z_test, loss_test, acc_test = [],[],[],[],[],[]
571 | 
572 | for epoch in range(1, 1+num_epochs):
573 |     start = time.time()
574 | 
575 |     # randomly permute data and labels
576 |     p_l = np.random.permutation(x_labelled.shape[0]) 
577 |     sh_x_train_labelled.set_value(x_labelled[p_l])
578 |     sh_t_train_labelled.set_value((y_labelled[p_l]))
579 |     p_u = np.random.permutation(x_unlabelled.shape[0]) 
580 |     sh_x_train_unlabelled.set_value(x_unlabelled[p_u])
581 |     if preprocesses_dataset is not None:
582 |         sh_x_train_labelled_preprocessed.set_value(preprocesses_dataset(x_labelled[p_l]))
583 |         sh_x_train_unlabelled_preprocessed.set_value(preprocesses_dataset(x_unlabelled[p_u]))
584 | 
585 |     train_out = train_epoch(lr, eq_samples, iw_samples, batch_size)
586 | 
587 |     if np.isnan(train_out[0]):
588 |         ValueError("NAN in train LL!")
589 | 
590 |     if epoch >= anneal_lr_epoch and epoch % every_anneal == 0:
591 |         #annealing learning rate
592 |         lr = lr*anneal_lr_factor
593 | 
594 |     if epoch % eval_epoch == 0:
595 |         t = time.time() - start
596 |         LL_train += [train_out[0]]
597 |         log_qz_given_x_train += [train_out[1]]
598 |         log_pz_train += [train_out[2]]
599 |         log_px_given_z_train += [train_out[3]]
600 |         loss_train +=[train_out[4]]
601 |         acc_labeled_train += [train_out[5]]
602 | 
603 |         print "calculating LL eq=1, iw=1"
604 |         test_out = test_epoch(eq_samples, iw_samples, batch_size=500)
605 |         LL_test += [test_out[0]]
606 |         log_qz_given_x_test += [test_out[1]]
607 |         log_pz_test += [test_out[2]]
608 |         log_px_given_z_test += [test_out[3]]
609 |         loss_test += [test_out[4]]
610 |         acc_test += [test_out[5]]
611 | 
612 |         line = "*Epoch=%d\tTime=%.2f\tLR=%.5f\n" %(epoch, t, lr) + \
613 |                "  TRAIN:\tGen_loss=%.5f\tlogq(z|x)=%.5f\tlogp(z)=%.5f\tlogp(x|z)=%.5f\tdis_loss=%.5f\tlabel_error=%.5f\n" %(LL_train[-1], log_qz_given_x_train[-1], log_pz_train[-1], log_px_given_z_train[-1], loss_train[-1], 1-acc_labeled_train[-1]) + \
614 |                "  EVAL-L1:\tGen_loss=%.5f\tlogq(z|x)=%.5f\tlogp(z)=%.5f\tlogp(x|z)=%.5f\tdis_loss=%.5f\terror=%.5f\n" %(LL_test[-1], log_qz_given_x_test[-1], log_pz_test[-1], log_px_given_z_test[-1], loss_test[-1], 1-acc_test[-1])
615 |         print line
616 |         with open(logfile,'a') as f:
617 |             f.write(line + "\n")
618 | 
619 |     # random generation for visualization
620 |     if epoch % vis_epoch == 0:
621 |         tail='-'+str(epoch)+'.png'
622 |         ran_y = np.int32(np.repeat(np.arange(num_classes), num_classes))
623 |         _x_mean, _x = generate(ran_y)
624 |         _x_mean = _x_mean.reshape((num_generation,-1))
625 |         _x = _x.reshape((num_generation,-1))
626 |         image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg, scale=generation_scale, 
627 |             save_path=os.path.join(res_out, 'mean'+tail))
628 | 
629 |     #save model
630 |     model_out = os.path.join(res_out, 'model')
631 |     if epoch % (vis_epoch*10) == 0:
632 |         if distribution == 'bernoulli':
633 |             all_params=lasagne.layers.get_all_params([classifier, l_dec_x_mu])
634 |         elif distribution == 'gaussian':
635 |             all_params=lasagne.layers.get_all_params([classifier, l_dec_x_mu, l_dec_x_log_var])
636 |         f = gzip.open(model_out + 'epoch%i'%(epoch), 'wb')
637 |         cPickle.dump(all_params, f, protocol=cPickle.HIGHEST_PROTOCOL)
638 |         f.close()


--------------------------------------------------------------------------------