├── .DS_Store ├── .gitignore ├── almost_log_gamma.m ├── autodisc ├── DexNet.py ├── LICENSE ├── MnistTests.py ├── NetTrainers.py ├── load_data.py ├── output_losses.py └── utils.py ├── basic_sear ├── FrankeNet.py ├── LICENSE ├── LICENSE.md ├── MnistTests.py ├── NetTrainers.py ├── load_data.py ├── output_losses.py └── utils.py ├── generalized_ear ├── ConvDemo.py ├── EarNet.py ├── LICENSE ├── MnistTests.py ├── NetLayers.py ├── NetTrainers.py ├── load_data.py ├── output_losses.py └── utils.py ├── generative_models ├── ADPair.py ├── AEDPair.py ├── BlocksAttention.py ├── BlocksModels.py ├── ClassModel.py ├── DKCode.py ├── GCPair.py ├── GIPair.py ├── GIPair2.py ├── GIStack.py ├── GITonGIP.py ├── GITrip.py ├── GPSImputer.py ├── HelperFuncs.py ├── HydraNet.py ├── InfNet.py ├── LICENSE ├── LogPDFs.py ├── MCSampler.py ├── MSDUtils.py ├── MnistTests.py ├── MnistWalkReg.py ├── MnistWalkoutTest.py ├── MultiStageModel.py ├── MultiStageModelSS.py ├── MultiStageModelSS2.py ├── NetLayers.py ├── OneStageModel.py ├── PeaNet.py ├── PeaNetSeq.py ├── SVHNWalkReg.py ├── SVHNWalkoutTest.py ├── TFDWalkoutTest.py ├── TempTests.py ├── TestBlocksCLModels.py ├── TestBlocksDDModels.py ├── TestBlocksESModels.py ├── TestBlocksImpModels.py ├── TestBlocksOLModels.py ├── TestClassModel.py ├── TestImpGPSI_MNIST.py ├── TestImpGPSI_SVHN.py ├── TestImpGPSI_TFD.py ├── TestImpTM.py ├── TestImpVAE.py ├── TestMSM.py ├── TestMSMSS.py ├── TestMSMSS2.py ├── TestTSM.py ├── TwoStageModel.py ├── VCGLoop.py ├── VideoUtils.py ├── WalkoutResults.py ├── blocks_models │ ├── attention.py │ ├── binarized_mnist_converter.py │ ├── lib │ │ ├── __init__.py │ │ ├── myutils.py │ │ └── prob_layers.py │ ├── models.py │ ├── plot-log.py │ ├── run-att-rw.py │ ├── sample.py │ ├── simple_script.sh │ ├── train-dotmatrix.py │ ├── train-draw.py │ └── train-imodraw.py ├── load_data.py ├── output_losses.py ├── result_parsing_script.py └── utils.py └── nlp ├── CorpusUtils.py ├── CythonFuncs.py ├── CythonFuncsPyx.pyx ├── DataLoaders.py ├── GPULayers.py ├── HelperFuncs.py ├── LICENSE.md ├── NLMLayers.py ├── NLModels.py ├── NumbaFuncs.py ├── TestCuBlas.py ├── gensim_code ├── GensimUtils.py ├── TestGensim.py ├── W2VInner.pyx └── W2VSimple.py ├── gnumpy.py ├── nlp_convnet ├── LNFuncs.py ├── LNLayers.py ├── LayerNets.py ├── STBTests.py └── StanfordTrees.py ├── npmat.py └── voidptr.h /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philip-Bachman/NN-Python/e9a7619806c5ccbe2bd648b2a2e0af7967dc6996/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py~ 2 | *.swp 3 | *.npy 4 | *.csv 5 | *.dat 6 | *.data 7 | *.npz 8 | *.pkl 9 | *.pkl.gz 10 | *.png 11 | *.pyc 12 | *.DS_STORE 13 | .DS_STORE 14 | *.DS_Store 15 | .DS_Store 16 | *.lprof 17 | /basic_sear/data/* 18 | /basic_sear/test_results/* 19 | /generalized_ear/data/* 20 | /generalized_ear/test_results/* 21 | /autodisc/data/* 22 | /nlp/trees/* 23 | /nlp/training_text/* 24 | -------------------------------------------------------------------------------- /almost_log_gamma.m: -------------------------------------------------------------------------------- 1 | % 2 | % APPROXIMATION FOR GAMMALN (I.E. 
THE LOG GAMMA FUNCTION) 3 | % 4 | small_approx = @( x, c ) log(1 ./ x) - (0.57721566490153 * x) + (c * x.^2); 5 | large_approx = @( x, c ) (((x-0.5) .* log(x)) - x) + 0.5*log(2*pi) + c*(1./x); 6 | 7 | X = linspace(0.01, 4.0, 500); 8 | Y = almost_gammaln(X,0.25,0.025); -------------------------------------------------------------------------------- /autodisc/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Philip Bachman 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | 21 | 22 | ******************************************************************************** 23 | * The copyright notice below comes from code which has been _heavily_ modified * 24 | * in the production of the code in this directory. * 25 | ******************************************************************************** 26 | 27 | 28 | Copyright (C) 2012 Misha Denil 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy of 31 | this software and associated documentation files (the "Software"), to deal in 32 | the Software without restriction, including without limitation the rights to 33 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 34 | of the Software, and to permit persons to whom the Software is furnished to do 35 | so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in all 38 | copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 46 | SOFTWARE. 
47 | 48 | -------------------------------------------------------------------------------- /autodisc/load_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cPickle 3 | import gzip 4 | import os 5 | import sys 6 | 7 | import theano 8 | import theano.tensor as T 9 | 10 | def _shared_dataset(data_xy): 11 | """ Function that loads the dataset into shared variables 12 | 13 | The reason we store our dataset in shared variables is to allow 14 | Theano to copy it into the GPU memory (when code is run on GPU). 15 | Since copying data into the GPU is slow, copying a minibatch everytime 16 | is needed (the default behaviour if the data is not in a shared 17 | variable) would lead to a large decrease in performance. 18 | """ 19 | data_x, data_y = data_xy 20 | shared_x = theano.shared(np.asarray(data_x, 21 | dtype=theano.config.floatX)) 22 | shared_y = theano.shared(np.asarray(data_y, 23 | dtype=theano.config.floatX)) 24 | # When storing data on the GPU it has to be stored as floats 25 | # therefore we will store the labels as ``floatX`` as well 26 | # (``shared_y`` does exactly that). 27 | return shared_x, shared_y 28 | 29 | def load_mnist(path, zero_mean=True): 30 | mnist = np.load(path) 31 | train_set_x = mnist['train_data'] 32 | train_set_y = mnist['train_labels'] + 1 33 | test_set_x = mnist['test_data'] 34 | test_set_y = mnist['test_labels'] + 1 35 | 36 | if zero_mean: 37 | obs_mean = np.mean(train_set_x, axis=0, keepdims=True) 38 | train_set_x = train_set_x - obs_mean 39 | test_set_x = test_set_x - obs_mean 40 | 41 | train_set_x, train_set_y = _shared_dataset((train_set_x, train_set_y)) 42 | test_set_x, test_set_y = _shared_dataset((test_set_x, test_set_y)) 43 | valid_set_x, valid_set_y = test_set_x, test_set_y 44 | 45 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 46 | (test_set_x, test_set_y)] 47 | return rval 48 | 49 | def load_udm_ss(dataset, sup_count, rng, zero_mean=True): 50 | """Load semi-supervised version of the standard UdM MNIST data. 51 | 52 | For this, the training data is split into labeled and unlabeled portions. 53 | The number of labeled examples is 'sup_count', and an equal number of 54 | labeled examples will be selected for each class. The remaining (50000 - 55 | sup_count) examples are provided as unlabeled training data. The validate 56 | and test sets are left unchanged. 57 | 58 | Note: labels for the normal digit classes will range from 1-10, i.e. +1 59 | compared to their standard value, as 'un-classed' examples take label 0. 
60 | """ 61 | 62 | udm_data = load_udm(dataset, as_shared=False, zero_mean=zero_mean) 63 | Xtr = udm_data[0][0] 64 | Ytr = udm_data[0][1][:,np.newaxis] 65 | 66 | all_count = Xtr.shape[0] 67 | pc_count = int(np.ceil(sup_count / 10.0)) 68 | sup_count = int(10 * pc_count) 69 | unsup_count = all_count - sup_count 70 | 71 | Xtr_su = [] 72 | Ytr_su = [] 73 | Xtr_un = [] 74 | Ytr_un = [] 75 | 76 | # Sample supervised and unsupervised subsets of each class' observations 77 | for c_label in np.unique(Ytr): 78 | c_idx = [i for i in range(all_count) if (Ytr[i] == c_label)] 79 | rng.shuffle(c_idx) 80 | Xtr_su.append(Xtr[c_idx[0:pc_count],:]) 81 | Ytr_su.append(Ytr[c_idx[0:pc_count],:]) 82 | Xtr_un.append(Xtr[c_idx[pc_count:],:]) 83 | Ytr_un.append(Ytr[c_idx[pc_count:],:]) 84 | 85 | # Stack per-class supervised/unsupervised splits into matrices 86 | Xtr_su = np.vstack(Xtr_su) 87 | Ytr_su = np.vstack(Ytr_su) 88 | Xtr_un = np.vstack(Xtr_un) 89 | Ytr_un = np.vstack(Ytr_un) 90 | # Also keep "unsupervised" copies of the "supervised" data 91 | Xtr_un = Xtr_un #np.vstack([Xtr_un, Xtr_su]) 92 | Ytr_un = 0 * Ytr_un #np.vstack([Ytr_un, Ytr_su]) 93 | 94 | # Shuffle the rows so that observations are not grouped by class 95 | shuf_idx = rng.permutation(Xtr_su.shape[0]) 96 | Xtr_su = Xtr_su[shuf_idx,:] 97 | Ytr_su = Ytr_su[shuf_idx].ravel() + 1 98 | shuf_idx = rng.permutation(Xtr_un.shape[0]) 99 | Xtr_un = Xtr_un[shuf_idx,:] 100 | Ytr_un = Ytr_un[shuf_idx].ravel() 101 | 102 | # Put matrices into GPU shared variables, for great justice 103 | Xtr_su, Ytr_su = _shared_dataset((Xtr_su, Ytr_su)) 104 | Xtr_un, Ytr_un = _shared_dataset((Xtr_un, Ytr_un)) 105 | Xva, Yva = _shared_dataset((udm_data[1][0], (udm_data[1][1] + 1))) 106 | Xte, Yte = _shared_dataset((udm_data[2][0], (udm_data[2][1] + 1))) 107 | 108 | rval = [(Xtr_su, Ytr_su), (Xtr_un, Ytr_un), (Xva, Yva), (Xte, Yte)] 109 | 110 | return rval 111 | 112 | def load_udm(dataset, as_shared=True, zero_mean=True): 113 | """Loads the UdM train/validate/test split of MNIST.""" 114 | 115 | ############# 116 | # LOAD DATA # 117 | ############# 118 | 119 | # Download the MNIST dataset if it is not present 120 | data_dir, data_file = os.path.split(dataset) 121 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 122 | import urllib 123 | origin = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 124 | print 'Downloading data from %s' % origin 125 | urllib.urlretrieve(origin, dataset) 126 | 127 | print '... loading data' 128 | 129 | # Load the dataset 130 | f = gzip.open(dataset, 'rb') 131 | train_set, valid_set, test_set = cPickle.load(f) 132 | f.close() 133 | #train_set, valid_set, test_set format: tuple(input, target) 134 | #input is an np.ndarray of 2 dimensions (a matrix) 135 | #witch row's correspond to an example. target is a 136 | #np.ndarray of 1 dimensions (vector)) that have the same length as 137 | #the number of rows in the input. It should give the target 138 | #target to the example with the same index in the input. 
139 | train_set = [v for v in train_set] 140 | valid_set = [v for v in valid_set] 141 | test_set = [v for v in test_set] 142 | train_set[0] = np.asarray(train_set[0]).astype(np.float32) 143 | valid_set[0] = np.asarray(valid_set[0]).astype(np.float32) 144 | test_set[0] = np.asarray(test_set[0]).astype(np.float32) 145 | if zero_mean: 146 | obs_mean = np.mean(train_set[0], axis=0, keepdims=True) 147 | train_set[0] = train_set[0] - obs_mean 148 | valid_set[0] = valid_set[0] - obs_mean 149 | test_set[0] = test_set[0] - obs_mean 150 | if as_shared: 151 | test_set_x, test_set_y = _shared_dataset((test_set[0],test_set[1]+1)) 152 | valid_set_x, valid_set_y = _shared_dataset((valid_set[0],valid_set[1]+1)) 153 | train_set_x, train_set_y = _shared_dataset((train_set[0],train_set[1]+1)) 154 | else: 155 | test_set_x, test_set_y = test_set 156 | valid_set_x, valid_set_y = valid_set 157 | train_set_x, train_set_y = train_set 158 | 159 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 160 | (test_set_x, test_set_y)] 161 | return rval 162 | 163 | -------------------------------------------------------------------------------- /autodisc/output_losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | class LogisticRegression(object): 6 | """Multi-class Logistic Regression loss dangler.""" 7 | 8 | def __init__(self, linear_layer): 9 | """Dangle a logistic regression from the given linear layer. 10 | 11 | The given linear layer should be a HiddenLayer (or subclass) object, 12 | for HiddenLayer as defined in LayerNet.py.""" 13 | self.input_layer = linear_layer 14 | 15 | def loss_func(self, y): 16 | """Return the multiclass logistic regression loss for y. 17 | 18 | The class labels in y are assumed to be in correspondence with the 19 | set of column indices for self.input_layer.linear_output. 20 | """ 21 | p_y_given_x = T.nnet.softmax(self.input_layer.linear_output) 22 | loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]),y]) 23 | return loss 24 | 25 | def errors(self, y): 26 | """Compute the number of wrong predictions by self.input_layer. 27 | 28 | Predicted class labels are computed as the indices of the columns of 29 | self.input_layer.linear_output which are maximal. Wrong predictions are 30 | those for which max indices do not match their corresponding y values. 31 | """ 32 | # Compute class memberships predicted by self.input_layer 33 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 34 | errs = 0 35 | # check if y has same dimension of y_pred 36 | if y.ndim != y_pred.ndim: 37 | raise TypeError('y should have the same shape as self.y_pred', 38 | ('y', y.type, 'y_pred', y_pred.type)) 39 | # check if y is of the correct datatype 40 | if y.dtype.startswith('int'): 41 | # the T.neq operator returns a vector of 0s and 1s, where 1 42 | # represents a mistake in prediction 43 | errs = T.sum(T.neq(y_pred, y)) 44 | else: 45 | raise NotImplementedError() 46 | return errs 47 | 48 | class LogRegSS(object): 49 | """Multi-class semi-supervised Logistic Regression loss dangler.""" 50 | 51 | def __init__(self, linear_layer): 52 | """Dangle a logistic regression from the given linear layer. 
53 | 54 | The given linear layer should be a HiddenLayer (or subclass) object, 55 | for HiddenLayer as defined in LayerNet.py.""" 56 | self.input_layer = linear_layer 57 | 58 | def safe_softmax_ss(self, x): 59 | """Softmax that shouldn't overflow.""" 60 | e_x = T.exp(x - T.max(x, axis=1, keepdims=True)) 61 | x_sm = e_x / T.sum(e_x, axis=1, keepdims=True) 62 | return x_sm 63 | 64 | def loss_func(self, y): 65 | """Return the multiclass logistic regression loss for y. 66 | 67 | The class labels in y are assumed to be in correspondence with the 68 | set of column indices for self.input_layer.linear_output. 69 | """ 70 | row_idx = T.arange(y.shape[0]) 71 | row_mask = T.neq(y, 0).reshape((y.shape[0], 1)) 72 | p_y_given_x = self.safe_softmax_ss(self.input_layer.linear_output) 73 | wacky_mat = (p_y_given_x * row_mask) + (1. - row_mask) 74 | loss = -T.sum(T.log(wacky_mat[row_idx,y])) / T.sum(row_mask) 75 | return loss 76 | 77 | def errors(self, y): 78 | """Compute the number of wrong predictions by self.input_layer. 79 | 80 | Predicted class labels are computed as the indices of the columns of 81 | self.input_layer.linear_output which are maximal. Wrong predictions are 82 | those for which max indices do not match their corresponding y values. 83 | """ 84 | # Compute class memberships predicted by self.input_layer 85 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 86 | y_pred = y_pred + 1 87 | errs = 0 88 | # check if y has same dimension of y_pred 89 | if y.ndim != y_pred.ndim: 90 | raise TypeError('y should have the same shape as self.y_pred', 91 | ('y', y.type, 'y_pred', y_pred.type)) 92 | # check if y is of the correct datatype 93 | if y.dtype.startswith('int'): 94 | # the T.neq operator returns a vector of 0s and 1s, where 1 95 | # represents a mistake in prediction 96 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 97 | else: 98 | raise NotImplementedError() 99 | return errs 100 | 101 | class MCL2Hinge(object): 102 | """Multi-class one-vs-all L2 hinge loss dangler.""" 103 | 104 | def __init__(self, linear_layer): 105 | """Dangle a squred hinge loss from the given linear layer. 106 | 107 | The given linear layer should be a HiddenLayer (or subclass) object, 108 | for HiddenLayer as defined in LayerNet.py.""" 109 | self.input_layer = linear_layer 110 | 111 | def loss_func(self, y): 112 | """Return the multiclass squared hinge loss for y. 113 | 114 | The class labels in y are assumed to be in correspondence with the 115 | set of column indices for self.input_layer.linear_output. 116 | """ 117 | y_hat = self.input_layer.linear_output 118 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) 119 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) 120 | obs_idx = T.arange(y.shape[0]) 121 | loss_pos = T.sum(margin_pos[obs_idx,y]**2.0) 122 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[obs_idx,y]**2.0) 123 | loss = (loss_pos + loss_neg) / y.shape[0] 124 | return loss 125 | 126 | def errors(self, y): 127 | """Compute the number of wrong predictions by self.input_layer. 128 | 129 | Predicted class labels are computed as the indices of the columns of 130 | self.input_layer.linear_output which are maximal. Wrong predictions are 131 | those for which max indices do not match their corresponding y values. 
132 | """ 133 | # Compute class memberships predicted by self.input_layer 134 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 135 | errs = 0 136 | # check if y has same dimension of y_pred 137 | if y.ndim != y_pred.ndim: 138 | raise TypeError('y should have the same shape as self.y_pred', 139 | ('y', y.type, 'y_pred', y_pred.type)) 140 | # check if y is of the correct datatype 141 | if y.dtype.startswith('int'): 142 | # the T.neq operator returns a vector of 0s and 1s, where 1 143 | # represents a mistake in prediction 144 | errs = T.sum(T.neq(y_pred, y)) 145 | else: 146 | raise NotImplementedError() 147 | return errs 148 | 149 | class MCL2HingeSS(object): 150 | """Multi-class one-vs-all L2 hinge loss dangler. 151 | 152 | For this loss, class index 0 is never penalized, and errors for inputs 153 | with class index 0 are similarly ignored. This is for semi-supervised 154 | training, constrained by Theano's programming model.""" 155 | 156 | def __init__(self, linear_layer): 157 | """Dangle a squred hinge loss from the given linear layer. 158 | 159 | The given linear layer should be a HiddenLayer (or subclass) object, 160 | for HiddenLayer as defined in LayerNet.py.""" 161 | self.input_layer = linear_layer 162 | 163 | def loss_func(self, y): 164 | """Return the multiclass squared hinge loss for y. 165 | 166 | The class labels in y are assumed to be in correspondence with the 167 | set of column indices for self.input_layer.linear_output. 168 | """ 169 | y_hat = self.input_layer.linear_output 170 | row_idx = T.arange(y.shape[0]) 171 | row_mask = T.neq(y, 0).reshape((y_hat.shape[0], 1)) 172 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) * row_mask 173 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) * row_mask 174 | loss_pos = T.sum(margin_pos[row_idx,y]**2.0) 175 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[row_idx,y]**2.0) 176 | loss = (loss_pos + loss_neg) / T.sum(row_mask) 177 | return loss 178 | 179 | def errors(self, y): 180 | """Compute the number of wrong predictions by self.input_layer. 181 | 182 | Predicted class labels are computed as the indices of the columns of 183 | self.input_layer.linear_output which are maximal. Wrong predictions are 184 | those for which max indices do not match their corresponding y values. 185 | """ 186 | # Compute class memberships predicted by self.input_layer 187 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 188 | y_pred = y_pred + 1 189 | errs = 0 190 | # check if y has same dimension of y_pred 191 | if y.ndim != y_pred.ndim: 192 | raise TypeError('y should have the same shape as self.y_pred', 193 | ('y', y.type, 'y_pred', y_pred.type)) 194 | # check if y is of the correct datatype 195 | if y.dtype.startswith('int'): 196 | # the T.neq operator returns a vector of 0s and 1s, where 1 197 | # represents a mistake in prediction 198 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 199 | else: 200 | raise NotImplementedError() 201 | return errs 202 | -------------------------------------------------------------------------------- /autodisc/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 
7 | """ 8 | 9 | import numpy as np 10 | import pylab as plt 11 | import PIL as PIL 12 | 13 | class batch(object): 14 | def __init__(self,batch_size): 15 | self.batch_size = batch_size 16 | 17 | def __call__(self,f): 18 | def wrapper(t,X): 19 | X = np.array(X) 20 | p = 0 21 | rem = 0 22 | results = [] 23 | while p < len(X): 24 | Z = X[p:p+self.batch_size] 25 | if Z.shape[0] != self.batch_size: 26 | zeros = np.zeros((self.batch_size-len(Z),X.shape[1])) 27 | rem = len(Z) 28 | Z = np.array(np.vstack((Z,zeros)),dtype=X.dtype) 29 | 30 | temp_results = f(t,Z) 31 | if rem != 0: 32 | temp_results = temp_results[:rem] 33 | 34 | results.extend(temp_results) 35 | p += self.batch_size 36 | return np.array(results,dtype='float32') 37 | return wrapper 38 | 39 | def scale_to_unit_interval(ndar, eps=1e-8): 40 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 41 | ndar = ndar.copy() 42 | ndar -= ndar.min() 43 | ndar *= 1.0 / (ndar.max() + eps) 44 | return ndar 45 | 46 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 47 | scale_rows_to_unit_interval=True, 48 | output_pixel_vals=True): 49 | """ 50 | Transform an array with one flattened image per row, into an array in 51 | which images are reshaped and layed out like tiles on a floor. 52 | 53 | This function is useful for visualizing datasets whose rows are images, 54 | and also columns of matrices for transforming those rows 55 | (such as the first layer of a neural net). 56 | 57 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 58 | be 2-D ndarrays or None; 59 | :param X: a 2-D array in which every row is a flattened image. 60 | 61 | :type img_shape: tuple; (height, width) 62 | :param img_shape: the original shape of each image 63 | 64 | :type tile_shape: tuple; (rows, cols) 65 | :param tile_shape: the number of images to tile (rows, cols) 66 | 67 | :param output_pixel_vals: if output should be pixel values (i.e. int8 68 | values) or floats 69 | 70 | :param scale_rows_to_unit_interval: if the values need to be scaled before 71 | being plotted to [0,1] or not 72 | 73 | 74 | :returns: array suitable for viewing as an image. 75 | (See:`PIL.Image.fromarray`.) 76 | :rtype: a 2-d array with same dtype as X. 77 | 78 | """ 79 | 80 | assert len(img_shape) == 2 81 | assert len(tile_shape) == 2 82 | assert len(tile_spacing) == 2 83 | 84 | # The expression below can be re-written in a more C style as 85 | # follows : 86 | # 87 | # out_shape = [0,0] 88 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 89 | # tile_spacing[0] 90 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 91 | # tile_spacing[1] 92 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 93 | in zip(img_shape, tile_shape, tile_spacing)] 94 | 95 | if isinstance(X, tuple): 96 | assert len(X) == 4 97 | # Create an output numpy ndarray to store the image 98 | if output_pixel_vals: 99 | out_array = np.zeros((out_shape[0], out_shape[1], 4), 100 | dtype='uint8') 101 | else: 102 | out_array = np.zeros((out_shape[0], out_shape[1], 4), 103 | dtype=X.dtype) 104 | 105 | #colors default to 0, alpha defaults to 1 (opaque) 106 | if output_pixel_vals: 107 | channel_defaults = [0, 0, 0, 255] 108 | else: 109 | channel_defaults = [0., 0., 0., 1.] 
110 | 111 | for i in xrange(4): 112 | if X[i] is None: 113 | # if channel is None, fill it with zeros of the correct 114 | # dtype 115 | dt = out_array.dtype 116 | if output_pixel_vals: 117 | dt = 'uint8' 118 | out_array[:, :, i] = np.zeros(out_shape, 119 | dtype=dt) + channel_defaults[i] 120 | else: 121 | # use a recurrent call to compute the channel and store it 122 | # in the output 123 | out_array[:, :, i] = tile_raster_images( 124 | X[i], img_shape, tile_shape, tile_spacing, 125 | scale_rows_to_unit_interval, output_pixel_vals) 126 | return out_array 127 | else: 128 | # if we are dealing with only one channel 129 | H, W = img_shape 130 | Hs, Ws = tile_spacing 131 | # generate a matrix to store the output 132 | dt = X.dtype 133 | if output_pixel_vals: 134 | dt = 'uint8' 135 | out_array = np.zeros(out_shape, dtype=dt) 136 | for tile_row in xrange(tile_shape[0]): 137 | for tile_col in xrange(tile_shape[1]): 138 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 139 | this_x = X[tile_row * tile_shape[1] + tile_col] 140 | if scale_rows_to_unit_interval: 141 | # if we should scale values to be between 0 and 1 142 | # do this by calling the `scale_to_unit_interval` 143 | # function 144 | this_img = scale_to_unit_interval( 145 | this_x.reshape(img_shape)) 146 | else: 147 | this_img = this_x.reshape(img_shape) 148 | # add the slice to the corresponding position in the 149 | # output array 150 | c = 1 151 | if output_pixel_vals: 152 | c = 255 153 | out_array[ 154 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 155 | tile_col * (W + Ws): tile_col * (W + Ws) + W 156 | ] = this_img * c 157 | return out_array 158 | 159 | 160 | def plot_histograms(firings): 161 | N = int(np.ceil(np.sqrt(firings.shape[1]))) 162 | plt.figure(figsize=(N,N)) 163 | axisNum = 0 164 | for row in range(N): 165 | for col in range(N): 166 | axisNum += 1 167 | ax = plt.subplot(N, N, axisNum) 168 | ax.set_xticklabels([]) 169 | ax.set_yticklabels([]) 170 | plt.hist(firings[:,row*N+col],bins=50) 171 | plt.show() 172 | return 173 | 174 | def visualize(EN, proto_key, layer_num, file_name): 175 | W = EN.proto_nets[proto_key][layer_num].W.get_value(borrow=True).T 176 | size = int(np.sqrt(W.shape[1])) 177 | # hist(W.flatten(),bins=50) 178 | image = PIL.Image.fromarray(tile_raster_images(X=W, \ 179 | img_shape=(size, size), tile_shape=(10,W.shape[0]/10),tile_spacing=(1, 1))) 180 | image.save(file_name) 181 | return 182 | 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /basic_sear/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Philip Bachman 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | 21 | 22 | ******************************************************************************** 23 | * The copyright notice below comes from code which has been _heavily_ modified * 24 | * in the production of the code in this directory. * 25 | ******************************************************************************** 26 | 27 | 28 | Copyright (C) 2012 Misha Denil 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy of 31 | this software and associated documentation files (the "Software"), to deal in 32 | the Software without restriction, including without limitation the rights to 33 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 34 | of the Software, and to permit persons to whom the Software is furnished to do 35 | so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in all 38 | copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 46 | SOFTWARE. 47 | 48 | -------------------------------------------------------------------------------- /basic_sear/LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) {{{2014}}} {{{Philip Bachman}}} 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /basic_sear/load_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cPickle 3 | import gzip 4 | import os 5 | import sys 6 | 7 | import theano 8 | import theano.tensor as T 9 | 10 | def _shared_dataset(data_xy): 11 | """ Function that loads the dataset into shared variables 12 | 13 | The reason we store our dataset in shared variables is to allow 14 | Theano to copy it into the GPU memory (when code is run on GPU). 15 | Since copying data into the GPU is slow, copying a minibatch everytime 16 | is needed (the default behaviour if the data is not in a shared 17 | variable) would lead to a large decrease in performance. 18 | """ 19 | data_x, data_y = data_xy 20 | shared_x = theano.shared(np.asarray(data_x, 21 | dtype=theano.config.floatX)) 22 | shared_y = theano.shared(np.asarray(data_y, 23 | dtype=theano.config.floatX)) 24 | # When storing data on the GPU it has to be stored as floats 25 | # therefore we will store the labels as ``floatX`` as well 26 | # (``shared_y`` does exactly that). 27 | return shared_x, shared_y 28 | 29 | def load_mnist(path): 30 | mnist = np.load(path) 31 | train_set_x = mnist['train_data'] 32 | train_set_y = mnist['train_labels'] + 1 33 | test_set_x = mnist['test_data'] 34 | test_set_y = mnist['test_labels'] + 1 35 | 36 | train_set_x, train_set_y = _shared_dataset((train_set_x, train_set_y)) 37 | test_set_x, test_set_y = _shared_dataset((test_set_x, test_set_y)) 38 | valid_set_x, valid_set_y = test_set_x, test_set_y 39 | 40 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 41 | (test_set_x, test_set_y)] 42 | return rval 43 | 44 | def load_udm_ss(dataset, sup_count, rng): 45 | """Load semi-supervised version of the standard UdM MNIST data. 46 | 47 | For this, the training data is split into labeled and unlabeled portions. 48 | The number of labeled examples is 'sup_count', and an equal number of 49 | labeled examples will be selected for each class. The remaining (50000 - 50 | sup_count) examples are provided as unlabeled training data. The validate 51 | and test sets are left unchanged. 52 | 53 | Note: labels for the normal digit classes will range from 1-10, i.e. +1 54 | compared to their standard value, as 'un-classed' examples take label 0. 
55 | """ 56 | 57 | udm_data = load_udm(dataset,as_shared=False) 58 | Xtr = udm_data[0][0] 59 | Ytr = udm_data[0][1][:,np.newaxis] 60 | 61 | all_count = Xtr.shape[0] 62 | pc_count = int(np.ceil(sup_count / 10.0)) 63 | sup_count = int(10 * pc_count) 64 | unsup_count = all_count - sup_count 65 | 66 | Xtr_su = [] 67 | Ytr_su = [] 68 | Xtr_un = [] 69 | Ytr_un = [] 70 | 71 | # Sample supervised and unsupervised subsets of each class' observations 72 | for c_label in np.unique(Ytr): 73 | c_idx = [i for i in range(all_count) if (Ytr[i] == c_label)] 74 | rng.shuffle(c_idx) 75 | Xtr_su.append(Xtr[c_idx[0:pc_count],:]) 76 | Ytr_su.append(Ytr[c_idx[0:pc_count],:]) 77 | Xtr_un.append(Xtr[c_idx[pc_count:],:]) 78 | Ytr_un.append(Ytr[c_idx[pc_count:],:]) 79 | 80 | # Stack per-class supervised/unsupervised splits into matrices 81 | Xtr_su = np.vstack(Xtr_su) 82 | Ytr_su = np.vstack(Ytr_su) 83 | Xtr_un = np.vstack(Xtr_un) 84 | Ytr_un = np.vstack(Ytr_un) 85 | # Also keep "unsupervised" copies of the "supervised" data 86 | Xtr_un = Xtr_un #np.vstack([Xtr_un, Xtr_su]) 87 | Ytr_un = 0 * Ytr_un #np.vstack([Ytr_un, Ytr_su]) 88 | 89 | # Shuffle the rows so that observations are not grouped by class 90 | shuf_idx = rng.permutation(Xtr_su.shape[0]) 91 | Xtr_su = Xtr_su[shuf_idx,:] 92 | Ytr_su = Ytr_su[shuf_idx].ravel() + 1 93 | shuf_idx = rng.permutation(Xtr_un.shape[0]) 94 | Xtr_un = Xtr_un[shuf_idx,:] 95 | Ytr_un = Ytr_un[shuf_idx].ravel() 96 | 97 | # Put matrices into GPU shared variables, for great justice 98 | Xtr_su, Ytr_su = _shared_dataset((Xtr_su, Ytr_su)) 99 | Xtr_un, Ytr_un = _shared_dataset((Xtr_un, Ytr_un)) 100 | Xva, Yva = _shared_dataset((udm_data[1][0], (udm_data[1][1] + 1))) 101 | Xte, Yte = _shared_dataset((udm_data[2][0], (udm_data[2][1] + 1))) 102 | 103 | rval = [(Xtr_su, Ytr_su), (Xtr_un, Ytr_un), (Xva, Yva), (Xte, Yte)] 104 | 105 | return rval 106 | 107 | def load_udm(dataset, as_shared=True): 108 | """Loads the UdM train/validate/test split of MNIST.""" 109 | 110 | ############# 111 | # LOAD DATA # 112 | ############# 113 | 114 | # Download the MNIST dataset if it is not present 115 | data_dir, data_file = os.path.split(dataset) 116 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 117 | import urllib 118 | origin = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 119 | print 'Downloading data from %s' % origin 120 | urllib.urlretrieve(origin, dataset) 121 | 122 | print '... loading data' 123 | 124 | # Load the dataset 125 | f = gzip.open(dataset, 'rb') 126 | train_set, valid_set, test_set = cPickle.load(f) 127 | f.close() 128 | #train_set, valid_set, test_set format: tuple(input, target) 129 | #input is an np.ndarray of 2 dimensions (a matrix) 130 | #witch row's correspond to an example. target is a 131 | #np.ndarray of 1 dimensions (vector)) that have the same length as 132 | #the number of rows in the input. It should give the target 133 | #target to the example with the same index in the input. 
134 | train_set = [v for v in train_set] 135 | valid_set = [v for v in valid_set] 136 | test_set = [v for v in test_set] 137 | train_set[0] = np.asarray(train_set[0]).astype(np.float32) 138 | valid_set[0] = np.asarray(valid_set[0]).astype(np.float32) 139 | test_set[0] = np.asarray(test_set[0]).astype(np.float32) 140 | obs_mean = 1.0 * np.mean(train_set[0], axis=0, keepdims=True) 141 | train_set[0] = train_set[0] - obs_mean 142 | valid_set[0] = valid_set[0] - obs_mean 143 | test_set[0] = test_set[0] - obs_mean 144 | if as_shared: 145 | test_set_x, test_set_y = _shared_dataset((test_set[0],test_set[1]+1)) 146 | valid_set_x, valid_set_y = _shared_dataset((valid_set[0],valid_set[1]+1)) 147 | train_set_x, train_set_y = _shared_dataset((train_set[0],train_set[1]+1)) 148 | else: 149 | test_set_x, test_set_y = test_set 150 | valid_set_x, valid_set_y = valid_set 151 | train_set_x, train_set_y = train_set 152 | 153 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 154 | (test_set_x, test_set_y)] 155 | return rval 156 | 157 | -------------------------------------------------------------------------------- /basic_sear/output_losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | class LogisticRegression(object): 6 | """Multi-class Logistic Regression loss dangler.""" 7 | 8 | def __init__(self, linear_layer): 9 | """Dangle a logistic regression from the given linear layer. 10 | 11 | The given linear layer should be a HiddenLayer (or subclass) object, 12 | for HiddenLayer as defined in LayerNet.py.""" 13 | self.input_layer = linear_layer 14 | 15 | def loss_func(self, y): 16 | """Return the multiclass logistic regression loss for y. 17 | 18 | The class labels in y are assumed to be in correspondence with the 19 | set of column indices for self.input_layer.linear_output. 20 | """ 21 | p_y_given_x = T.nnet.softmax(self.input_layer.linear_output) 22 | loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]),y]) 23 | return loss 24 | 25 | def errors(self, y): 26 | """Compute the number of wrong predictions by self.input_layer. 27 | 28 | Predicted class labels are computed as the indices of the columns of 29 | self.input_layer.linear_output which are maximal. Wrong predictions are 30 | those for which max indices do not match their corresponding y values. 31 | """ 32 | # Compute class memberships predicted by self.input_layer 33 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 34 | errs = 0 35 | # check if y has same dimension of y_pred 36 | if y.ndim != y_pred.ndim: 37 | raise TypeError('y should have the same shape as self.y_pred', 38 | ('y', y.type, 'y_pred', y_pred.type)) 39 | # check if y is of the correct datatype 40 | if y.dtype.startswith('int'): 41 | # the T.neq operator returns a vector of 0s and 1s, where 1 42 | # represents a mistake in prediction 43 | errs = T.sum(T.neq(y_pred, y)) 44 | else: 45 | raise NotImplementedError() 46 | return errs 47 | 48 | class LogRegSS(object): 49 | """Multi-class semi-supervised Logistic Regression loss dangler.""" 50 | 51 | def __init__(self, linear_layer): 52 | """Dangle a logistic regression from the given linear layer. 
53 | 54 | The given linear layer should be a HiddenLayer (or subclass) object, 55 | for HiddenLayer as defined in LayerNet.py.""" 56 | self.input_layer = linear_layer 57 | 58 | def safe_softmax_ss(self, x): 59 | """Softmax that shouldn't overflow.""" 60 | e_x = T.exp(x - T.max(x, axis=1, keepdims=True)) 61 | x_sm = e_x / T.sum(e_x, axis=1, keepdims=True) 62 | return x_sm 63 | 64 | def loss_func(self, y): 65 | """Return the multiclass logistic regression loss for y. 66 | 67 | The class labels in y are assumed to be in correspondence with the 68 | set of column indices for self.input_layer.linear_output. 69 | """ 70 | row_idx = T.arange(y.shape[0]) 71 | row_mask = T.neq(y, 0).reshape((y.shape[0], 1)) 72 | p_y_given_x = self.safe_softmax_ss(self.input_layer.linear_output) 73 | wacky_mat = (p_y_given_x * row_mask) + (1. - row_mask) 74 | loss = -T.sum(T.log(wacky_mat[row_idx,y])) / T.sum(row_mask) 75 | return loss 76 | 77 | def errors(self, y): 78 | """Compute the number of wrong predictions by self.input_layer. 79 | 80 | Predicted class labels are computed as the indices of the columns of 81 | self.input_layer.linear_output which are maximal. Wrong predictions are 82 | those for which max indices do not match their corresponding y values. 83 | """ 84 | # Compute class memberships predicted by self.input_layer 85 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 86 | y_pred = y_pred + 1 87 | errs = 0 88 | # check if y has same dimension of y_pred 89 | if y.ndim != y_pred.ndim: 90 | raise TypeError('y should have the same shape as self.y_pred', 91 | ('y', y.type, 'y_pred', y_pred.type)) 92 | # check if y is of the correct datatype 93 | if y.dtype.startswith('int'): 94 | # the T.neq operator returns a vector of 0s and 1s, where 1 95 | # represents a mistake in prediction 96 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 97 | else: 98 | raise NotImplementedError() 99 | return errs 100 | 101 | class MCL2Hinge(object): 102 | """Multi-class one-vs-all L2 hinge loss dangler.""" 103 | 104 | def __init__(self, linear_layer): 105 | """Dangle a squred hinge loss from the given linear layer. 106 | 107 | The given linear layer should be a HiddenLayer (or subclass) object, 108 | for HiddenLayer as defined in LayerNet.py.""" 109 | self.input_layer = linear_layer 110 | 111 | def loss_func(self, y): 112 | """Return the multiclass squared hinge loss for y. 113 | 114 | The class labels in y are assumed to be in correspondence with the 115 | set of column indices for self.input_layer.linear_output. 116 | """ 117 | y_hat = self.input_layer.linear_output 118 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) 119 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) 120 | obs_idx = T.arange(y.shape[0]) 121 | loss_pos = T.sum(margin_pos[obs_idx,y]**2.0) 122 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[obs_idx,y]**2.0) 123 | loss = (loss_pos + loss_neg) / y.shape[0] 124 | return loss 125 | 126 | def errors(self, y): 127 | """Compute the number of wrong predictions by self.input_layer. 128 | 129 | Predicted class labels are computed as the indices of the columns of 130 | self.input_layer.linear_output which are maximal. Wrong predictions are 131 | those for which max indices do not match their corresponding y values. 
132 | """ 133 | # Compute class memberships predicted by self.input_layer 134 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 135 | errs = 0 136 | # check if y has same dimension of y_pred 137 | if y.ndim != y_pred.ndim: 138 | raise TypeError('y should have the same shape as self.y_pred', 139 | ('y', y.type, 'y_pred', y_pred.type)) 140 | # check if y is of the correct datatype 141 | if y.dtype.startswith('int'): 142 | # the T.neq operator returns a vector of 0s and 1s, where 1 143 | # represents a mistake in prediction 144 | errs = T.sum(T.neq(y_pred, y)) 145 | else: 146 | raise NotImplementedError() 147 | return errs 148 | 149 | class MCL2HingeSS(object): 150 | """Multi-class one-vs-all L2 hinge loss dangler. 151 | 152 | For this loss, class index 0 is never penalized, and errors for inputs 153 | with class index 0 are similarly ignored. This is for semi-supervised 154 | training, constrained by Theano's programming model.""" 155 | 156 | def __init__(self, linear_layer): 157 | """Dangle a squred hinge loss from the given linear layer. 158 | 159 | The given linear layer should be a HiddenLayer (or subclass) object, 160 | for HiddenLayer as defined in LayerNet.py.""" 161 | self.input_layer = linear_layer 162 | 163 | def loss_func(self, y): 164 | """Return the multiclass squared hinge loss for y. 165 | 166 | The class labels in y are assumed to be in correspondence with the 167 | set of column indices for self.input_layer.linear_output. 168 | """ 169 | y_hat = self.input_layer.linear_output 170 | row_idx = T.arange(y.shape[0]) 171 | row_mask = T.neq(y, 0).reshape((y_hat.shape[0], 1)) 172 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) * row_mask 173 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) * row_mask 174 | loss_pos = T.sum(margin_pos[row_idx,y]**2.0) 175 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[row_idx,y]**2.0) 176 | loss = (loss_pos + loss_neg) / T.sum(row_mask) 177 | return loss 178 | 179 | def errors(self, y): 180 | """Compute the number of wrong predictions by self.input_layer. 181 | 182 | Predicted class labels are computed as the indices of the columns of 183 | self.input_layer.linear_output which are maximal. Wrong predictions are 184 | those for which max indices do not match their corresponding y values. 185 | """ 186 | # Compute class memberships predicted by self.input_layer 187 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 188 | y_pred = y_pred + 1 189 | errs = 0 190 | # check if y has same dimension of y_pred 191 | if y.ndim != y_pred.ndim: 192 | raise TypeError('y should have the same shape as self.y_pred', 193 | ('y', y.type, 'y_pred', y_pred.type)) 194 | # check if y is of the correct datatype 195 | if y.dtype.startswith('int'): 196 | # the T.neq operator returns a vector of 0s and 1s, where 1 197 | # represents a mistake in prediction 198 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 199 | else: 200 | raise NotImplementedError() 201 | return errs 202 | -------------------------------------------------------------------------------- /basic_sear/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 
7 | """ 8 | 9 | import numpy as np 10 | import pylab as plt 11 | import PIL as PIL 12 | 13 | class batch(object): 14 | def __init__(self,batch_size): 15 | self.batch_size = batch_size 16 | 17 | def __call__(self,f): 18 | def wrapper(t,X): 19 | X = np.array(X) 20 | p = 0 21 | rem = 0 22 | results = [] 23 | while p < len(X): 24 | Z = X[p:p+self.batch_size] 25 | if Z.shape[0] != self.batch_size: 26 | zeros = np.zeros((self.batch_size-len(Z),X.shape[1])) 27 | rem = len(Z) 28 | Z = np.array(np.vstack((Z,zeros)),dtype=X.dtype) 29 | 30 | temp_results = f(t,Z) 31 | if rem != 0: 32 | temp_results = temp_results[:rem] 33 | 34 | results.extend(temp_results) 35 | p += self.batch_size 36 | return np.array(results,dtype='float32') 37 | return wrapper 38 | 39 | def scale_to_unit_interval(ndar, eps=1e-8): 40 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 41 | ndar = ndar.copy() 42 | ndar -= ndar.min() 43 | ndar *= 1.0 / (ndar.max() + eps) 44 | return ndar 45 | 46 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 47 | scale_rows_to_unit_interval=True, 48 | output_pixel_vals=True): 49 | """ 50 | Transform an array with one flattened image per row, into an array in 51 | which images are reshaped and layed out like tiles on a floor. 52 | 53 | This function is useful for visualizing datasets whose rows are images, 54 | and also columns of matrices for transforming those rows 55 | (such as the first layer of a neural net). 56 | 57 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 58 | be 2-D ndarrays or None; 59 | :param X: a 2-D array in which every row is a flattened image. 60 | 61 | :type img_shape: tuple; (height, width) 62 | :param img_shape: the original shape of each image 63 | 64 | :type tile_shape: tuple; (rows, cols) 65 | :param tile_shape: the number of images to tile (rows, cols) 66 | 67 | :param output_pixel_vals: if output should be pixel values (i.e. int8 68 | values) or floats 69 | 70 | :param scale_rows_to_unit_interval: if the values need to be scaled before 71 | being plotted to [0,1] or not 72 | 73 | 74 | :returns: array suitable for viewing as an image. 75 | (See:`PIL.Image.fromarray`.) 76 | :rtype: a 2-d array with same dtype as X. 77 | 78 | """ 79 | 80 | assert len(img_shape) == 2 81 | assert len(tile_shape) == 2 82 | assert len(tile_spacing) == 2 83 | 84 | # The expression below can be re-written in a more C style as 85 | # follows : 86 | # 87 | # out_shape = [0,0] 88 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 89 | # tile_spacing[0] 90 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 91 | # tile_spacing[1] 92 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 93 | in zip(img_shape, tile_shape, tile_spacing)] 94 | 95 | if isinstance(X, tuple): 96 | assert len(X) == 4 97 | # Create an output numpy ndarray to store the image 98 | if output_pixel_vals: 99 | out_array = np.zeros((out_shape[0], out_shape[1], 4), 100 | dtype='uint8') 101 | else: 102 | out_array = np.zeros((out_shape[0], out_shape[1], 4), 103 | dtype=X.dtype) 104 | 105 | #colors default to 0, alpha defaults to 1 (opaque) 106 | if output_pixel_vals: 107 | channel_defaults = [0, 0, 0, 255] 108 | else: 109 | channel_defaults = [0., 0., 0., 1.] 
110 | 111 | for i in xrange(4): 112 | if X[i] is None: 113 | # if channel is None, fill it with zeros of the correct 114 | # dtype 115 | dt = out_array.dtype 116 | if output_pixel_vals: 117 | dt = 'uint8' 118 | out_array[:, :, i] = np.zeros(out_shape, 119 | dtype=dt) + channel_defaults[i] 120 | else: 121 | # use a recurrent call to compute the channel and store it 122 | # in the output 123 | out_array[:, :, i] = tile_raster_images( 124 | X[i], img_shape, tile_shape, tile_spacing, 125 | scale_rows_to_unit_interval, output_pixel_vals) 126 | return out_array 127 | else: 128 | # if we are dealing with only one channel 129 | H, W = img_shape 130 | Hs, Ws = tile_spacing 131 | # generate a matrix to store the output 132 | dt = X.dtype 133 | if output_pixel_vals: 134 | dt = 'uint8' 135 | out_array = np.zeros(out_shape, dtype=dt) 136 | for tile_row in xrange(tile_shape[0]): 137 | for tile_col in xrange(tile_shape[1]): 138 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 139 | this_x = X[tile_row * tile_shape[1] + tile_col] 140 | if scale_rows_to_unit_interval: 141 | # if we should scale values to be between 0 and 1 142 | # do this by calling the `scale_to_unit_interval` 143 | # function 144 | this_img = scale_to_unit_interval( 145 | this_x.reshape(img_shape)) 146 | else: 147 | this_img = this_x.reshape(img_shape) 148 | # add the slice to the corresponding position in the 149 | # output array 150 | c = 1 151 | if output_pixel_vals: 152 | c = 255 153 | out_array[ 154 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 155 | tile_col * (W + Ws): tile_col * (W + Ws) + W 156 | ] = this_img * c 157 | return out_array 158 | 159 | 160 | def plot_histograms(firings): 161 | N = int(np.ceil(np.sqrt(firings.shape[1]))) 162 | plt.figure(figsize=(N,N)) 163 | axisNum = 0 164 | for row in range(N): 165 | for col in range(N): 166 | axisNum += 1 167 | ax = plt.subplot(N, N, axisNum) 168 | ax.set_xticklabels([]) 169 | ax.set_yticklabels([]) 170 | plt.hist(firings[:,row*N+col],bins=50) 171 | plt.show() 172 | 173 | def visualize(MLP,layer_idx,file_name): 174 | 175 | W = MLP.layers[layer_idx].W.get_value(borrow=True).T 176 | 177 | size = int(np.sqrt(W.shape[1])) 178 | 179 | # hist(W.flatten(),bins=50) 180 | image = PIL.Image.fromarray(tile_raster_images(X=W, 181 | img_shape=(size, size), tile_shape=(10,W.shape[0]/10),tile_spacing=(1, 1))) 182 | image.save(file_name) 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /generalized_ear/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Philip Bachman 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | 21 | 22 | ******************************************************************************** 23 | * The copyright notice below comes from code which has been _heavily_ modified * 24 | * in the production of the code in this directory. * 25 | ******************************************************************************** 26 | 27 | 28 | Copyright (C) 2012 Misha Denil 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy of 31 | this software and associated documentation files (the "Software"), to deal in 32 | the Software without restriction, including without limitation the rights to 33 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 34 | of the Software, and to permit persons to whom the Software is furnished to do 35 | so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in all 38 | copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 46 | SOFTWARE. 47 | 48 | -------------------------------------------------------------------------------- /generalized_ear/load_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cPickle 3 | import gzip 4 | import os 5 | import sys 6 | 7 | import theano 8 | import theano.tensor as T 9 | 10 | def _shared_dataset(data_xy): 11 | """ Function that loads the dataset into shared variables 12 | 13 | The reason we store our dataset in shared variables is to allow 14 | Theano to copy it into the GPU memory (when code is run on GPU). 15 | Since copying data into the GPU is slow, copying a minibatch everytime 16 | is needed (the default behaviour if the data is not in a shared 17 | variable) would lead to a large decrease in performance. 18 | """ 19 | data_x, data_y = data_xy 20 | shared_x = theano.shared(np.asarray(data_x, 21 | dtype=theano.config.floatX)) 22 | shared_y = theano.shared(np.asarray(data_y, 23 | dtype=theano.config.floatX)) 24 | # When storing data on the GPU it has to be stored as floats 25 | # therefore we will store the labels as ``floatX`` as well 26 | # (``shared_y`` does exactly that). 
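    # Note (an assumption about downstream use, not shown in this file): since
    # the labels come back as floatX, code that indexes with them -- e.g. the
    # loss/error danglers in output_losses.py, which raise unless y has an
    # integer dtype -- would typically cast them first, along the lines of:
    #   y_int = T.cast(shared_y, 'int32')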
27 | return shared_x, shared_y 28 | 29 | def load_mnist(path, zero_mean=True): 30 | mnist = np.load(path) 31 | train_set_x = mnist['train_data'] 32 | train_set_y = mnist['train_labels'] + 1 33 | test_set_x = mnist['test_data'] 34 | test_set_y = mnist['test_labels'] + 1 35 | 36 | if zero_mean: 37 | obs_mean = np.mean(train_set_x, axis=0, keepdims=True) 38 | train_set_x = train_set_x - obs_mean 39 | test_set_x = test_set_x - obs_mean 40 | 41 | train_set_x, train_set_y = _shared_dataset((train_set_x, train_set_y)) 42 | test_set_x, test_set_y = _shared_dataset((test_set_x, test_set_y)) 43 | valid_set_x, valid_set_y = test_set_x, test_set_y 44 | 45 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 46 | (test_set_x, test_set_y)] 47 | return rval 48 | 49 | def load_udm_ss(dataset, sup_count, rng, zero_mean=True): 50 | """Load semi-supervised version of the standard UdM MNIST data. 51 | 52 | For this, the training data is split into labeled and unlabeled portions. 53 | The number of labeled examples is 'sup_count', and an equal number of 54 | labeled examples will be selected for each class. The remaining (50000 - 55 | sup_count) examples are provided as unlabeled training data. The validate 56 | and test sets are left unchanged. 57 | 58 | Note: labels for the normal digit classes will range from 1-10, i.e. +1 59 | compared to their standard value, as 'un-classed' examples take label 0. 60 | """ 61 | 62 | udm_data = load_udm(dataset, as_shared=False, zero_mean=zero_mean) 63 | Xtr = udm_data[0][0] 64 | Ytr = udm_data[0][1][:,np.newaxis] 65 | 66 | all_count = Xtr.shape[0] 67 | pc_count = int(np.ceil(sup_count / 10.0)) 68 | sup_count = int(10 * pc_count) 69 | unsup_count = all_count - sup_count 70 | 71 | Xtr_su = [] 72 | Ytr_su = [] 73 | Xtr_un = [] 74 | Ytr_un = [] 75 | 76 | # Sample supervised and unsupervised subsets of each class' observations 77 | for c_label in np.unique(Ytr): 78 | c_idx = [i for i in range(all_count) if (Ytr[i] == c_label)] 79 | rng.shuffle(c_idx) 80 | Xtr_su.append(Xtr[c_idx[0:pc_count],:]) 81 | Ytr_su.append(Ytr[c_idx[0:pc_count],:]) 82 | Xtr_un.append(Xtr[c_idx[pc_count:],:]) 83 | Ytr_un.append(Ytr[c_idx[pc_count:],:]) 84 | 85 | # Stack per-class supervised/unsupervised splits into matrices 86 | Xtr_su = np.vstack(Xtr_su) 87 | Ytr_su = np.vstack(Ytr_su) 88 | Xtr_un = np.vstack(Xtr_un) 89 | Ytr_un = np.vstack(Ytr_un) 90 | # Also keep "unsupervised" copies of the "supervised" data 91 | Xtr_un = Xtr_un #np.vstack([Xtr_un, Xtr_su]) 92 | Ytr_un = 0 * Ytr_un #np.vstack([Ytr_un, Ytr_su]) 93 | 94 | # Shuffle the rows so that observations are not grouped by class 95 | shuf_idx = rng.permutation(Xtr_su.shape[0]) 96 | Xtr_su = Xtr_su[shuf_idx,:] 97 | Ytr_su = Ytr_su[shuf_idx].ravel() + 1 98 | shuf_idx = rng.permutation(Xtr_un.shape[0]) 99 | Xtr_un = Xtr_un[shuf_idx,:] 100 | Ytr_un = Ytr_un[shuf_idx].ravel() 101 | 102 | # Put matrices into GPU shared variables, for great justice 103 | Xtr_su, Ytr_su = _shared_dataset((Xtr_su, Ytr_su)) 104 | Xtr_un, Ytr_un = _shared_dataset((Xtr_un, Ytr_un)) 105 | Xva, Yva = _shared_dataset((udm_data[1][0], (udm_data[1][1] + 1))) 106 | Xte, Yte = _shared_dataset((udm_data[2][0], (udm_data[2][1] + 1))) 107 | 108 | rval = [(Xtr_su, Ytr_su), (Xtr_un, Ytr_un), (Xva, Yva), (Xte, Yte)] 109 | 110 | return rval 111 | 112 | def load_udm(dataset, as_shared=True, zero_mean=True): 113 | """Loads the UdM train/validate/test split of MNIST.""" 114 | 115 | ############# 116 | # LOAD DATA # 117 | ############# 118 | 119 | # Download the MNIST dataset if 
it is not present 120 | data_dir, data_file = os.path.split(dataset) 121 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 122 | import urllib 123 | origin = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 124 | print 'Downloading data from %s' % origin 125 | urllib.urlretrieve(origin, dataset) 126 | 127 | print '... loading data' 128 | 129 | # Load the dataset 130 | f = gzip.open(dataset, 'rb') 131 | train_set, valid_set, test_set = cPickle.load(f) 132 | f.close() 133 | #train_set, valid_set, test_set format: tuple(input, target) 134 | #input is an np.ndarray of 2 dimensions (a matrix) 135 | #witch row's correspond to an example. target is a 136 | #np.ndarray of 1 dimensions (vector)) that have the same length as 137 | #the number of rows in the input. It should give the target 138 | #target to the example with the same index in the input. 139 | train_set = [v for v in train_set] 140 | valid_set = [v for v in valid_set] 141 | test_set = [v for v in test_set] 142 | train_set[0] = np.asarray(train_set[0]).astype(np.float32) 143 | valid_set[0] = np.asarray(valid_set[0]).astype(np.float32) 144 | test_set[0] = np.asarray(test_set[0]).astype(np.float32) 145 | if zero_mean: 146 | obs_mean = np.mean(train_set[0], axis=0, keepdims=True) 147 | train_set[0] = train_set[0] - obs_mean 148 | valid_set[0] = valid_set[0] - obs_mean 149 | test_set[0] = test_set[0] - obs_mean 150 | if as_shared: 151 | test_set_x, test_set_y = _shared_dataset((test_set[0],test_set[1]+1)) 152 | valid_set_x, valid_set_y = _shared_dataset((valid_set[0],valid_set[1]+1)) 153 | train_set_x, train_set_y = _shared_dataset((train_set[0],train_set[1]+1)) 154 | else: 155 | test_set_x, test_set_y = test_set 156 | valid_set_x, valid_set_y = valid_set 157 | train_set_x, train_set_y = train_set 158 | 159 | rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), 160 | (test_set_x, test_set_y)] 161 | return rval 162 | 163 | -------------------------------------------------------------------------------- /generalized_ear/output_losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | class LogisticRegression(object): 6 | """Multi-class Logistic Regression loss dangler.""" 7 | 8 | def __init__(self, linear_layer): 9 | """Dangle a logistic regression from the given linear layer. 10 | 11 | The given linear layer should be a HiddenLayer (or subclass) object, 12 | for HiddenLayer as defined in LayerNet.py.""" 13 | self.input_layer = linear_layer 14 | 15 | def loss_func(self, y): 16 | """Return the multiclass logistic regression loss for y. 17 | 18 | The class labels in y are assumed to be in correspondence with the 19 | set of column indices for self.input_layer.linear_output. 20 | """ 21 | p_y_given_x = T.nnet.softmax(self.input_layer.linear_output) 22 | loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]),y]) 23 | return loss 24 | 25 | def errors(self, y): 26 | """Compute the number of wrong predictions by self.input_layer. 27 | 28 | Predicted class labels are computed as the indices of the columns of 29 | self.input_layer.linear_output which are maximal. Wrong predictions are 30 | those for which max indices do not match their corresponding y values. 
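
Editor's note: the loss above is the standard multiclass negative log-likelihood: softmax the linear-layer outputs, then take the mean of -log of the probability assigned to each row's true class. A small NumPy sketch of the same computation (toy values, illustrative names only), using the same max-subtraction stabilization trick that safe_softmax_ss uses further below:

import numpy as np

def softmax_nll(linear_out, y):
    # row-wise softmax (shifted by the row max for numerical stability)
    e = np.exp(linear_out - linear_out.max(axis=1, keepdims=True))
    p_y_given_x = e / e.sum(axis=1, keepdims=True)
    # mean negative log-probability of each row's true class
    return -np.mean(np.log(p_y_given_x[np.arange(y.shape[0]), y]))

scores = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.2,  3.0]])
labels = np.array([0, 2])
print(softmax_nll(scores, labels))
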
31 | """ 32 | # Compute class memberships predicted by self.input_layer 33 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 34 | errs = 0 35 | # check if y has same dimension of y_pred 36 | if y.ndim != y_pred.ndim: 37 | raise TypeError('y should have the same shape as self.y_pred', 38 | ('y', y.type, 'y_pred', y_pred.type)) 39 | # check if y is of the correct datatype 40 | if y.dtype.startswith('int'): 41 | # the T.neq operator returns a vector of 0s and 1s, where 1 42 | # represents a mistake in prediction 43 | errs = T.sum(T.neq(y_pred, y)) 44 | else: 45 | raise NotImplementedError() 46 | return errs 47 | 48 | class LogRegSS(object): 49 | """Multi-class semi-supervised Logistic Regression loss dangler.""" 50 | 51 | def __init__(self, linear_layer): 52 | """Dangle a logistic regression from the given linear layer. 53 | 54 | The given linear layer should be a HiddenLayer (or subclass) object, 55 | for HiddenLayer as defined in LayerNet.py.""" 56 | self.input_layer = linear_layer 57 | 58 | def safe_softmax_ss(self, x): 59 | """Softmax that shouldn't overflow.""" 60 | e_x = T.exp(x - T.max(x, axis=1, keepdims=True)) 61 | x_sm = e_x / T.sum(e_x, axis=1, keepdims=True) 62 | return x_sm 63 | 64 | def loss_func(self, y): 65 | """Return the multiclass logistic regression loss for y. 66 | 67 | The class labels in y are assumed to be in correspondence with the 68 | set of column indices for self.input_layer.linear_output. 69 | """ 70 | row_idx = T.arange(y.shape[0]) 71 | row_mask = T.neq(y, 0).reshape((y.shape[0], 1)) 72 | p_y_given_x = self.safe_softmax_ss(self.input_layer.linear_output) 73 | wacky_mat = (p_y_given_x * row_mask) + (1. - row_mask) 74 | loss = -T.sum(T.log(wacky_mat[row_idx,y])) / T.sum(row_mask) 75 | return loss 76 | 77 | def errors(self, y): 78 | """Compute the number of wrong predictions by self.input_layer. 79 | 80 | Predicted class labels are computed as the indices of the columns of 81 | self.input_layer.linear_output which are maximal. Wrong predictions are 82 | those for which max indices do not match their corresponding y values. 83 | """ 84 | # Compute class memberships predicted by self.input_layer 85 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 86 | y_pred = y_pred + 1 87 | errs = 0 88 | # check if y has same dimension of y_pred 89 | if y.ndim != y_pred.ndim: 90 | raise TypeError('y should have the same shape as self.y_pred', 91 | ('y', y.type, 'y_pred', y_pred.type)) 92 | # check if y is of the correct datatype 93 | if y.dtype.startswith('int'): 94 | # the T.neq operator returns a vector of 0s and 1s, where 1 95 | # represents a mistake in prediction 96 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 97 | else: 98 | raise NotImplementedError() 99 | return errs 100 | 101 | class MCL2Hinge(object): 102 | """Multi-class one-vs-all L2 hinge loss dangler.""" 103 | 104 | def __init__(self, linear_layer): 105 | """Dangle a squred hinge loss from the given linear layer. 106 | 107 | The given linear layer should be a HiddenLayer (or subclass) object, 108 | for HiddenLayer as defined in LayerNet.py.""" 109 | self.input_layer = linear_layer 110 | 111 | def loss_func(self, y): 112 | """Return the multiclass squared hinge loss for y. 113 | 114 | The class labels in y are assumed to be in correspondence with the 115 | set of column indices for self.input_layer.linear_output. 
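
Editor's note: the row_mask / wacky_mat construction above is what lets unlabeled rows (label 0) contribute exactly zero loss: for masked rows every probability is replaced by 1, so the picked log-probability is 0, and the sum is normalized by the number of labeled rows only. A NumPy sketch with toy numbers (names are illustrative, not from this repo):

import numpy as np

def masked_nll(p_y_given_x, y):
    # y == 0 marks unlabeled rows; they must contribute zero loss
    row_mask = (y != 0).astype(float).reshape(-1, 1)
    # masked rows are replaced by 1.0 so their log-prob is exactly 0
    masked_p = p_y_given_x * row_mask + (1.0 - row_mask)
    picked = masked_p[np.arange(y.shape[0]), y]
    return -np.sum(np.log(picked)) / row_mask.sum()

p = np.array([[0.7, 0.2, 0.1],
              [0.1, 0.8, 0.1],
              [0.3, 0.3, 0.4]])
y = np.array([1, 0, 2])   # second row is "unlabeled" (label 0)
print(masked_nll(p, y))   # only rows 0 and 2 contribute
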
116 | """ 117 | y_hat = self.input_layer.linear_output 118 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) 119 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) 120 | obs_idx = T.arange(y.shape[0]) 121 | loss_pos = T.sum(margin_pos[obs_idx,y]**2.0) 122 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[obs_idx,y]**2.0) 123 | loss = (loss_pos + loss_neg) / y.shape[0] 124 | return loss 125 | 126 | def errors(self, y): 127 | """Compute the number of wrong predictions by self.input_layer. 128 | 129 | Predicted class labels are computed as the indices of the columns of 130 | self.input_layer.linear_output which are maximal. Wrong predictions are 131 | those for which max indices do not match their corresponding y values. 132 | """ 133 | # Compute class memberships predicted by self.input_layer 134 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 135 | errs = 0 136 | # check if y has same dimension of y_pred 137 | if y.ndim != y_pred.ndim: 138 | raise TypeError('y should have the same shape as self.y_pred', 139 | ('y', y.type, 'y_pred', y_pred.type)) 140 | # check if y is of the correct datatype 141 | if y.dtype.startswith('int'): 142 | # the T.neq operator returns a vector of 0s and 1s, where 1 143 | # represents a mistake in prediction 144 | errs = T.sum(T.neq(y_pred, y)) 145 | else: 146 | raise NotImplementedError() 147 | return errs 148 | 149 | class MCL2HingeSS(object): 150 | """Multi-class one-vs-all L2 hinge loss dangler. 151 | 152 | For this loss, class index 0 is never penalized, and errors for inputs 153 | with class index 0 are similarly ignored. This is for semi-supervised 154 | training, constrained by Theano's programming model.""" 155 | 156 | def __init__(self, linear_layer): 157 | """Dangle a squred hinge loss from the given linear layer. 158 | 159 | The given linear layer should be a HiddenLayer (or subclass) object, 160 | for HiddenLayer as defined in LayerNet.py.""" 161 | self.input_layer = linear_layer 162 | 163 | def loss_func(self, y): 164 | """Return the multiclass squared hinge loss for y. 165 | 166 | The class labels in y are assumed to be in correspondence with the 167 | set of column indices for self.input_layer.linear_output. 168 | """ 169 | y_hat = self.input_layer.linear_output 170 | row_idx = T.arange(y.shape[0]) 171 | row_mask = T.neq(y, 0).reshape((y_hat.shape[0], 1)) 172 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) * row_mask 173 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) * row_mask 174 | loss_pos = T.sum(margin_pos[row_idx,y]**2.0) 175 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[row_idx,y]**2.0) 176 | loss = (loss_pos + loss_neg) / T.sum(row_mask) 177 | return loss 178 | 179 | def errors(self, y): 180 | """Compute the number of wrong predictions by self.input_layer. 181 | 182 | Predicted class labels are computed as the indices of the columns of 183 | self.input_layer.linear_output which are maximal. Wrong predictions are 184 | those for which max indices do not match their corresponding y values. 
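
Editor's note: MCL2Hinge above is a one-vs-all squared hinge: the true class is pushed above +1 (margin_pos) and every other class below -1 (margin_neg), with the true-class column removed from the negative term. A NumPy sketch of the unmasked version (toy values, illustrative names):

import numpy as np

def ova_l2_hinge(y_hat, y):
    # y_hat: (n, k) linear outputs; y: (n,) integer class labels
    n = y.shape[0]
    rows = np.arange(n)
    margin_pos = np.maximum(0.0, 1.0 - y_hat)   # penalty for the true class
    margin_neg = np.maximum(0.0, 1.0 + y_hat)   # penalty for all other classes
    loss_pos = np.sum(margin_pos[rows, y] ** 2)
    loss_neg = np.sum(margin_neg ** 2) - np.sum(margin_neg[rows, y] ** 2)
    return (loss_pos + loss_neg) / float(n)

y_hat = np.array([[ 1.5, -0.8, -2.0],
                  [-0.2,  0.9, -1.1]])
y = np.array([0, 1])
print(ova_l2_hinge(y_hat, y))
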
185 | """ 186 | # Compute class memberships predicted by self.input_layer 187 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 188 | y_pred = y_pred + 1 189 | errs = 0 190 | # check if y has same dimension of y_pred 191 | if y.ndim != y_pred.ndim: 192 | raise TypeError('y should have the same shape as self.y_pred', 193 | ('y', y.type, 'y_pred', y_pred.type)) 194 | # check if y is of the correct datatype 195 | if y.dtype.startswith('int'): 196 | # the T.neq operator returns a vector of 0s and 1s, where 1 197 | # represents a mistake in prediction 198 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 199 | else: 200 | raise NotImplementedError() 201 | return errs 202 | -------------------------------------------------------------------------------- /generalized_ear/utils.py: -------------------------------------------------------------------------------- 1 | """ This file contains different utility functions that are not connected 2 | in anyway to the networks presented in the tutorials, but rather help in 3 | processing the outputs into a more understandable way. 4 | 5 | For example ``tile_raster_images`` helps in generating a easy to grasp 6 | image from a set of samples or weights. 7 | """ 8 | 9 | import numpy as np 10 | import pylab as plt 11 | import PIL as PIL 12 | 13 | class batch(object): 14 | def __init__(self,batch_size): 15 | self.batch_size = batch_size 16 | 17 | def __call__(self,f): 18 | def wrapper(t,X): 19 | X = np.array(X) 20 | p = 0 21 | rem = 0 22 | results = [] 23 | while p < len(X): 24 | Z = X[p:p+self.batch_size] 25 | if Z.shape[0] != self.batch_size: 26 | zeros = np.zeros((self.batch_size-len(Z),X.shape[1])) 27 | rem = len(Z) 28 | Z = np.array(np.vstack((Z,zeros)),dtype=X.dtype) 29 | 30 | temp_results = f(t,Z) 31 | if rem != 0: 32 | temp_results = temp_results[:rem] 33 | 34 | results.extend(temp_results) 35 | p += self.batch_size 36 | return np.array(results,dtype='float32') 37 | return wrapper 38 | 39 | def scale_to_unit_interval(ndar, eps=1e-8): 40 | """ Scales all values in the ndarray ndar to be between 0 and 1 """ 41 | ndar = ndar.copy() 42 | ndar -= ndar.min() 43 | ndar *= 1.0 / (ndar.max() + eps) 44 | return ndar 45 | 46 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0), 47 | scale_rows_to_unit_interval=True, 48 | output_pixel_vals=True): 49 | """ 50 | Transform an array with one flattened image per row, into an array in 51 | which images are reshaped and layed out like tiles on a floor. 52 | 53 | This function is useful for visualizing datasets whose rows are images, 54 | and also columns of matrices for transforming those rows 55 | (such as the first layer of a neural net). 56 | 57 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can 58 | be 2-D ndarrays or None; 59 | :param X: a 2-D array in which every row is a flattened image. 60 | 61 | :type img_shape: tuple; (height, width) 62 | :param img_shape: the original shape of each image 63 | 64 | :type tile_shape: tuple; (rows, cols) 65 | :param tile_shape: the number of images to tile (rows, cols) 66 | 67 | :param output_pixel_vals: if output should be pixel values (i.e. int8 68 | values) or floats 69 | 70 | :param scale_rows_to_unit_interval: if the values need to be scaled before 71 | being plotted to [0,1] or not 72 | 73 | 74 | :returns: array suitable for viewing as an image. 75 | (See:`PIL.Image.fromarray`.) 76 | :rtype: a 2-d array with same dtype as X. 
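
Editor's note: for a concrete sense of the output size computed just below, each output dimension is (image_size + spacing) * n_tiles - spacing; for example, 28x28 MNIST digits in a 10x10 grid with 1-pixel spacing give a 289x289 canvas:

img_shape, tile_shape, tile_spacing = (28, 28), (10, 10), (1, 1)
out_shape = [(ishp + tsp) * tshp - tsp
             for ishp, tshp, tsp in zip(img_shape, tile_shape, tile_spacing)]
print(out_shape)   # [289, 289]
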
77 | 78 | """ 79 | 80 | assert len(img_shape) == 2 81 | assert len(tile_shape) == 2 82 | assert len(tile_spacing) == 2 83 | 84 | # The expression below can be re-written in a more C style as 85 | # follows : 86 | # 87 | # out_shape = [0,0] 88 | # out_shape[0] = (img_shape[0]+tile_spacing[0])*tile_shape[0] - 89 | # tile_spacing[0] 90 | # out_shape[1] = (img_shape[1]+tile_spacing[1])*tile_shape[1] - 91 | # tile_spacing[1] 92 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp 93 | in zip(img_shape, tile_shape, tile_spacing)] 94 | 95 | if isinstance(X, tuple): 96 | assert len(X) == 4 97 | # Create an output numpy ndarray to store the image 98 | if output_pixel_vals: 99 | out_array = np.zeros((out_shape[0], out_shape[1], 4), 100 | dtype='uint8') 101 | else: 102 | out_array = np.zeros((out_shape[0], out_shape[1], 4), 103 | dtype=X.dtype) 104 | 105 | #colors default to 0, alpha defaults to 1 (opaque) 106 | if output_pixel_vals: 107 | channel_defaults = [0, 0, 0, 255] 108 | else: 109 | channel_defaults = [0., 0., 0., 1.] 110 | 111 | for i in xrange(4): 112 | if X[i] is None: 113 | # if channel is None, fill it with zeros of the correct 114 | # dtype 115 | dt = out_array.dtype 116 | if output_pixel_vals: 117 | dt = 'uint8' 118 | out_array[:, :, i] = np.zeros(out_shape, 119 | dtype=dt) + channel_defaults[i] 120 | else: 121 | # use a recurrent call to compute the channel and store it 122 | # in the output 123 | out_array[:, :, i] = tile_raster_images( 124 | X[i], img_shape, tile_shape, tile_spacing, 125 | scale_rows_to_unit_interval, output_pixel_vals) 126 | return out_array 127 | else: 128 | # if we are dealing with only one channel 129 | H, W = img_shape 130 | Hs, Ws = tile_spacing 131 | # generate a matrix to store the output 132 | dt = X.dtype 133 | if output_pixel_vals: 134 | dt = 'uint8' 135 | out_array = np.zeros(out_shape, dtype=dt) 136 | for tile_row in xrange(tile_shape[0]): 137 | for tile_col in xrange(tile_shape[1]): 138 | if tile_row * tile_shape[1] + tile_col < X.shape[0]: 139 | this_x = X[tile_row * tile_shape[1] + tile_col] 140 | if scale_rows_to_unit_interval: 141 | # if we should scale values to be between 0 and 1 142 | # do this by calling the `scale_to_unit_interval` 143 | # function 144 | this_img = scale_to_unit_interval( 145 | this_x.reshape(img_shape)) 146 | else: 147 | this_img = this_x.reshape(img_shape) 148 | # add the slice to the corresponding position in the 149 | # output array 150 | c = 1 151 | if output_pixel_vals: 152 | c = 255 153 | out_array[ 154 | tile_row * (H + Hs): tile_row * (H + Hs) + H, 155 | tile_col * (W + Ws): tile_col * (W + Ws) + W 156 | ] = this_img * c 157 | return out_array 158 | 159 | 160 | def plot_histograms(firings): 161 | N = int(np.ceil(np.sqrt(firings.shape[1]))) 162 | plt.figure(figsize=(N,N)) 163 | axisNum = 0 164 | for row in range(N): 165 | for col in range(N): 166 | axisNum += 1 167 | ax = plt.subplot(N, N, axisNum) 168 | ax.set_xticklabels([]) 169 | ax.set_yticklabels([]) 170 | plt.hist(firings[:,row*N+col],bins=50) 171 | plt.show() 172 | return 173 | 174 | def visualize(EN, proto_key, layer_num, file_name): 175 | W = EN.proto_nets[proto_key][layer_num].W.get_value(borrow=True).T 176 | size = int(np.sqrt(W.shape[1])) 177 | # hist(W.flatten(),bins=50) 178 | image = PIL.Image.fromarray(tile_raster_images(X=W, \ 179 | img_shape=(size, size), tile_shape=(10,W.shape[0]/10),tile_spacing=(1, 1))) 180 | image.save(file_name) 181 | return 182 | 183 | def visualize_samples(X_samp, file_name): 184 | d = int(np.sqrt(X_samp.shape[1])) 185 | 
# hist(W.flatten(),bins=50) 186 | image = PIL.Image.fromarray(tile_raster_images(X=X_samp, img_shape=(d, d), \ 187 | tile_shape=(10,X_samp.shape[0]/10),tile_spacing=(1, 1))) 188 | image.save(file_name) 189 | return 190 | 191 | 192 | 193 | -------------------------------------------------------------------------------- /generative_models/DKCode.py: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | # Code adapted from Durk Kingma's Github repository: "nips14-ssl" # 3 | ################################################################### 4 | 5 | from collections import OrderedDict 6 | import numpy as np 7 | import theano as theano 8 | import theano.tensor as T 9 | from theano.ifelse import ifelse 10 | 11 | # Pre-processing routines 12 | 13 | def PCA_theano(x_in, cutoff=0.99, global_sd=True): 14 | """ 15 | Given input matrix x_in in numpy form, compute transform functions for 16 | reducing the dimensionality of inputs. Make the transform functions and 17 | all their parameters based around theano shared variables, for GPU use. 18 | """ 19 | x_center = x_in.mean(axis=0) 20 | x = x_in - x_center 21 | if not global_sd: 22 | x_sd = x.std(axis=0) + 1e-5 23 | else: 24 | x_sd = x.std() + 1e-5 25 | # normalize to either unit standard deviation "globally" or 26 | # per-feature 27 | x = x / x_sd 28 | # compute covariance matrix and its eigen-decomposition 29 | print "Performing eigen-decomposition for PCA..." 30 | x_cov = np.dot(x.T, x) / x.shape[0] 31 | eigval, eigvec = np.linalg.eig(x_cov) 32 | # 33 | #eigval = np.ones(eigval.shape) 34 | # 35 | print "Done." 36 | if cutoff <= 1: 37 | # pick the number of dimensions to keep based on recovered variance 38 | n_used = ((eigval.cumsum() / eigval.sum()) < cutoff).sum() 39 | print 'PCA cutoff:', cutoff, 'n_used:', n_used 40 | else: 41 | # pick the number of dimensions to keep by user-provided value 42 | n_used = int(cutoff) 43 | eigval = eigval[:n_used].reshape((n_used,)) 44 | eigvec = eigvec[:,:n_used] 45 | # construct functions for applying PCA 46 | f_enc, f_dec, pca_shared_params = \ 47 | PCA_encdec_theano(eigvec, eigval, x_center, x_sd) 48 | pca_shared_params['pca_dim'] = n_used 49 | return f_enc, f_dec, pca_shared_params 50 | 51 | def PCA_encdec_theano(eigvec, eigval, x_mean, x_sd): 52 | """ 53 | Construct PCA encoder/decoder functions based around Theano shared 54 | variables. Return the function handles and a dict containing the relevant 55 | shared variables (well, symbolic references to them, at least). 
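
Editor's note: a NumPy-only sketch of the same whitening transform may help here; np.linalg.eigh with explicit sorting stands in for the unsorted np.linalg.eig call above, and the lambdas mirror the f_enc/f_dec construction described in this docstring (variable names are illustrative, not from this repo):

import numpy as np

def pca_whiten_fit(X, cutoff=0.99):
    # center and scale, as in PCA_theano with global_sd=True
    x_mean = X.mean(axis=0)
    x_sd = X.std() + 1e-5
    Xs = (X - x_mean) / x_sd
    C = np.dot(Xs.T, Xs) / Xs.shape[0]
    eigval, eigvec = np.linalg.eigh(C)          # covariance is symmetric, so eigh
    order = np.argsort(eigval)[::-1]            # sort by decreasing variance
    eigval, eigvec = eigval[order], eigvec[:, order]
    n_used = int(((np.cumsum(eigval) / np.sum(eigval)) < cutoff).sum())
    eigval, eigvec = eigval[:n_used], eigvec[:, :n_used]
    f_enc = lambda x: np.dot((x - x_mean) / x_sd, eigvec) / np.sqrt(eigval)
    f_dec = lambda z: np.dot(z * np.sqrt(eigval), eigvec.T) * x_sd + x_mean
    return f_enc, f_dec

X = np.random.randn(500, 20) * np.linspace(1, 5, 20)
f_enc, f_dec = pca_whiten_fit(X, cutoff=0.99)
Z = f_enc(X)
print(Z.shape, np.abs(f_dec(Z) - X).max())      # reduced dim and round-trip error
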
56 | """ 57 | # construct the shared variables to use in the encoder/decoder functions 58 | fx = theano.config.floatX 59 | eigval_shared = theano.shared(value=eigval.astype(fx), name='eigval') 60 | eigvec_shared = theano.shared(value=eigvec.astype(fx), name='eigvec') 61 | x_mean_shared = theano.shared(value=x_mean.astype(fx), name='x_mean') 62 | x_sd_shared = theano.shared(value=x_sd.astype(fx), name='x_sd') 63 | pca_shared_params = {'eigval': eigval_shared, 'eigvec': eigvec_shared, \ 64 | 'x_mean':x_mean_shared, 'x_sd':x_sd_shared} 65 | # construct the encoder/decoder functions using the shared variables 66 | def f_enc( x ): 67 | x_sands = (x - x_mean_shared) / x_sd_shared 68 | result = T.dot(x_sands, eigvec_shared) / T.sqrt(eigval_shared) 69 | return result 70 | def f_dec( x ): 71 | result = (T.dot((x * T.sqrt(eigval_shared)), eigvec_shared.T) * \ 72 | x_sd_shared) + x_mean_shared 73 | return result 74 | return f_enc, f_dec, pca_shared_params 75 | 76 | 77 | def norm_clip(dW, max_l2_norm=10.0): 78 | """ 79 | Clip theano symbolic var dW to have some max l2 norm. 80 | """ 81 | dW_l2_norm = T.sqrt(T.sum(dW**2.0)) 82 | norm_ratio = (max_l2_norm / dW_l2_norm) 83 | clip_factor = ifelse(T.lt(norm_ratio, 1.0), norm_ratio, 1.0) 84 | dW_clipped = dW * clip_factor 85 | return dW_clipped 86 | 87 | def get_adam_updates(params=None, grads=None, \ 88 | alpha=None, beta1=None, beta2=None, \ 89 | mom2_init=1e-3, smoothing=1e-6, max_grad_norm=10000.0): 90 | """ 91 | Get the Theano updates to perform ADAM optimization of the shared-var 92 | parameters in params, given the shaared-var gradients in grads. 93 | 94 | params should be an iterable containing "keyable" values, grads should be 95 | a dict containing the grads for all values in params, and the remaining 96 | arguments should be theano shared variable arrays. 97 | """ 98 | 99 | # make an OrderedDict to hold the updates 100 | updates = OrderedDict() 101 | 102 | for p in params: 103 | # initialize update the iteration counter 104 | zero_ary = np.zeros((1,)).astype(theano.config.floatX) 105 | it_count = theano.shared(value=zero_ary) 106 | it_count_new = it_count + 1. 107 | 108 | # apply a bias correction factor to the learning rate 109 | fix1 = 1. - beta1[0]**(it_count[0] + 1.) 110 | fix2 = 1. - beta2[0]**(it_count[0] + 1.) 111 | lr_t = alpha[0] * (T.sqrt(fix2) / fix1) 112 | 113 | # get gradient for parameter p 114 | grad_p = norm_clip(grads[p], max_grad_norm) 115 | 116 | # mean_squared_grad := E[g^2]_{t-1} 117 | mom1_ary = 0.0 * p.get_value(borrow=False) 118 | mom2_ary = (0.0 * p.get_value(borrow=False)) + mom2_init 119 | mom1 = theano.shared(mom1_ary) 120 | mom2 = theano.shared(mom2_ary) 121 | 122 | # update moments 123 | mom1_new = (beta1[0] * mom1) + ((1. - beta1[0]) * grad_p) 124 | mom2_new = (beta2[0] * mom2) + ((1. - beta2[0]) * T.sqr(grad_p)) 125 | 126 | # compute the effective gradient 127 | effgrad = mom1_new / (T.sqrt(mom2_new) + smoothing) 128 | 129 | # do update 130 | p_new = p - (lr_t * effgrad) 131 | 132 | # apply updates 133 | updates[p] = p_new 134 | updates[mom1] = mom1_new 135 | updates[mom2] = mom2_new 136 | updates[it_count] = it_count_new 137 | 138 | return updates 139 | 140 | def get_adadelta_updates(params=None, grads=None, \ 141 | alpha=None, beta1=None, max_grad_norm=10000.0): 142 | """ 143 | Get the Theano updates to perform AdaDelta optimization of the shared-var 144 | parameters in params, given the shaared-var gradients in grads. 
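
Editor's note: for reference, a plain NumPy sketch of one such ADAM step (plain floats stand in for the shared-variable arrays alpha/beta1/beta2 used above) makes the bias-correction terms fix1/fix2 and the "effective gradient" easier to see in isolation:

import numpy as np

def adam_step(p, grad, mom1, mom2, t, alpha=1e-3, beta1=0.9, beta2=0.999,
              smoothing=1e-6, max_grad_norm=10000.0):
    # clip the gradient to a maximum l2 norm, as norm_clip does above
    g_norm = np.sqrt(np.sum(grad ** 2))
    grad = grad * min(1.0, max_grad_norm / (g_norm + 1e-12))
    # bias-corrected learning rate, matching the fix1/fix2 terms above
    fix1 = 1.0 - beta1 ** (t + 1.0)
    fix2 = 1.0 - beta2 ** (t + 1.0)
    lr_t = alpha * np.sqrt(fix2) / fix1
    # update the first and second moment estimates
    mom1 = beta1 * mom1 + (1.0 - beta1) * grad
    mom2 = beta2 * mom2 + (1.0 - beta2) * grad ** 2
    # take the step using the "effective gradient"
    p = p - lr_t * mom1 / (np.sqrt(mom2) + smoothing)
    return p, mom1, mom2

p = np.array([1.0, -2.0])
m1 = np.zeros_like(p)
m2 = np.zeros_like(p) + 1e-3                  # mom2_init
for t in range(100):
    grad = 2.0 * p                            # gradient of ||p||^2
    p, m1, m2 = adam_step(p, grad, m1, m2, t, alpha=0.05)
print(p)                                      # should move toward the origin
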
145 | 146 | params should be an iterable containing "keyable" values, grads should be 147 | a dict containing the grads for all values in params, and the remaining 148 | arguments should be theano shared variable arrays. 149 | """ 150 | 151 | # make an OrderedDict to hold the updates 152 | updates = OrderedDict() 153 | lr_t = alpha[0] 154 | 155 | for p in params: 156 | # get gradient for parameter p 157 | grad_p = norm_clip(grads[p], max_grad_norm) 158 | 159 | # initialize squared gradient accumulator 160 | mom_ary = (0.0 * p.get_value(borrow=False)) + 1.0 161 | mom1 = theano.shared(mom_ary) 162 | 163 | # update moments 164 | mom1_new = (beta1[0] * mom1) + ((1. - beta1[0]) * T.sqr(grad_p)) 165 | 166 | # compute the effective gradient 167 | effgrad = grad_p / (T.sqrt(mom1_new) + 1e-6) 168 | 169 | # do update 170 | p_new = p - (lr_t * clipped_grad) 171 | 172 | # apply update 173 | updates[p] = p_new 174 | updates[mom1] = mom1_new 175 | 176 | return updates -------------------------------------------------------------------------------- /generative_models/HelperFuncs.py: -------------------------------------------------------------------------------- 1 | import time 2 | import utils as utils 3 | import numpy as np 4 | import numpy.random as npr 5 | import theano 6 | import theano.tensor as T 7 | 8 | ################################## 9 | # MISCELLANEOUS HELPER FUNCTIONS # 10 | ################################## 11 | 12 | def DCG(x): 13 | x_dcg = theano.gradient.disconnected_grad(x) 14 | return x_dcg 15 | 16 | def constFX(x): 17 | """Cast x as constant TensorVariable with dtype floatX.""" 18 | x_CFX = T.constant(x, dtype=theano.config.floatX) 19 | return x_CFX 20 | 21 | def to_fX(np_ary): 22 | np_ary_fX = np_ary.astype(theano.config.floatX) 23 | return np_ary_fX 24 | 25 | def posterior_klds(IN, Xtr, batch_size, batch_count): 26 | """ 27 | Get posterior KLd cost for some inputs from Xtr. 28 | """ 29 | post_klds = [] 30 | for i in range(batch_count): 31 | batch_idx = npr.randint(low=0, high=Xtr.shape[0], size=(batch_size,)) 32 | X = Xtr.take(batch_idx, axis=0) 33 | post_klds.extend([k for k in IN.kld_func(X)]) 34 | return post_klds 35 | 36 | def row_shuffle(X, Y=None): 37 | """ 38 | Return a copy of X with shuffled rows. 39 | """ 40 | shuf_idx = np.arange(X.shape[0]) 41 | npr.shuffle(shuf_idx) 42 | X_shuf = X[shuf_idx] 43 | if Y is None: 44 | result = X_shuf 45 | else: 46 | Y_shuf = Y[shuf_idx] 47 | result = [X_shuf, Y_shuf] 48 | return result 49 | 50 | ##################################### 51 | # HELPER FUNCTIONS FOR DATA MASKING # 52 | ##################################### 53 | 54 | def apply_mask(Xd=None, Xc=None, Xm=None): 55 | """ 56 | Apply a mask, like in the old days. 57 | """ 58 | X_masked = ((1.0 - Xm) * Xd) + (Xm * Xc) 59 | return X_masked 60 | 61 | def binarize_data(X): 62 | """ 63 | Make a sample of bernoulli variables with probabilities given by X. 64 | """ 65 | X_shape = X.shape 66 | probs = npr.rand(*X_shape) 67 | X_binary = 1.0 * (probs < X) 68 | return X_binary.astype(theano.config.floatX) 69 | 70 | def sample_masks(X, drop_prob=0.3): 71 | """ 72 | Sample a binary mask to apply to the matrix X, with rate mask_prob. 73 | """ 74 | probs = npr.rand(*X.shape) 75 | mask = 1.0 * (probs > drop_prob) 76 | return mask.astype(theano.config.floatX) 77 | 78 | def sample_patch_masks(X, im_shape, patch_shape): 79 | """ 80 | Sample a random patch mask for each image in X. 
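
Editor's notes, two small ones. First, in get_adadelta_updates above, the update line `p_new = p - (lr_t * clipped_grad)` references a name that is never defined in that function; from the surrounding code, the freshly computed effgrad is presumably what was intended. Second, the masking helpers here pair naturally: a Bernoulli keep-mask from sample_masks is combined with the data in apply_mask style, with dropped entries filled from a mean image (as construct_masked_data does further below). A NumPy sketch with toy shapes and illustrative names:

import numpy as np
rng = np.random.RandomState(0)

X = rng.rand(5, 16)                       # five flattened 4x4 "images"
data_mean = X.mean(axis=0, keepdims=True)

# sample_masks: keep an entry with probability (1 - drop_prob)
drop_prob = 0.3
mask = (rng.rand(*X.shape) > drop_prob).astype('float32')

# apply_mask-style fill: keep observed entries, replace the rest by the mean
X_masked = mask * X + (1.0 - mask) * data_mean
print(X_masked.shape, mask.mean())        # fraction kept is roughly 0.7
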
81 | """ 82 | obs_count = X.shape[0] 83 | rs = patch_shape[0] 84 | cs = patch_shape[1] 85 | off_row = npr.randint(1,high=(im_shape[0]-rs-1), size=(obs_count,)) 86 | off_col = npr.randint(1,high=(im_shape[1]-cs-1), size=(obs_count,)) 87 | dummy = np.zeros(im_shape) 88 | mask = np.zeros(X.shape) 89 | for i in range(obs_count): 90 | dummy = (0.0 * dummy) + 1.0 91 | dummy[off_row[i]:(off_row[i]+rs), off_col[i]:(off_col[i]+cs)] = 0.0 92 | mask[i,:] = dummy.ravel() 93 | return mask.astype(theano.config.floatX) 94 | 95 | def collect_obs_costs(batch_costs, batch_reps): 96 | """ 97 | Collect per-observation costs from a cost vector containing the cost for 98 | multiple repetitions of each observation. 99 | """ 100 | obs_count = int(batch_costs.shape[0] / batch_reps) 101 | obs_costs = np.zeros((obs_count,)) 102 | obs_idx = -1 103 | for i in range(batch_costs.shape[0]): 104 | if ((i % batch_reps) == 0): 105 | obs_idx = obs_idx + 1 106 | obs_costs[obs_idx] = obs_costs[obs_idx] + batch_costs[i] 107 | obs_costs = obs_costs / batch_reps 108 | return obs_costs 109 | 110 | def construct_masked_data(xi, \ 111 | drop_prob=0.0, \ 112 | occ_dim=None, \ 113 | data_mean=None): 114 | """ 115 | Construct randomly masked data from xi. 116 | """ 117 | if data_mean is None: 118 | data_mean = np.zeros((xi.shape[1],)) 119 | im_dim = int(xi.shape[1]**0.5) # images should be square 120 | xo = xi.copy() 121 | if drop_prob > 0.0: 122 | # apply fully-random occlusion 123 | xm_rand = sample_masks(xi, drop_prob=drop_prob) 124 | else: 125 | # don't apply fully-random occlusion 126 | xm_rand = np.ones(xi.shape) 127 | if occ_dim is None: 128 | # don't apply rectangular occlusion 129 | xm_patch = np.ones(xi.shape) 130 | else: 131 | # apply rectangular occlusion 132 | xm_patch = sample_patch_masks(xi, (im_dim,im_dim), (occ_dim,occ_dim)) 133 | xm = xm_rand * xm_patch 134 | xi = (xm * xi) + ((1.0 - xm) * data_mean) 135 | xi = to_fX(xi) 136 | xo = to_fX(xo) 137 | xm = to_fX(xm) 138 | return xi, xo, xm 139 | 140 | def shift_and_scale_into_01(X): 141 | X = X - np.min(X, axis=1, keepdims=True) 142 | X = X / np.max(X, axis=1, keepdims=True) 143 | return X -------------------------------------------------------------------------------- /generative_models/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2014 Philip Bachman 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | 21 | 22 | ******************************************************************************** 23 | * The copyright notice below comes from code which has been _heavily_ modified * 24 | * in the production of the code in this directory. * 25 | ******************************************************************************** 26 | 27 | 28 | Copyright (C) 2012 Misha Denil 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining a copy of 31 | this software and associated documentation files (the "Software"), to deal in 32 | the Software without restriction, including without limitation the rights to 33 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 34 | of the Software, and to permit persons to whom the Software is furnished to do 35 | so, subject to the following conditions: 36 | 37 | The above copyright notice and this permission notice shall be included in all 38 | copies or substantial portions of the Software. 39 | 40 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 41 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 42 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 43 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 44 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 45 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 46 | SOFTWARE. 47 | 48 | -------------------------------------------------------------------------------- /generative_models/LogPDFs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | # library with theano PDF functions 6 | PI = np.pi 7 | C = -0.5 * np.log(2*PI) 8 | 9 | def normal(x, mean, sd): 10 | return C - T.log(T.abs_(sd)) - ((x - mean)**2 / (2 * sd**2)) 11 | 12 | def normal2(x, mean, logvar): 13 | return C - logvar/2 - (x - mean)**2 / (2 * T.exp(logvar)) 14 | 15 | def laplace(x, mean, logvar): 16 | sd = T.exp(0.5 * logvar) 17 | return -(abs(x - mean) / sd) - (0.5 * logvar) - np.log(2) 18 | 19 | def standard_normal(x): 20 | return C - (x**2 / 2) 21 | 22 | # Centered laplace with unit scale (b=1) 23 | def standard_laplace(x): 24 | return np.log(0.5) - T.abs_(x) 25 | 26 | # Centered student-t distribution 27 | # v>0 is degrees of freedom 28 | # See: http://en.wikipedia.org/wiki/Student's_t-distribution 29 | def studentt(x, v): 30 | gamma1 = log_gamma_lanczos((v+1)/2.) 31 | gamma2 = log_gamma_lanczos(0.5*v) 32 | return gamma1 - 0.5 * T.log(v * PI) - gamma2 - (v+1)/2. * T.log(1 + (x*x)/v) 33 | 34 | ################################################################ 35 | # Funcs for temporary backwards compatibilit while refactoring # 36 | ################################################################ 37 | 38 | def log_prob_bernoulli(p_true, p_approx, mask=None): 39 | """ 40 | Compute log probability of some binary variables with probabilities 41 | given by p_true, for probability estimates given by p_approx. We'll 42 | compute joint log probabilities over row-wise groups. (Theano version). 
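
Editor's note: normal and normal2 above are the same Gaussian log-density under two parameterizations (standard deviation vs. log-variance); a quick NumPy check of the correspondence sd = exp(0.5 * logvar), mirroring the definitions above:

import numpy as np
C = -0.5 * np.log(2 * np.pi)

def normal_np(x, mean, sd):
    return C - np.log(np.abs(sd)) - (x - mean) ** 2 / (2 * sd ** 2)

def normal2_np(x, mean, logvar):
    return C - logvar / 2.0 - (x - mean) ** 2 / (2 * np.exp(logvar))

x, mean, logvar = 0.3, -0.1, np.log(0.25)      # variance 0.25, i.e. sd 0.5
print(normal_np(x, mean, np.exp(0.5 * logvar)))
print(normal2_np(x, mean, logvar))             # same value, different parameterization
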
43 | """ 44 | if mask is None: 45 | mask = T.ones((1, p_approx.shape[1])) 46 | log_prob_1 = p_true * T.log(p_approx+1e-6) 47 | log_prob_0 = (1.0 - p_true) * T.log((1.0 - p_approx)+1e-6) 48 | log_prob_01 = log_prob_1 + log_prob_0 49 | row_log_probs = T.sum((log_prob_01 * mask), axis=1, keepdims=True) 50 | return row_log_probs 51 | 52 | def log_prob_bernoulli_np(p_true, p_approx, mask=None): 53 | """ 54 | Compute log probability of some binary variables with probabilities 55 | given by p_true, for probability estimates given by p_approx. We'll 56 | compute joint log probabilities over row-wise groups. (Numpy version). 57 | """ 58 | if mask is None: 59 | mask = np.ones((1, p_approx.shape[1])) 60 | log_prob_1 = p_true * np.log(p_approx+1e-6) 61 | log_prob_0 = (1.0 - p_true) * np.log((1.0 - p_approx)+1e-6) 62 | log_prob_01 = log_prob_1 + log_prob_0 63 | row_log_probs = np.sum((log_prob_01 * mask), axis=1) 64 | return row_log_probs 65 | 66 | def log_prob_gaussian(mu_true, mu_approx, les_sigmas=1.0, mask=None): 67 | """ 68 | Compute log probability of some continuous variables with values given 69 | by mu_true, w.r.t. gaussian distributions with means given by mu_approx 70 | and standard deviations given by les_sigmas. 71 | """ 72 | if mask is None: 73 | mask = T.ones((1, mu_approx.shape[1])) 74 | ind_log_probs = C - T.log(T.abs_(les_sigmas)) - \ 75 | ((mu_true - mu_approx)**2.0 / (2.0 * les_sigmas**2.0)) 76 | row_log_probs = T.sum((ind_log_probs * mask), axis=1, keepdims=True) 77 | return row_log_probs 78 | 79 | def log_prob_gaussian2(mu_true, mu_approx, log_vars=1.0, mask=None): 80 | """ 81 | Compute log probability of some continuous variables with values given 82 | by mu_true, w.r.t. gaussian distributions with means given by mu_approx 83 | and log variances given by les_logvars. 84 | """ 85 | if mask is None: 86 | mask = T.ones((1, mu_approx.shape[1])) 87 | ind_log_probs = C - (0.5 * log_vars) - \ 88 | ((mu_true - mu_approx)**2.0 / (2.0 * T.exp(log_vars))) 89 | row_log_probs = T.sum((ind_log_probs * mask), axis=1, keepdims=True) 90 | return T.cast(row_log_probs, 'floatX') 91 | 92 | def gaussian_kld(mu_left, logvar_left, mu_right, logvar_right): 93 | """ 94 | Compute KL divergence between a bunch of univariate Gaussian distributions 95 | with the given means and log-variances. 96 | We do KL(N(mu_left, logvar_left) || N(mu_right, logvar_right)). 97 | """ 98 | gauss_klds = 0.5 * (logvar_right - logvar_left + \ 99 | (T.exp(logvar_left) / T.exp(logvar_right)) + \ 100 | ((mu_left - mu_right)**2.0 / T.exp(logvar_right)) - 1.0) 101 | return gauss_klds 102 | 103 | def gaussian_kld_BN(logvar_left, logvar_right): 104 | """ 105 | Compute KL divergence between a bunch of univariate Gaussian distributions 106 | with the given means and log-variances. 107 | We do KL(N(mu_left, logvar_left) || N(mu_right, logvar_right)). 108 | """ 109 | gauss_klds = 0.5 * (logvar_right - logvar_left + \ 110 | (T.exp(logvar_left) / T.exp(logvar_right)) + \ 111 | (1.0 / T.exp(logvar_right)) - 1.0) 112 | return gauss_klds 113 | 114 | ################################# 115 | # Log-gamma function for theano # 116 | ################################# 117 | LOG_PI = np.log(PI) 118 | LOG_SQRT_2PI = np.log(np.sqrt(2*PI)) 119 | def log_gamma_lanczos(z): 120 | # reflection formula. Normally only used for negative arguments, 121 | # but here it's also used for 0 < z < 0.5 to improve accuracy in this region. 
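
Editor's note: a NumPy transcription of gaussian_kld above is handy for sanity checks: the KL of a Gaussian against itself is zero, and against N(0, 1) it reduces to the familiar VAE prior penalty 0.5 * (mu^2 + exp(logvar) - logvar - 1):

import numpy as np

def gaussian_kld_np(mu_l, logvar_l, mu_r, logvar_r):
    # KL( N(mu_l, exp(logvar_l)) || N(mu_r, exp(logvar_r)) ), elementwise
    return 0.5 * (logvar_r - logvar_l
                  + np.exp(logvar_l) / np.exp(logvar_r)
                  + (mu_l - mu_r) ** 2 / np.exp(logvar_r)
                  - 1.0)

mu = np.array([0.3, -1.0])
lv = np.array([0.1, -0.2])
print(gaussian_kld_np(mu, lv, mu, lv))             # identical distributions -> 0
print(gaussian_kld_np(mu, lv, 0.0 * mu, 0.0 * lv)) # KL against the standard normal
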
122 | flip_z = 1 - z 123 | # because both paths are always executed (reflected and non-reflected), 124 | # the reflection formula causes trouble when the input argument is larger than one. 125 | # Note that for any z > 1, flip_z < 0. 126 | # To prevent these problems, we simply set all flip_z < 0 to a 'dummy' value. 127 | # This is not a problem, since these computations are useless anyway and 128 | # are discarded by the T.switch at the end of the function. 129 | flip_z = T.switch(flip_z < 0, 1, flip_z) 130 | small = LOG_PI - T.log(T.sin(PI * z)) - log_gamma_lanczos_sub(flip_z) 131 | big = log_gamma_lanczos_sub(z) 132 | return T.switch(z < 0.5, small, big) 133 | 134 | ## version that isn't vectorised, since g is small anyway 135 | def log_gamma_lanczos_sub(z): #expanded version 136 | # Coefficients used by the GNU Scientific Library 137 | g = 7 138 | p = np.array([0.99999999999980993, 676.5203681218851, -1259.1392167224028, 139 | 771.32342877765313, -176.61502916214059, 12.507343278686905, 140 | -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7]) 141 | z = z - 1 142 | x = p[0] 143 | for i in range(1, g+2): 144 | x += p[i]/(z+i) 145 | t = z + g + 0.5 146 | return LOG_SQRT_2PI + (z + 0.5) * T.log(t) - t + T.log(x) 147 | 148 | ############################ 149 | # PARZEN DENSITY ESTIMATOR # 150 | ############################ 151 | import time 152 | import gc 153 | 154 | def get_nll(x, parzen, batch_size=100): 155 | """ 156 | Credit: Yann N. Dauphin 157 | """ 158 | 159 | inds = range(x.shape[0]) 160 | n_batches = int(np.ceil(float(len(inds)) / batch_size)) 161 | 162 | times = [] 163 | nlls = [] 164 | for i in range(n_batches): 165 | begin = time.time() 166 | nll = parzen(x[inds[i::n_batches]]) 167 | end = time.time() 168 | times.append(end-begin) 169 | nlls.extend(nll) 170 | if i % 10 == 0: 171 | print i, np.mean(times), np.mean(nlls) 172 | return np.array(nlls) 173 | 174 | 175 | def log_mean_exp(a): 176 | """ 177 | Credit: Yann N. Dauphin 178 | """ 179 | max_ = a.max(1) 180 | result = max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 181 | return result 182 | 183 | def theano_parzen(mu, sigma): 184 | """ 185 | Credit: Yann N. Dauphin 186 | """ 187 | x = T.matrix() 188 | mu = theano.shared(mu) 189 | a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma 190 | E = log_mean_exp(-0.5*(a**2).sum(2)) 191 | Z = mu.shape[1] * T.log(sigma * np.sqrt(np.pi * 2)) 192 | parzen_func = theano.function([x], E - Z) 193 | return parzen_func 194 | 195 | def cross_validate_sigma(samples, data, sigmas, batch_size): 196 | """ 197 | Find which sigma is best for the Parzen estimator bound. 
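
Editor's note: the Lanczos approximation above can be spot-checked against the standard library; a small sketch using the same g=7 coefficients and the same reflection rule for arguments below 0.5:

from math import lgamma, log, sin, pi

# the same GSL coefficients used above
p = [0.99999999999980993, 676.5203681218851, -1259.1392167224028,
     771.32342877765313, -176.61502916214059, 12.507343278686905,
     -0.13857109526572012, 9.9843695780195716e-6, 1.5056327351493116e-7]
g = 7

def lanczos_lgamma(z):
    if z < 0.5:
        # reflection formula, as in log_gamma_lanczos above
        return log(pi) - log(sin(pi * z)) - lanczos_lgamma(1.0 - z)
    z = z - 1.0
    x = p[0]
    for i in range(1, g + 2):
        x += p[i] / (z + i)
    t = z + g + 0.5
    return 0.5 * log(2.0 * pi) + (z + 0.5) * log(t) - t + log(x)

for z in [0.1, 0.5, 2.5, 7.3]:
    print(z, lanczos_lgamma(z), lgamma(z))   # should agree to near machine precision
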
198 | """ 199 | lls = [] 200 | best_ll = -1e6 201 | best_lls = None 202 | best_sigma = None 203 | for sigma in sigmas: 204 | print sigma 205 | parzen = theano_parzen(samples, sigma) 206 | tmp = get_nll(data, parzen, batch_size=batch_size) 207 | sigma_lls = np.asarray(tmp) 208 | mean_ll = sigma_lls.mean() 209 | lls.append(mean_ll) 210 | if (mean_ll > best_ll): 211 | best_ll = mean_ll 212 | best_lls = sigma_lls 213 | best_sigma = sigma 214 | del parzen 215 | gc.collect() 216 | return [best_sigma, best_ll, best_lls] -------------------------------------------------------------------------------- /generative_models/MCSampler.py: -------------------------------------------------------------------------------- 1 | ################################################################## 2 | # CODE FOR EFFICIENTLY SAMPLING A (SMALL) FIXED-LENGTH VAE CHAIN # 3 | ################################################################## 4 | 5 | # basic python 6 | import numpy as np 7 | import numpy.random as npr 8 | from collections import OrderedDict 9 | 10 | # theano business 11 | import theano 12 | import theano.tensor as T 13 | #from theano.tensor.shared_randomstreams import RandomStreams as RandStream 14 | from theano.sandbox.cuda.rng_curand import CURAND_RandomStreams as RandStream 15 | 16 | # phil's sweetness 17 | from GIPair import GIPair 18 | from NetLayers import apply_mask 19 | 20 | 21 | class MCSampler(object): 22 | """ 23 | Class for quickly sampling some small fixed number of steps from the 24 | Markov chain constructed by self-looping a variational auto-encoder. 25 | 26 | Parameters: 27 | rng: numpy.random.RandomState (for reproducibility) 28 | Xd: symbolic var for providing points for starting the Markov Chain 29 | i_net: The InfNet instance that will serve as the inferencer 30 | g_net: The GenNet instance that will serve as the generator 31 | d_net: The PeaNet instance that will serve as the discriminator 32 | chain_len: number of steps to unroll the VAE Markov Chain 33 | data_dim: dimension of the generated data 34 | prior_dim: dimension of the model prior 35 | """ 36 | def __init__(self, rng=None, Xd=None, \ 37 | i_net=None, g_net=None, chain_len=None, \ 38 | data_dim=None, prior_dim=None): 39 | # Do some stuff! 40 | self.rng = RandStream(rng.randint(100000)) 41 | self.data_dim = data_dim 42 | self.prior_dim = prior_dim 43 | 44 | # symbolic var for inputting samples for initializing the VAE chain 45 | self.Xd = Xd 46 | # symbolic var for masking subsets of the state variables 47 | self.Xm = T.zeros_like(self.Xd) 48 | # symbolic var for controlling subsets of the state variables 49 | self.Xc = T.zeros_like(self.Xd) 50 | # integer number of times to cycle the VAE loop 51 | self.chain_len = chain_len 52 | 53 | # get a clone of the desired VAE, for easy access 54 | self.GIP = GIPair(rng=rng, Xd=self.Xd, Xc=self.Xc, Xm=self.Xm, \ 55 | g_net=g_net, i_net=i_net, data_dim=self.data_dim, \ 56 | prior_dim=self.prior_dim, params=None, shared_param_dicts=None) 57 | self.IN = self.GIP.IN 58 | self.GN = self.GIP.GN 59 | self.use_encoder = self.IN.use_encoder 60 | assert(self.use_encoder == self.GN.use_decoder) 61 | # self-loop some clones of the main VAE into a chain. 62 | # ** All VAEs in the chain share the same Xc and Xm, which are the 63 | # symbolic inputs for providing the observed portion of the input 64 | # and a mask indicating which part of the input is "observed". 65 | # These inputs are used for training "reconstruction" policies. 
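
Editor's note: purely as a caricature of the self-loop structure described above (and nothing more), the chain alternates inference and generation starting from data; here a noisy linear "encoder" and "decoder" stand in for the InfNet/GenNet pair, for illustration only:

import numpy as np
rng = np.random.RandomState(0)

def toy_encode(x):               # stand-in for the InfNet inferencer
    return 0.8 * x + 0.1 * rng.randn(*x.shape)

def toy_decode(z):               # stand-in for the GenNet generator
    return 1.25 * z

def sample_chain(x0, chain_len):
    chain = [x0]
    x = x0
    for _ in range(chain_len):
        x = toy_decode(toy_encode(x))   # one self-loop (VAE Markov chain) step
        chain.append(x)
    return chain

x0 = rng.randn(5, 3)             # five starting "observations"
chain = sample_chain(x0, chain_len=9)
print(len(chain), chain[-1].shape)
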
66 | self.IN_chain = [] 67 | self.GN_chain = [] 68 | self.Xg_chain = [] 69 | _Xd = self.Xd 70 | for i in range(self.chain_len): 71 | if (i == 0): 72 | # start the chain with data provided by used 73 | _IN = self.IN.shared_param_clone(rng=rng, \ 74 | Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm)) 75 | _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output) 76 | else: 77 | # continue the chain with samples from previous VAE 78 | _IN = self.IN.shared_param_clone(rng=rng, \ 79 | Xd=apply_mask(Xd=_Xd, Xc=self.Xc, Xm=self.Xm)) 80 | _GN = self.GN.shared_param_clone(rng=rng, Xp=_IN.output) 81 | if self.use_encoder: 82 | # use the "decoded" output of the previous generator as input 83 | # to the next inferencer, which will re-encode it prior to 84 | # inference 85 | _Xd = _GN.output_decoded 86 | else: 87 | # use the "encoded" output of the previous generator as input 88 | # to the next inferencer, as the inferencer won't try to 89 | # re-encode it prior to inference 90 | _Xd = _GN.output 91 | self.IN_chain.append(_IN) 92 | self.GN_chain.append(_GN) 93 | self.Xg_chain.append(_Xd) 94 | 95 | # construct the function for training on training data 96 | self.sample_from_chain = self._construct_sample_from_chain() 97 | return 98 | 99 | def _construct_sample_from_chain(self): 100 | """ 101 | Sample for several steps of a self-looped VAE. 102 | """ 103 | outputs = [Xg for Xg in self.Xg_chain] 104 | sample_func = theano.function([self.Xd], outputs=outputs) 105 | return sample_func 106 | 107 | def resample_chain_steps(MCS, Xtr_chains): 108 | # get and set some basic dataset information 109 | assert(len(Xtr_chains) == (MCS.chain_len + 1)) 110 | Xtr = Xtr_chains[0] 111 | for Xc in Xtr_chains: 112 | assert(Xc.shape[0] == Xtr.shape[0]) 113 | assert(Xc.shape[1] == Xtr.shape[1]) 114 | tr_samples = Xtr.shape[0] 115 | data_dim = Xtr.shape[1] 116 | batch_size = 5000 117 | batch_count = int(np.ceil(tr_samples / float(batch_size))) 118 | # print("Resampling {0:d} batches of {1:d} chains with {2:d} steps...".format(batch_count, batch_size, MCS.chain_len)) 119 | for i in range(batch_count): 120 | batch_start = i * batch_size 121 | batch_end = min(tr_samples, (batch_start + batch_size)) 122 | batch_Xd = Xtr[batch_start:batch_end] 123 | batch_chains = MCS.sample_from_chain(batch_Xd) 124 | for j in range(len(batch_chains)): 125 | Xtr_chains[j+1][batch_start:batch_end] = batch_chains[j] 126 | return Xtr_chains 127 | 128 | 129 | if __name__=="__main__": 130 | import utils 131 | import time 132 | from load_data import load_udm 133 | import InfNet as INet 134 | import GenNet as GNet 135 | # Initialize a source of randomness 136 | rng = npr.RandomState(12345) 137 | # Load some data to train/validate/test with 138 | dataset = 'data/mnist.pkl.gz' 139 | datasets = load_udm(dataset, zero_mean=False) 140 | Xtr = datasets[0][0] 141 | Xtr = Xtr.get_value(borrow=False) 142 | Xva = datasets[1][0] 143 | Xva = Xva.get_value(borrow=False) 144 | print("Xtr.shape: {0:s}, Xva.shape: {1:s}".format(str(Xtr.shape),str(Xva.shape))) 145 | 146 | # get and set some basic dataset information 147 | tr_samples = Xtr.shape[0] 148 | data_dim = Xtr.shape[1] 149 | batch_size = 2000 150 | batch_count = int(np.ceil(tr_samples / float(batch_size))) 151 | 152 | # Symbolic inputs 153 | Xd = T.matrix(name='Xd') 154 | Xc = T.matrix(name='Xc') 155 | Xm = T.matrix(name='Xm') 156 | Xt = T.matrix(name='Xt') 157 | Xp = T.matrix(name='Xp') 158 | 159 | # Load inferencer and generator from saved parameters 160 | gn_fname = 
"MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_GN.pkl" 161 | in_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_IN.pkl" 162 | IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) 163 | GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) 164 | IN.set_sigma_scale(1.25) 165 | prior_dim = GN.latent_dim 166 | 167 | MCS = MCSampler(rng=rng, Xd=Xd, i_net=IN, g_net=GN, chain_len=9, \ 168 | data_dim=data_dim, prior_dim=prior_dim) 169 | 170 | Xtr_chains = [Xtr] 171 | for i in range(MCS.chain_len): 172 | Xtr_chains.append(0.0*Xtr) 173 | 174 | print("Testing chain sampler....") 175 | loop_times = [] 176 | # TESTING SAMPLING SPEED! 177 | for i in range(batch_count): 178 | start_time = time.clock() 179 | batch_start = i * batch_size 180 | batch_end = min(tr_samples, (batch_start + batch_size)) 181 | Xd_batch = Xtr[batch_start:batch_end] 182 | Xd_chain = MCS.sample_from_chain(Xd_batch) 183 | Xs = [Xd_batch[0:50]] 184 | Xs.extend([xd[0:50] for xd in Xd_chain]) 185 | file_name = "MCS_TEST_{0:d}.png".format(i) 186 | utils.visualize_samples(np.vstack(Xs), file_name, num_rows=10) 187 | loop_times.append((time.clock() - start_time)) 188 | total_time = sum(loop_times) 189 | mean_time = total_time / batch_count 190 | time_std = sum([(t - mean_time)**2.0 for t in loop_times]) / batch_count 191 | print("total_time: {0:.4f}".format(total_time)) 192 | print("mean_time: {0:.4f}, time_std: {1:.4f}".format(mean_time, time_std)) 193 | start_time = time.clock() 194 | Xtr_chains = resample_chain_steps(MCS, Xtr_chains) 195 | total_time = time.clock() - start_time 196 | print("total_time: {0:.4f}".format(total_time)) 197 | 198 | 199 | 200 | 201 | 202 | ############## 203 | # EYE BUFFER # 204 | ############## 205 | -------------------------------------------------------------------------------- /generative_models/MnistWalkReg.py: -------------------------------------------------------------------------------- 1 | import time 2 | import utils as utils 3 | import numpy as np 4 | import numpy.random as npr 5 | import theano 6 | import theano.tensor as T 7 | 8 | from load_data import load_udm, load_udm_ss, load_mnist 9 | from PeaNet import PeaNet, load_peanet_from_file 10 | from InfNet import InfNet, load_infnet_from_file 11 | from GenNet import GenNet, load_gennet_from_file 12 | from PeaNetSeq import PeaNetSeq 13 | from GIPair import GIPair 14 | from NetLayers import relu_actfun, softplus_actfun, \ 15 | safe_softmax, safe_log 16 | import GenNet as GNet 17 | import InfNet as INet 18 | import PeaNet as PNet 19 | from DKCode import PCA_theano 20 | from MCSampler import MCSampler, resample_chain_steps 21 | 22 | def downsample_chains(X_chain, stride=1): 23 | Xs = [X_chain[i] for i in range(len(X_chain)) if ((i % stride) == 0)] 24 | return Xs 25 | 26 | 27 | def manifold_walk_regularization(): 28 | 29 | for t_num in range(10): 30 | out_file = open("MWR_TEST_RESULTS_{0:d}.txt".format(t_num), 'wb') 31 | 32 | # Initialize a source of randomness 33 | rng = np.random.RandomState(t_num) 34 | 35 | # Load some data to train/validate/test with 36 | sup_count = 600 37 | dataset = 'data/mnist.pkl.gz' 38 | datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) 39 | Xtr_su = datasets[0][0].get_value(borrow=False) 40 | Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32) 41 | Xtr_un = datasets[1][0].get_value(borrow=False) 42 | Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32) 43 | 44 | # get the joint labeled and unlabeled data 45 | Xtr_un = np.vstack([Xtr_su, 
Xtr_un]).astype(theano.config.floatX) 46 | Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]) 47 | Ytr_un = 0 * Ytr_un # KEEP CATS FIXED OR FREE? YES/NO? 48 | Xtr_mean = np.mean(Xtr_un, axis=0, keepdims=True) 49 | # get the labeled data 50 | Xtr_su = Xtr_su.astype(theano.config.floatX) 51 | Ytr_su = Ytr_su[:,np.newaxis] 52 | # get observations and labels for the validation set 53 | Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) 54 | Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) 55 | Yva = Yva[:,np.newaxis] # numpy is dumb 56 | # get observations and labels for the test set 57 | Xte = datasets[3][0].get_value(borrow=False).astype(theano.config.floatX) 58 | Yte = datasets[3][1].get_value(borrow=False).astype(np.int32) 59 | Yte = Yte[:,np.newaxis] # numpy is dumb 60 | # get size information for the data and training batches 61 | un_samples = Xtr_un.shape[0] 62 | su_samples = Xtr_su.shape[0] 63 | va_samples = Xva.shape[0] 64 | data_dim = Xtr_su.shape[1] 65 | label_dim = 10 66 | batch_size = 100 67 | 68 | # Symbolic inputs 69 | Xd = T.matrix(name='Xd') 70 | Xc = T.matrix(name='Xc') 71 | Xm = T.matrix(name='Xm') 72 | Xt = T.matrix(name='Xt') 73 | Xp = T.matrix(name='Xp') 74 | Yd = T.icol('Yd') 75 | 76 | # Load inferencer and generator from saved parameters 77 | gn_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_GN.pkl" 78 | in_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_IN.pkl" 79 | IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) 80 | GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) 81 | IN.set_sigma_scale(1.3) 82 | prior_dim = GN.latent_dim 83 | 84 | MCS = MCSampler(rng=rng, Xd=Xd, i_net=IN, g_net=GN, chain_len=2, \ 85 | data_dim=data_dim, prior_dim=prior_dim) 86 | full_chain_len = MCS.chain_len + 1 87 | 88 | # setup "chain" versions of the labeled/unlabeled/validate sets 89 | Xtr_su_chains = [Xtr_su.copy() for i in range(full_chain_len)] 90 | Xtr_un_chains = [Xtr_un.copy() for i in range(full_chain_len)] 91 | Ytr_su_chains = [Ytr_su for i in range(full_chain_len)] 92 | Ytr_un_chains = [Ytr_un for i in range(full_chain_len)] 93 | Xva_chains = [Xva for i in range(full_chain_len)] 94 | Yva_chains = [Yva for i in range(full_chain_len)] 95 | 96 | # downsample, to feed less into the PNS 97 | Xtr_su_short = downsample_chains(Xtr_su_chains, stride=1) 98 | Xtr_un_short = downsample_chains(Xtr_un_chains, stride=1) 99 | Ytr_su_short = downsample_chains(Ytr_su_chains, stride=1) 100 | Ytr_un_short = downsample_chains(Ytr_un_chains, stride=1) 101 | Xva_short = downsample_chains(Xva_chains, stride=1) 102 | Yva_short = downsample_chains(Yva_chains, stride=1) 103 | short_chain_len = len(Xtr_su_short) 104 | print("REGULARIZATION CHAIN STEPS: {0:d}".format(short_chain_len)) 105 | 106 | # choose some parameters for the categorical inferencer 107 | pn_params = {} 108 | pc0 = [data_dim, 800, 800, label_dim] 109 | pn_params['proto_configs'] = [pc0] 110 | # Set up some spawn networks 111 | sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} 112 | pn_params['spawn_configs'] = [ sc0 ] 113 | pn_params['spawn_weights'] = [ 1.0 ] 114 | # Set remaining params 115 | pn_params['activation'] = relu_actfun 116 | pn_params['init_scale'] = 0.5 117 | pn_params['lam_l2a'] = 1e-3 118 | pn_params['vis_drop'] = 0.2 119 | pn_params['hid_drop'] = 0.5 120 | 121 | # Initialize the base network for this PNSeq 122 | PN = PeaNet(rng=rng, Xd=Xd, params=pn_params) 123 | PN.init_biases(0.1) 124 | 125 | 
print("Initializing PNS...") 126 | # Initialize the PeaNetSeq 127 | PNS = PeaNetSeq(rng=rng, pea_net=PN, seq_len=short_chain_len, \ 128 | seq_Xd=None, params=None) 129 | 130 | # set weighting parameters for the various costs... 131 | PNS.set_lam_class(1.0) 132 | PNS.set_lam_pea_su(0.0) 133 | PNS.set_lam_pea_un(2.0) 134 | PNS.set_lam_ent(0.0) 135 | PNS.set_lam_l2w(1e-5) 136 | 137 | learn_rate = 0.05 138 | PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999) 139 | for i in range(300000): 140 | if i < 5000: 141 | scale = float(i + 1) / 5000.0 142 | if ((i+1 % 100000) == 0): 143 | learn_rate = learn_rate * 0.5 144 | if ((i % 250) == 0): 145 | Xtr_su_chains = resample_chain_steps(MCS, Xtr_su_chains) 146 | Xtr_un_chains = resample_chain_steps(MCS, Xtr_un_chains) 147 | Xtr_su_short = downsample_chains(Xtr_su_chains, stride=1) 148 | Xtr_un_short = downsample_chains(Xtr_un_chains, stride=1) 149 | # get some data to train with 150 | su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) 151 | xsuc = [(x.take(su_idx, axis=0) - Xtr_mean) for x in Xtr_su_short] 152 | ysuc = [y.take(su_idx, axis=0) for y in Ytr_su_short] 153 | un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) 154 | xunc = [(x.take(un_idx, axis=0) - Xtr_mean) for x in Xtr_un_short] 155 | yunc = [y.take(un_idx, axis=0) for y in Ytr_un_short] 156 | Xb_chains = [np.vstack((xsu, xun)) for (xsu, xun) in zip(xsuc, xunc)] 157 | Yb_chains = [np.vstack((ysu, yun)) for (ysu, yun) in zip(ysuc, yunc)] 158 | # set learning parameters for this update 159 | PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999) 160 | # do a minibatch update of all PeaNet parameters 161 | outputs = PNS.train_joint(*(Xb_chains + Yb_chains)) 162 | joint_cost = 1.0 * outputs[0] 163 | class_cost = 1.0 * outputs[1] 164 | pea_cost = 1.0 * outputs[2] 165 | ent_cost = 1.0 * outputs[3] 166 | other_reg_cost = 1.0 * outputs[4] 167 | assert(not (np.isnan(joint_cost))) 168 | if ((i % 500) == 0): 169 | o_str = "batch: {0:d}, joint: {1:.4f}, class: {2:.4f}, pea: {3:.4f}, ent: {4:.4f}, other_reg: {5:.4f}".format( \ 170 | i, joint_cost, class_cost, pea_cost, ent_cost, other_reg_cost) 171 | print(o_str) 172 | out_file.write(o_str+"\n") 173 | out_file.flush() 174 | # check classification error on training and validation set 175 | train_err = PNS.classification_error(Xtr_su-Xtr_mean, Ytr_su) 176 | va_err = PNS.classification_error(Xva-Xtr_mean, Yva) 177 | o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) 178 | print(o_str) 179 | out_file.write(o_str+"\n") 180 | out_file.flush() 181 | if ((i % 1000) == 0): 182 | # draw the main PeaNet's first-layer filters/weights 183 | file_name = "MWR_PN_WEIGHTS.png".format(i) 184 | utils.visualize_net_layer(PNS.PN.proto_nets[0][0], file_name) 185 | print("TESTING COMPLETE!") 186 | 187 | if __name__ == "__main__": 188 | manifold_walk_regularization() -------------------------------------------------------------------------------- /generative_models/SVHNWalkReg.py: -------------------------------------------------------------------------------- 1 | import time 2 | import utils as utils 3 | import numpy as np 4 | import numpy.random as npr 5 | import theano 6 | import theano.tensor as T 7 | 8 | from load_data import load_svhn, load_svhn_gray, load_svhn_all_gray_zca 9 | from PeaNet import PeaNet, load_peanet_from_file 10 | from InfNet import InfNet, load_infnet_from_file 11 | from GenNet import GenNet, load_gennet_from_file 12 | from PeaNetSeq import PeaNetSeq 13 | from VCGLoop import VCGLoop 14 | 
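# Editor's note (hedged): in the manifold_walk_regularization loop above, the
# learning-rate decay test `if ((i+1 % 100000) == 0):` parses as i + (1 % 100000)
# because % binds tighter than +, so for i >= 0 the decay never fires; the
# intended check was presumably ((i + 1) % 100000) == 0. A one-line demonstration:
i = 99999
print((i + 1 % 100000) == 0)     # False: parses as i + (1 % 100000)
print(((i + 1) % 100000) == 0)   # True: fires every 100000 batches as intended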
from GIPair import GIPair 15 | from NetLayers import relu_actfun, softplus_actfun, \ 16 | safe_softmax, safe_log 17 | import GenNet as GNet 18 | import InfNet as INet 19 | import PeaNet as PNet 20 | from DKCode import PCA_theano 21 | 22 | import sys, resource 23 | resource.setrlimit(resource.RLIMIT_STACK, (2**29,-1)) 24 | sys.setrecursionlimit(10**6) 25 | 26 | # DERP 27 | RESULT_PATH = "SVHN_WALKOUT_TEST_KLD/" 28 | 29 | #################### 30 | # HELPER FUNCTIONS # 31 | #################### 32 | 33 | def shift_and_scale_into_01(X): 34 | X = X - np.min(X, axis=1, keepdims=True) 35 | X = X / np.max(X, axis=1, keepdims=True) 36 | return X 37 | 38 | def train_valid_split(X, valid_count=1000): 39 | """ 40 | Split the observations in the rows of X into train/validate sets. 41 | """ 42 | obs_count = X.shape[0] 43 | idx = np.arange(obs_count) 44 | npr.shuffle(idx) 45 | va_idx = idx[:valid_count] 46 | tr_idx = idx[valid_count:] 47 | Xtr = X.take(tr_idx, axis=0) 48 | Xva = X.take(va_idx, axis=0) 49 | return Xtr, Xva 50 | 51 | ########################################## 52 | ########################################## 53 | ## TEST SEMISUPERVISED LEARNING ON SVHN ## 54 | ########################################## 55 | ########################################## 56 | 57 | def test_semisupervised(): 58 | import utils as utils 59 | from load_data import load_udm, load_udm_ss, load_mnist 60 | from NetLayers import relu_actfun 61 | 62 | # Initialize a source of randomness 63 | rng = np.random.RandomState(123) 64 | 65 | sup_count = 1000 66 | va_count = 10000 67 | # Load some data to train/validate/test with 68 | tr_file = 'data/svhn_train_gray.pkl' 69 | te_file = 'data/svhn_test_gray.pkl' 70 | ex_file = 'data/svhn_extra_gray.pkl' 71 | data = load_svhn_gray(tr_file, te_file, ex_file=ex_file, ex_count=200000) 72 | X_mean = np.mean(data['Xtr'], axis=0, keepdims=True) 73 | X_std = np.std(data['Xtr'], axis=0, keepdims=True) 74 | data['Xtr'] = (data['Xtr'] - X_mean) / X_std 75 | data['Xte'] = (data['Xte'] - X_mean) / X_std 76 | data['Xex'] = (data['Xex'] - X_mean) / X_std 77 | idx = np.arange(data['Xtr'].shape[0]) 78 | npr.shuffle(idx) 79 | Xva = data['Xte'][:,:] #[idx[0:va_count],:] 80 | Yva = data['Yte'][:,:].astype(np.int32) # [idx[0:va_count],:].astype(np.int32) 81 | Xtr_su = data['Xtr'][idx[va_count:(va_count+sup_count)], :] 82 | Ytr_su = data['Ytr'][idx[va_count:(va_count+sup_count)], :].astype(np.int32) 83 | Xtr_un = np.vstack([data['Xtr'][idx[va_count:], :], data['Xex']]) 84 | Ytr_un = np.zeros((Xtr_un.shape[0],1)).astype(np.int32) 85 | print("unique(Ytr_su): {0:s}".format(str(np.unique(Ytr_su)))) 86 | print("unique(Ytr_un): {0:s}".format(str(np.unique(Ytr_un)))) 87 | print("Xtr_su.shape: {0:s}, Ytr_su.shape: {1:s}".format(str(Xtr_su.shape), str(Ytr_su.shape))) 88 | print("Xva.shape: {0:s}, Yva.shape: {1:s}".format(str(Xva.shape), str(Yva.shape))) 89 | 90 | un_samples = Xtr_un.shape[0] 91 | su_samples = Xtr_su.shape[0] 92 | va_samples = Xva.shape[0] 93 | 94 | # set up some symbolic variables for input to the GITrip 95 | Xd = T.matrix('Xd_base') 96 | Yd = T.icol('Yd_base') 97 | # set some "shape" parameters for the networks 98 | data_dim = Xtr_un.shape[1] 99 | label_dim = 10 100 | batch_size = 200 # we'll take 2x this per batch, for sup and unsup 101 | 102 | # choose some parameters for the categorical inferencer 103 | pn_params = {} 104 | pc0 = [data_dim, 800, 800, label_dim] 105 | pn_params['proto_configs'] = [pc0] 106 | # Set up some spawn networks 107 | sc0 = {'proto_key': 0, 'input_noise': 0.1, 
'bias_noise': 0.2, 'do_dropout': True} 108 | pn_params['spawn_configs'] = [ sc0 ] 109 | pn_params['spawn_weights'] = [ 1.0 ] 110 | # Set remaining params 111 | pn_params['activation'] = relu_actfun 112 | pn_params['init_scale'] = 0.5 113 | pn_params['lam_l2a'] = 1e-3 114 | pn_params['vis_drop'] = 0.2 115 | pn_params['hid_drop'] = 0.5 116 | 117 | # Initialize the base network for this PNSeq 118 | PN = PeaNet(rng=rng, Xd=Xd, params=pn_params) 119 | PN.init_biases(0.1) 120 | 121 | # Initialize the PeaNetSeq 122 | PNS = PeaNetSeq(rng=rng, pea_net=PN, seq_len=2, seq_Xd=None, params=None) 123 | 124 | # set weighting parameters for the various costs... 125 | PNS.set_lam_class(1.0) 126 | PNS.set_lam_pea_su(0.0) 127 | PNS.set_lam_pea_un(1.0) 128 | PNS.set_lam_ent(0.0) 129 | PNS.set_lam_l2w(1e-5) 130 | 131 | out_file = open("SVHN_SS_TEST.txt", 'wb') 132 | cost_1 = [0. for i in range(10)] 133 | learn_rate = 0.02 134 | PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999) 135 | for i in range(300000): 136 | # get some data to train with 137 | su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) 138 | Xd_su = Xtr_su.take(su_idx, axis=0) 139 | Yd_su = Ytr_su.take(su_idx, axis=0) 140 | un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) 141 | Xd_un = Xtr_un.take(un_idx, axis=0) 142 | Yd_un = Ytr_un.take(un_idx, axis=0) 143 | Xd_batch = np.vstack((Xd_su, Xd_un)) 144 | Yd_batch = np.vstack((Yd_su, Yd_un)) 145 | # set learning parameters for this update 146 | PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999) 147 | # do a minibatch update of all PeaNet parameters 148 | outputs = PNS.train_joint(Xd_batch, Xd_batch, Yd_batch, Yd_batch) 149 | cost_1 = [(cost_1[k] + 1.*outputs[k]) for k in range(len(outputs))] 150 | if ((i % 1000) == 0): 151 | cost_1 = [(v / 1000.) for v in cost_1] 152 | o_str = "batch: {0:d}, joint: {1:.4f}, class: {2:.4f}, pea: {3:.4f}, ent: {4:.4f}, other_reg: {5:.4f}".format( \ 153 | i, cost_1[0], cost_1[1], cost_1[2], cost_1[3], cost_1[4]) 154 | print(o_str) 155 | out_file.write(o_str+"\n") 156 | out_file.flush() 157 | cost_1 = [0. 
for v in cost_1] 158 | # check classification error on training and validation set 159 | train_err = PNS.classification_error(Xtr_su, Ytr_su) 160 | va_err = PNS.classification_error(Xva, Yva) 161 | o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) 162 | print(o_str) 163 | out_file.write(o_str+"\n") 164 | out_file.flush() 165 | if ((i % 1000) == 0): 166 | # draw the main PeaNet's first-layer filters/weights 167 | file_name = "SVHN_SS_PN_WEIGHTS.png".format(i) 168 | utils.visualize_net_layer(PNS.PN.proto_nets[0][0], file_name) 169 | print("TESTING COMPLETE!") 170 | 171 | 172 | 173 | if __name__=="__main__": 174 | test_semisupervised() -------------------------------------------------------------------------------- /generative_models/TFDWalkoutTest.py: -------------------------------------------------------------------------------- 1 | import time 2 | import utils as utils 3 | import numpy as np 4 | import numpy.random as npr 5 | import theano 6 | import theano.tensor as T 7 | 8 | from load_data import load_tfd 9 | from PeaNet import PeaNet, load_peanet_from_file 10 | from InfNet import InfNet, load_infnet_from_file 11 | from HydraNet import HydraNet, load_hydranet_from_file 12 | from VCGLoop import VCGLoop 13 | from OneStageModel import OneStageModel 14 | from NetLayers import relu_actfun, softplus_actfun, \ 15 | safe_softmax, row_shuffle 16 | from HelperFuncs import sample_masks, sample_patch_masks, posterior_klds, \ 17 | collect_obs_costs 18 | 19 | import sys, resource 20 | resource.setrlimit(resource.RLIMIT_STACK, (2**29,-1)) 21 | sys.setrecursionlimit(10**6) 22 | 23 | # DERP 24 | RESULT_PATH = "TFD_WALKOUT_TEST_KLD/" 25 | #RESULT_PATH = "TFD_WALKOUT_TEST_VAE/" 26 | #RESULT_PATH = "TFD_WALKOUT_TEST_MAX_KLD/" 27 | PRIOR_DIM = 100 28 | LOGVAR_BOUND = 6.0 29 | 30 | ########################################### 31 | ########################################### 32 | ## VAE PRETRAINING FOR THE OneStageModel ## 33 | ########################################### 34 | ########################################### 35 | 36 | def pretrain_osm(lam_kld=0.0): 37 | # Initialize a source of randomness 38 | rng = np.random.RandomState(1234) 39 | 40 | # Load some data to train/validate/test with 41 | data_file = 'data/tfd_data_48x48.pkl' 42 | dataset = load_tfd(tfd_pkl_name=data_file, which_set='unlabeled', fold='all') 43 | Xtr_unlabeled = dataset[0] 44 | dataset = load_tfd(tfd_pkl_name=data_file, which_set='train', fold='all') 45 | Xtr_train = dataset[0] 46 | Xtr = np.vstack([Xtr_unlabeled, Xtr_train]) 47 | dataset = load_tfd(tfd_pkl_name=data_file, which_set='valid', fold='all') 48 | Xva = dataset[0] 49 | tr_samples = Xtr.shape[0] 50 | va_samples = Xva.shape[0] 51 | batch_size = 200 52 | batch_reps = 1 53 | 54 | # setup some symbolic variables and stuff 55 | Xd = T.matrix('Xd_base') 56 | data_dim = Xtr.shape[1] 57 | Xtr_mean = np.mean(Xtr, axis=0) 58 | 59 | ########################## 60 | # NETWORK CONFIGURATIONS # 61 | ########################## 62 | gn_params = {} 63 | shared_config = [PRIOR_DIM, 1500, 1500] 64 | output_config = [data_dim, data_dim] 65 | gn_params['shared_config'] = shared_config 66 | gn_params['output_config'] = output_config 67 | gn_params['activation'] = relu_actfun 68 | gn_params['init_scale'] = 1.2 69 | gn_params['lam_l2a'] = 0.0 70 | gn_params['vis_drop'] = 0.0 71 | gn_params['hid_drop'] = 0.0 72 | gn_params['bias_noise'] = 0.0 73 | gn_params['input_noise'] = 0.0 74 | # choose some parameters for the continuous inferencer 75 | in_params = {} 76 | shared_config = 
[data_dim, 1500, 1500] 77 | top_config = [shared_config[-1], PRIOR_DIM] 78 | in_params['shared_config'] = shared_config 79 | in_params['mu_config'] = top_config 80 | in_params['sigma_config'] = top_config 81 | in_params['activation'] = relu_actfun 82 | in_params['init_scale'] = 1.2 83 | in_params['lam_l2a'] = 0.0 84 | in_params['vis_drop'] = 0.0 85 | in_params['hid_drop'] = 0.0 86 | in_params['bias_noise'] = 0.0 87 | in_params['input_noise'] = 0.0 88 | # Initialize the base networks for this OneStageModel 89 | IN = InfNet(rng=rng, Xd=Xd, \ 90 | params=in_params, shared_param_dicts=None) 91 | GN = HydraNet(rng=rng, Xd=Xd, \ 92 | params=gn_params, shared_param_dicts=None) 93 | # Initialize biases in IN and GN 94 | IN.init_biases(0.2) 95 | GN.init_biases(0.2) 96 | 97 | ###################################### 98 | # LOAD AND RESTART FROM SAVED PARAMS # 99 | ###################################### 100 | # gn_fname = RESULT_PATH+"pt_osm_params_b110000_GN.pkl" 101 | # in_fname = RESULT_PATH+"pt_osm_params_b110000_IN.pkl" 102 | # IN = load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd, \ 103 | # new_params=None) 104 | # GN = load_hydranet_from_file(f_name=gn_fname, rng=rng, Xd=Xd, \ 105 | # new_params=None) 106 | # in_params = IN.params 107 | # gn_params = GN.params 108 | 109 | ######################### 110 | # INITIALIZE THE GIPAIR # 111 | ######################### 112 | osm_params = {} 113 | osm_params['x_type'] = 'gaussian' 114 | osm_params['xt_transform'] = 'sigmoid' 115 | osm_params['logvar_bound'] = LOGVAR_BOUND 116 | OSM = OneStageModel(rng=rng, x_in=Xd, \ 117 | p_x_given_z=GN, q_z_given_x=IN, \ 118 | x_dim=data_dim, z_dim=PRIOR_DIM, params=osm_params) 119 | OSM.set_lam_l2w(1e-4) 120 | 121 | ###################### 122 | # BASIC VAE TRAINING # 123 | ###################### 124 | out_file = open(RESULT_PATH+"pt_osm_results.txt", 'wb') 125 | # Set initial learning rate and basic SGD hyper parameters 126 | obs_costs = np.zeros((batch_size,)) 127 | costs = [0. 
for i in range(10)] 128 | learn_rate = 0.0002 129 | momentum = 0.8 130 | for i in range(200000): 131 | kld_scale = min(1.0, float(i) / 20000.0) 132 | if ((i > 1) and ((i % 10000) == 0)): 133 | learn_rate = learn_rate * 0.9 134 | # do a minibatch update of the model, and compute some costs 135 | tr_idx = npr.randint(low=0,high=tr_samples,size=(batch_size,)) 136 | Xb = Xtr.take(tr_idx, axis=0) 137 | # do a minibatch update of the model, and compute some costs 138 | OSM.set_sgd_params(lr=learn_rate, mom_1=momentum, mom_2=0.98) 139 | OSM.set_lam_nll(1.0) 140 | OSM.set_lam_kld(lam_kld_1=(1.0 + (kld_scale * (lam_kld - 1.0))), \ 141 | lam_kld_2=0.0) 142 | result = OSM.train_joint(Xb, batch_reps) 143 | costs = [(costs[j] + result[j]) for j in range(len(result))] 144 | if ((i % 1000) == 0): 145 | # record and then reset the cost trackers 146 | costs = [(v / 1000.0) for v in costs] 147 | str_1 = "-- batch {0:d} --".format(i) 148 | str_2 = " joint_cost: {0:.4f}".format(costs[0]) 149 | str_3 = " nll_cost : {0:.4f}".format(costs[1]) 150 | str_4 = " kld_cost : {0:.4f}".format(costs[2]) 151 | str_5 = " reg_cost : {0:.4f}".format(costs[3]) 152 | costs = [0.0 for v in costs] 153 | # print out some diagnostic information 154 | joint_str = "\n".join([str_1, str_2, str_3, str_4, str_5]) 155 | print(joint_str) 156 | out_file.write(joint_str+"\n") 157 | out_file.flush() 158 | if ((i % 2000) == 0): 159 | Xva = row_shuffle(Xva) 160 | model_samps = OSM.sample_from_prior(500) 161 | file_name = RESULT_PATH+"pt_osm_samples_b{0:d}_XG.png".format(i) 162 | utils.visualize_samples(model_samps, file_name, num_rows=20) 163 | file_name = RESULT_PATH+"pt_osm_inf_weights_b{0:d}.png".format(i) 164 | utils.visualize_samples(OSM.inf_weights.get_value(borrow=False).T, \ 165 | file_name, num_rows=30) 166 | file_name = RESULT_PATH+"pt_osm_gen_weights_b{0:d}.png".format(i) 167 | utils.visualize_samples(OSM.gen_weights.get_value(borrow=False), \ 168 | file_name, num_rows=30) 169 | # compute information about free-energy on validation set 170 | file_name = RESULT_PATH+"pt_osm_free_energy_b{0:d}.png".format(i) 171 | fe_terms = OSM.compute_fe_terms(Xva[0:2500], 20) 172 | fe_mean = np.mean(fe_terms[0]) + np.mean(fe_terms[1]) 173 | fe_str = " nll_bound : {0:.4f}".format(fe_mean) 174 | print(fe_str) 175 | out_file.write(fe_str+"\n") 176 | utils.plot_scatter(fe_terms[1], fe_terms[0], file_name, \ 177 | x_label='Posterior KLd', y_label='Negative Log-likelihood') 178 | # compute information about posterior KLds on validation set 179 | file_name = RESULT_PATH+"pt_osm_post_klds_b{0:d}.png".format(i) 180 | post_klds = OSM.compute_post_klds(Xva[0:2500]) 181 | post_dim_klds = np.mean(post_klds, axis=0) 182 | utils.plot_stem(np.arange(post_dim_klds.shape[0]), post_dim_klds, \ 183 | file_name) 184 | if ((i % 5000) == 0): 185 | IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_IN.pkl".format(i)) 186 | GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_b{0:d}_GN.pkl".format(i)) 187 | IN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_IN.pkl") 188 | GN.save_to_file(f_name=RESULT_PATH+"pt_osm_params_GN.pkl") 189 | return 190 | 191 | if __name__=="__main__": 192 | # FOR EXTREME KLD REGULARIZATION 193 | #pretrain_osm(lam_kld=50.0) 194 | #train_walk_from_pretrained_osm(lam_kld=60.0) 195 | 196 | # FOR KLD MODEL 197 | pretrain_osm(lam_kld=15.0) 198 | # train_walk_from_pretrained_osm(lam_kld=15.0) 199 | 200 | # FOR VAE MODEL 201 | #pretrain_osm(lam_kld=1.0) 202 | #train_walk_from_pretrained_osm(lam_kld=1.0) 
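
pretrain_osm above anneals the KL penalty rather than applying it at full strength from the start: `kld_scale` ramps linearly from 0 to 1 over the first 20000 batches, so the effective weight moves from 1.0 up to the target `lam_kld` (15.0 in the run at the bottom of the file). A standalone sketch of that schedule, with illustrative names:

```python
def kld_warmup_weight(step, lam_kld_target, ramp_steps=20000):
    """Linearly interpolate the KL weight from 1.0 to lam_kld_target over
    the first ramp_steps updates, then hold it at the target."""
    kld_scale = min(1.0, float(step) / ramp_steps)
    return 1.0 + kld_scale * (lam_kld_target - 1.0)

assert kld_warmup_weight(0, 15.0) == 1.0        # start out as a plain VAE objective
assert kld_warmup_weight(10000, 15.0) == 8.0    # halfway up the ramp
assert kld_warmup_weight(50000, 15.0) == 15.0   # held at the target afterwards
```

The usual motivation for ramps like this is to let the encoder and decoder settle into reasonable reconstructions before the stronger KL penalty takes over.
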
-------------------------------------------------------------------------------- /generative_models/TestBlocksDDModels.py: -------------------------------------------------------------------------------- 1 | ################################################################## 2 | # Code for testing the variational Multi-Stage Generative Model. # 3 | ################################################################## 4 | 5 | from __future__ import print_function, division 6 | 7 | # basic python 8 | import cPickle as pickle 9 | from PIL import Image 10 | import numpy as np 11 | import numpy.random as npr 12 | from collections import OrderedDict 13 | 14 | # theano business 15 | import theano 16 | import theano.tensor as T 17 | 18 | # blocks stuff 19 | from blocks.initialization import Constant, IsotropicGaussian, Orthogonal 20 | from blocks.filter import VariableFilter 21 | from blocks.graph import ComputationGraph 22 | from blocks.roles import PARAMETER 23 | from blocks.model import Model 24 | from blocks.bricks import Tanh, Identity, Rectifier 25 | from blocks.bricks.cost import BinaryCrossEntropy 26 | from blocks.bricks.recurrent import SimpleRecurrent, LSTM 27 | 28 | # phil's sweetness 29 | import utils 30 | from BlocksModels import * 31 | from load_data import load_udm, load_udm_ss, load_mnist, load_binarized_mnist 32 | from HelperFuncs import row_shuffle, to_fX 33 | 34 | ################################### 35 | ################################### 36 | ## HELPER FUNCTIONS FOR SAMPLING ## 37 | ################################### 38 | ################################### 39 | 40 | def scale_norm(arr): 41 | arr = arr - arr.min() 42 | scale = (arr.max() - arr.min()) 43 | return scale * arr 44 | 45 | def img_grid(arr, global_scale=True): 46 | N, height, width = arr.shape 47 | 48 | rows = int(np.sqrt(N)) 49 | cols = int(np.sqrt(N)) 50 | 51 | if rows*cols < N: 52 | cols = cols + 1 53 | 54 | if rows*cols < N: 55 | rows = rows + 1 56 | 57 | total_height = rows * height 58 | total_width = cols * width 59 | 60 | if global_scale: 61 | arr = scale_norm(arr) 62 | 63 | I = np.zeros((total_height, total_width)) 64 | 65 | for i in xrange(N): 66 | r = i // cols 67 | c = i % cols 68 | 69 | if global_scale: 70 | this = arr[i] 71 | else: 72 | this = scale_norm(arr[i]) 73 | 74 | offset_y, offset_x = r*height, c*width 75 | I[offset_y:(offset_y+height), offset_x:(offset_x+width)] = this 76 | 77 | I = (255*I).astype(np.uint8) 78 | return Image.fromarray(I) 79 | 80 | 81 | ######################################## 82 | ######################################## 83 | ## TEST WITH MODEL-BASED INITIAL STEP ## 84 | ######################################## 85 | ######################################## 86 | 87 | def test_ddm_generation(): 88 | ########################## 89 | # Get some training data # 90 | ########################## 91 | rng = np.random.RandomState(1234) 92 | Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') 93 | Xtr = np.vstack((Xtr, Xva)) 94 | Xva = Xte 95 | #del Xte 96 | tr_samples = Xtr.shape[0] 97 | va_samples = Xva.shape[0] 98 | batch_size = 250 99 | 100 | ############################################################ 101 | # Setup some parameters for the Iterative Refinement Model # 102 | ############################################################ 103 | x_dim = Xtr.shape[1] 104 | enc_dim = 250 105 | dec_dim = 250 106 | mix_dim = 20 107 | z_dim = 100 108 | n_iter = 8 109 | 110 | rnninits = { 111 | 'weights_init': IsotropicGaussian(0.01), 112 | 'biases_init': Constant(0.), 113 | } 114 | inits = { 
115 | 'weights_init': IsotropicGaussian(0.01), 116 | 'biases_init': Constant(0.), 117 | } 118 | 119 | # setup the infinite mixture initialization model 120 | mix_enc_mlp = CondNet([Tanh()], [x_dim, 250, mix_dim], \ 121 | name="mix_enc_mlp", **inits) 122 | mix_dec_mlp = MLP([Tanh(), Tanh()], \ 123 | [mix_dim, 250, (2*enc_dim + 2*dec_dim)], \ 124 | name="mix_dec_mlp", **inits) 125 | # setup the components of the sequential generative model 126 | enc_mlp_in = MLP([Identity()], [(x_dim + dec_dim + dec_dim), 4*enc_dim], \ 127 | name="enc_mlp_in", **inits) 128 | dec_mlp_in = MLP([Identity()], [z_dim, 4*dec_dim], \ 129 | name="dec_mlp_in", **inits) 130 | enc_mlp_out = CondNet([], [enc_dim, z_dim], name="enc_mlp_out", **inits) 131 | dec_mlp_out = CondNet([], [dec_dim, z_dim], name="dec_mlp_out", **inits) 132 | enc_rnn = BiasedLSTM(dim=enc_dim, ig_bias=2.0, fg_bias=2.0, \ 133 | name="enc_rnn", **rnninits) 134 | dec_rnn = BiasedLSTM(dim=dec_dim, ig_bias=2.0, fg_bias=2.0, \ 135 | name="dec_rnn", **rnninits) 136 | # set up the transform from latent space to observation space 137 | s2x_mlp = TanhMLPwFFBP(dec_dim, [500], x_dim, name="s2x_mlp", **inits) 138 | 139 | draw = DriftDiffModel( 140 | n_iter, 141 | mix_enc_mlp=mix_enc_mlp, 142 | mix_dec_mlp=mix_dec_mlp, 143 | enc_mlp_in=enc_mlp_in, 144 | enc_mlp_out=enc_mlp_out, 145 | enc_rnn=enc_rnn, 146 | dec_mlp_in=dec_mlp_in, 147 | dec_mlp_out=dec_mlp_out, 148 | dec_rnn=dec_rnn, 149 | s2x_mlp=s2x_mlp) 150 | draw.initialize() 151 | 152 | # build the cost gradients, training function, samplers, etc. 153 | draw.build_model_funcs() 154 | 155 | #draw.load_model_params(f_name="TBDDM_GEN_PARAMS.pkl") 156 | 157 | ################################################################ 158 | # Apply some updates, to check that they aren't totally broken # 159 | ################################################################ 160 | print("Beginning to train the model...") 161 | out_file = open("TBDDM_GEN_RESULTS.txt", 'wb') 162 | costs = [0. 
for i in range(10)] 163 | learn_rate = 0.0002 164 | momentum = 0.5 165 | batch_idx = np.arange(batch_size) + tr_samples 166 | for i in range(250000): 167 | scale = min(1.0, ((i+1) / 1000.0)) 168 | if (((i + 1) % 10000) == 0): 169 | learn_rate = learn_rate * 0.95 170 | if (i > 10000): 171 | momentum = 0.90 172 | else: 173 | momentum = 0.50 174 | # get the indices of training samples for this batch update 175 | batch_idx += batch_size 176 | if (np.max(batch_idx) >= tr_samples): 177 | # we finished an "epoch", so we rejumble the training set 178 | Xtr = row_shuffle(Xtr) 179 | batch_idx = np.arange(batch_size) 180 | 181 | # set sgd and objective function hyperparams for this update 182 | zero_ary = np.zeros((1,)) 183 | draw.lr.set_value(to_fX(zero_ary + learn_rate)) 184 | draw.mom_1.set_value(to_fX(zero_ary + momentum)) 185 | draw.mom_2.set_value(to_fX(zero_ary + 0.99)) 186 | 187 | # perform a minibatch update and record the cost for this batch 188 | Xb = to_fX(Xtr.take(batch_idx, axis=0)) 189 | result = draw.train_joint(Xb, Xb) 190 | costs = [(costs[j] + result[j]) for j in range(len(result))] 191 | 192 | # diagnostics 193 | if ((i % 250) == 0): 194 | costs = [(v / 250.0) for v in costs] 195 | str1 = "-- batch {0:d} --".format(i) 196 | str2 = " total_cost: {0:.4f}".format(costs[0]) 197 | str3 = " nll_bound : {0:.4f}".format(costs[1]) 198 | str4 = " nll_term : {0:.4f}".format(costs[2]) 199 | str5 = " kld_q2p : {0:.4f}".format(costs[3]) 200 | str6 = " kld_p2q : {0:.4f}".format(costs[4]) 201 | str7 = " reg_term : {0:.4f}".format(costs[5]) 202 | joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7]) 203 | print(joint_str) 204 | out_file.write(joint_str+"\n") 205 | out_file.flush() 206 | costs = [0.0 for v in costs] 207 | if ((i % 500) == 0): 208 | draw.save_model_params("TBDDM_GEN_PARAMS.pkl") 209 | # compute a small-sample estimate of NLL bound on validation set 210 | Xva = row_shuffle(Xva) 211 | Xb = to_fX(Xva[:5000]) 212 | va_costs = draw.compute_nll_bound(Xb, Xb) 213 | str1 = " va_nll_bound : {}".format(va_costs[1]) 214 | str2 = " va_nll_term : {}".format(va_costs[2]) 215 | str3 = " va_kld_q2p : {}".format(va_costs[3]) 216 | joint_str = "\n".join([str1, str2, str3]) 217 | print(joint_str) 218 | out_file.write(joint_str+"\n") 219 | out_file.flush() 220 | # draw some independent samples from the model 221 | samples = draw.do_sample(16*16) 222 | n_iter, N, D = samples.shape 223 | samples = samples.reshape( (n_iter, N, 28, 28) ) 224 | for j in xrange(n_iter): 225 | img = img_grid(samples[j,:,:,:]) 226 | img.save("TBDDM-gen-samples-%03d.png" % (j,)) 227 | 228 | if __name__=="__main__": 229 | test_ddm_generation() -------------------------------------------------------------------------------- /generative_models/TestClassModel.py: -------------------------------------------------------------------------------- 1 | ################################################################## 2 | # Code for testing the variational Multi-Stage Generative Model. 
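
The update loop in TestBlocksDDModels.py above (and the ClassModel test that follows) steps a contiguous index window across the training matrix and reshuffles the rows whenever the window would run past the end, so every example is visited once per pass. A self-contained numpy sketch of that pattern, with an in-place shuffle standing in for the repository's `row_shuffle` helper:

```python
import numpy as np

rng = np.random.RandomState(1234)
Xtr = rng.rand(1000, 784)                 # stand-in training matrix
batch_size = 250
tr_samples = Xtr.shape[0]

batch_idx = np.arange(batch_size) + tr_samples   # forces a shuffle on the first update
for i in range(10):
    batch_idx += batch_size
    if np.max(batch_idx) >= tr_samples:
        # finished an "epoch": reshuffle the rows and restart the window
        rng.shuffle(Xtr)
        batch_idx = np.arange(batch_size)
    Xb = Xtr.take(batch_idx, axis=0)
    assert Xb.shape == (batch_size, Xtr.shape[1])
```
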
# 3 | ################################################################## 4 | 5 | # basic python 6 | import numpy as np 7 | import numpy.random as npr 8 | 9 | # theano business 10 | import theano 11 | import theano.tensor as T 12 | 13 | # phil's sweetness 14 | from LogPDFs import log_prob_bernoulli, log_prob_gaussian2, gaussian_kld 15 | from NetLayers import relu_actfun, softplus_actfun, tanh_actfun, \ 16 | apply_mask, binarize_data, row_shuffle, to_fX 17 | from InfNet import InfNet 18 | from ClassModel import ClassModel 19 | from load_data import load_udm, load_udm_ss, load_mnist, load_binarized_mnist 20 | from HelperFuncs import collect_obs_costs 21 | import utils 22 | 23 | ######################################## 24 | ######################################## 25 | ## TEST WITH MODEL-BASED INITIAL STEP ## 26 | ######################################## 27 | ######################################## 28 | 29 | def test_with_model_init(): 30 | ########################## 31 | # Get some training data # 32 | ########################## 33 | rng = np.random.RandomState(1234) 34 | dataset = 'data/mnist.pkl.gz' 35 | datasets = load_udm(dataset, as_shared=False, zero_mean=False) 36 | Xtr = to_fX(datasets[0][0]) 37 | Xva = to_fX(datasets[1][0]) 38 | Ytr = datasets[0][1] 39 | Yva = datasets[1][1] 40 | 41 | tr_samples = Xtr.shape[0] 42 | va_samples = Xva.shape[0] 43 | batch_size = 200 44 | 45 | BD = lambda ary: binarize_data(ary) 46 | 47 | ####################################### 48 | # Setup some parameters for the model # 49 | ####################################### 50 | obs_dim = Xtr.shape[1] 51 | z_dim = 64 52 | init_scale = 0.2 53 | 54 | # some InfNet instances to build the TwoStageModel from 55 | x_in = T.matrix('x_in') 56 | y_in = T.lvector('y_in') 57 | 58 | ############### 59 | # q_z_given_x # 60 | ############### 61 | print("Building q_z_given_x...") 62 | params = {} 63 | shared_config = [obs_dim, 1000, 1000] 64 | top_config = [shared_config[-1], z_dim] 65 | params['shared_config'] = shared_config 66 | params['mu_config'] = top_config 67 | params['sigma_config'] = top_config 68 | params['activation'] = relu_actfun 69 | params['init_scale'] = init_scale 70 | params['lam_l2a'] = 0.0 71 | params['vis_drop'] = 0.2 72 | params['hid_drop'] = 0.5 73 | params['bias_noise'] = 0.0 74 | params['input_noise'] = 0.0 75 | params['build_theano_funcs'] = False 76 | q_z_given_x = InfNet(rng=rng, Xd=x_in, \ 77 | params=params, shared_param_dicts=None) 78 | q_z_given_x.init_biases(0.2) 79 | 80 | 81 | ########################################################### 82 | # Define parameters for the ClassModel, and initialize it # 83 | ########################################################### 84 | print("Building the ClassModel...") 85 | CM = ClassModel(rng=rng, \ 86 | x_in=x_in, y_in=y_in, \ 87 | q_z_given_x=q_z_given_x, \ 88 | class_count=10, \ 89 | z_dim=z_dim, \ 90 | use_samples=False) 91 | CM.set_drop_rate(0.5) 92 | CM.set_lam_nll(lam_nll=1.0) 93 | CM.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0) 94 | CM.set_lam_l2w(lam_l2w=1e-5) 95 | 96 | ################################################################ 97 | # Apply some updates, to check that they aren't totally broken # 98 | ################################################################ 99 | out_file = open("CM_RESULTS.txt", 'wb') 100 | costs = [0. 
for i in range(10)] 101 | learn_rate = 0.0002 102 | momentum = 0.9 103 | batch_idx = np.arange(batch_size) + tr_samples 104 | for i in range(250000): 105 | scale = min(1.0, ((i+1) / 1000.0)) 106 | if (((i + 1) % 10000) == 0): 107 | learn_rate = learn_rate * 0.95 108 | # get the indices of training samples for this batch update 109 | batch_idx += batch_size 110 | if (np.max(batch_idx) >= tr_samples): 111 | # we finished an "epoch", so we rejumble the training set 112 | Xtr, Ytr = row_shuffle(Xtr, Ytr) 113 | batch_idx = np.arange(batch_size) 114 | # set sgd and objective function hyperparams for this update 115 | CM.set_sgd_params(lr_1=scale*learn_rate, lr_2=scale*learn_rate, \ 116 | mom_1=scale*momentum, mom_2=0.99) 117 | # perform a minibatch update and record the cost for this batch 118 | Xi_tr = Xtr.take(batch_idx, axis=0) 119 | Yi_tr = Ytr.take(batch_idx, axis=0) 120 | result = CM.train_joint(Xi_tr, Yi_tr) 121 | costs = [(costs[j] + result[j]) for j in range(len(result)-1)] 122 | # output useful information about training progress 123 | if ((i % 500) == 0): 124 | costs = [(v / 500.0) for v in costs] 125 | str1 = "-- batch {0:d} --".format(i) 126 | str2 = " joint_cost : {0:.4f}".format(costs[0]) 127 | str3 = " nll_cost : {0:.4f}".format(costs[1]) 128 | str4 = " kld_cost : {0:.4f}".format(costs[2]) 129 | str5 = " reg_cost : {0:.4f}".format(costs[3]) 130 | joint_str = "\n".join([str1, str2, str3, str4, str5]) 131 | print(joint_str) 132 | out_file.write(joint_str+"\n") 133 | out_file.flush() 134 | costs = [0.0 for v in costs] 135 | if (((i % 2000) == 0) or ((i < 10000) and ((i % 1000) == 0))): 136 | ##################################################### 137 | # compute multi-sample estimates of the free-energy # 138 | ##################################################### 139 | # training set... 140 | fe_terms = CM.compute_fe_terms(Xtr[0:2500],Ytr[0:2500], 30) 141 | fe_nll = np.mean(fe_terms[0]) 142 | fe_kld = np.mean(fe_terms[1]) 143 | fe_joint = fe_nll + fe_kld 144 | joint_str = " vfe-tr: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ 145 | fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) 146 | print(joint_str) 147 | out_file.write(joint_str+"\n") 148 | out_file.flush() 149 | # validation set... 150 | Xva, Yva = row_shuffle(Xva, Yva) 151 | fe_terms = CM.compute_fe_terms(Xva[0:2500], Yva[0:2500], 30) 152 | fe_nll = np.mean(fe_terms[0]) 153 | fe_kld = np.mean(fe_terms[1]) 154 | fe_joint = fe_nll + fe_kld 155 | joint_str = " vfe-va: {0:.4f}, nll: ({1:.4f}, {2:.4f}, {3:.4f}), kld: ({4:.4f}, {5:.4f}, {6:.4f})".format( \ 156 | fe_joint, fe_nll, np.min(fe_terms[0]), np.max(fe_terms[0]), fe_kld, np.min(fe_terms[1]), np.max(fe_terms[1])) 157 | print(joint_str) 158 | out_file.write(joint_str+"\n") 159 | out_file.flush() 160 | ########################################################## 161 | # compute multi-sample estimates of classification error # 162 | ########################################################## 163 | # training set... 164 | va_error, va_preds = CM.class_error(Xtr[:2500], Ytr[:2500], samples=30) 165 | joint_str = " tr-class-error: {0:.4f}".format(va_error) 166 | print(joint_str) 167 | out_file.write(joint_str+"\n") 168 | out_file.flush() 169 | # validation set... 
170 | va_error, va_preds = CM.class_error(Xva[:2500], Yva[:2500], samples=30) 171 | joint_str = " va-class-error: {0:.4f}".format(va_error) 172 | print(joint_str) 173 | out_file.write(joint_str+"\n") 174 | out_file.flush() 175 | 176 | if __name__=="__main__": 177 | test_with_model_init() -------------------------------------------------------------------------------- /generative_models/TestImpTM.py: -------------------------------------------------------------------------------- 1 | ################################################################## 2 | # Code for testing the variational Multi-Stage Generative Model. # 3 | ################################################################## 4 | 5 | # basic python 6 | import numpy as np 7 | import numpy.random as npr 8 | import cPickle 9 | 10 | # theano business 11 | import theano 12 | import theano.tensor as T 13 | 14 | # phil's sweetness 15 | import utils 16 | from GPSImputer import TemplateMatchImputer 17 | from load_data import load_udm, load_mnist, load_tfd, load_svhn_gray 18 | from HelperFuncs import construct_masked_data, shift_and_scale_into_01, \ 19 | row_shuffle, to_fX 20 | 21 | RESULT_PATH = "IMP_MNIST_TM/" 22 | 23 | ############################### 24 | ############################### 25 | ## TEST GPS IMPUTER ON MNIST ## 26 | ############################### 27 | ############################### 28 | 29 | def test_mnist_nll(occ_dim=15, drop_prob=0.0): 30 | ######################################### 31 | # Format the result tag more thoroughly # 32 | ######################################### 33 | dp_int = int(100.0 * drop_prob) 34 | result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int) 35 | 36 | ########################## 37 | # Get some training data # 38 | ########################## 39 | rng = np.random.RandomState(1234) 40 | dataset = 'data/mnist.pkl.gz' 41 | datasets = load_udm(dataset, as_shared=False, zero_mean=False) 42 | Xtr = datasets[0][0] 43 | Xva = datasets[1][0] 44 | Xtr = to_fX(shift_and_scale_into_01(Xtr)) 45 | Xva = to_fX(shift_and_scale_into_01(Xva)) 46 | tr_samples = Xtr.shape[0] 47 | va_samples = Xva.shape[0] 48 | batch_size = 200 49 | batch_reps = 1 50 | all_pix_mean = np.mean(np.mean(Xtr, axis=1)) 51 | data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],))) 52 | 53 | TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli') 54 | 55 | log_name = "{}_RESULTS.txt".format(result_tag) 56 | out_file = open(log_name, 'wb') 57 | 58 | Xva = row_shuffle(Xva) 59 | # record an estimate of performance on the test set 60 | xi, xo, xm = construct_masked_data(Xva, drop_prob=drop_prob, \ 61 | occ_dim=occ_dim, data_mean=data_mean) 62 | result = TM.best_match_nll(xo, xm) 63 | match_on_known = np.mean(result[0]) 64 | match_on_unknown = np.mean(result[1]) 65 | str0 = "Test 1:" 66 | str1 = " match on known : {}".format(match_on_known) 67 | str2 = " match on unknown : {}".format(match_on_unknown) 68 | joint_str = "\n".join([str0, str1, str2]) 69 | print(joint_str) 70 | out_file.write(joint_str+"\n") 71 | out_file.flush() 72 | out_file.close() 73 | return 74 | 75 | def test_mnist_img(occ_dim=15, drop_prob=0.0): 76 | ######################################### 77 | # Format the result tag more thoroughly # 78 | ######################################### 79 | dp_int = int(100.0 * drop_prob) 80 | result_tag = RESULT_PATH + "TM_OD{}_DP{}".format(occ_dim, dp_int) 81 | 82 | ########################## 83 | # Get some training data # 84 | ########################## 85 | rng = np.random.RandomState(1234) 86 | dataset = 
'data/mnist.pkl.gz' 87 | datasets = load_udm(dataset, as_shared=False, zero_mean=False) 88 | Xtr = datasets[0][0] 89 | Xva = datasets[1][0] 90 | Xtr = to_fX(shift_and_scale_into_01(Xtr)) 91 | Xva = to_fX(shift_and_scale_into_01(Xva)) 92 | tr_samples = Xtr.shape[0] 93 | va_samples = Xva.shape[0] 94 | batch_size = 200 95 | batch_reps = 1 96 | all_pix_mean = np.mean(np.mean(Xtr, axis=1)) 97 | data_mean = to_fX(all_pix_mean * np.ones((Xtr.shape[1],))) 98 | 99 | TM = TemplateMatchImputer(x_train=Xtr, x_type='bernoulli') 100 | 101 | Xva = row_shuffle(Xva) 102 | # record an estimate of performance on the test set 103 | xi, xo, xm = construct_masked_data(Xva[:500], drop_prob=drop_prob, \ 104 | occ_dim=occ_dim, data_mean=data_mean) 105 | img_match_on_known, img_match_on_unknown = TM.best_match_img(xo, xm) 106 | 107 | display_count = 100 108 | # visualize matches on known elements 109 | Xs = np.zeros((2*display_count, Xva.shape[1])) 110 | for idx in range(display_count): 111 | Xs[2*idx] = xi[idx] 112 | Xs[(2*idx)+1] = img_match_on_known[idx] 113 | file_name = "{0:s}_SAMPLES_MOK.png".format(result_tag) 114 | utils.visualize_samples(Xs, file_name, num_rows=20) 115 | # visualize matches on unknown elements 116 | Xs = np.zeros((2*display_count, Xva.shape[1])) 117 | for idx in range(display_count): 118 | Xs[2*idx] = xi[idx] 119 | Xs[(2*idx)+1] = img_match_on_unknown[idx] 120 | file_name = "{0:s}_SAMPLES_MOU.png".format(result_tag) 121 | utils.visualize_samples(Xs, file_name, num_rows=20) 122 | return 123 | 124 | 125 | if __name__=="__main__": 126 | ######### 127 | # MNIST # 128 | ######### 129 | # test_mnist_nll(occ_dim=0, drop_prob=0.6) 130 | # test_mnist_nll(occ_dim=0, drop_prob=0.7) 131 | # test_mnist_nll(occ_dim=0, drop_prob=0.8) 132 | # test_mnist_nll(occ_dim=0, drop_prob=0.9) 133 | # test_mnist_nll(occ_dim=14, drop_prob=0.0) 134 | # test_mnist_nll(occ_dim=16, drop_prob=0.0) 135 | test_mnist_img(occ_dim=0, drop_prob=0.6) 136 | test_mnist_img(occ_dim=0, drop_prob=0.7) 137 | test_mnist_img(occ_dim=0, drop_prob=0.8) 138 | test_mnist_img(occ_dim=0, drop_prob=0.9) 139 | test_mnist_img(occ_dim=14, drop_prob=0.0) 140 | test_mnist_img(occ_dim=16, drop_prob=0.0) 141 | -------------------------------------------------------------------------------- /generative_models/blocks_models/binarized_mnist_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import fuel 4 | import h5py 5 | import numpy 6 | 7 | default_directory = os.path.join(fuel.config.data_path, 'binarized_mnist') 8 | default_save_path = os.path.join(default_directory, 'binarized_mnist.hdf5') 9 | 10 | 11 | def binarized_mnist(directory=None, save_path=None): 12 | """Converts the binarized MNIST dataset to HDF5. 13 | Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN 14 | paper [DBN] to an HDF5 dataset compatible with 15 | :class:`fuel.datasets.BinarizedMNIST`. 16 | This method assumes the existence of the files 17 | `binarized_mnist_{train,valid,test}.amat`, which are accessible 18 | through Hugo Larochelle's website [HUGO]. 19 | .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative 20 | Analysis of Deep Belief Networks*, Proceedings of the 25th 21 | international conference on Machine learning, 2008, pp. 872-879. 22 | .. 
[HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/ 23 | binarized_mnist/binarized_mnist_{train,valid,test}.amat 24 | Parameters 25 | ---------- 26 | directory : str, optional 27 | Base directory in which the required input files reside. Defaults 28 | to `None`, in which case `'$FUEL_DATA_PATH/binarized_mnist'` is 29 | used. 30 | save_path : str, optional 31 | Where to save the converted dataset. Defaults to `None`, in which 32 | case `'$FUEL_DATA_PATH/binarized_mnist/binarized_mnist.hdf5'` is 33 | used. 34 | """ 35 | if directory is None: 36 | directory = default_directory 37 | if save_path is None: 38 | save_path = default_save_path 39 | 40 | train_set = numpy.loadtxt( 41 | os.path.join(directory, 'binarized_mnist_train.amat')) 42 | valid_set = numpy.loadtxt( 43 | os.path.join(directory, 'binarized_mnist_valid.amat')) 44 | test_set = numpy.loadtxt( 45 | os.path.join(directory, 'binarized_mnist_test.amat')) 46 | 47 | f = h5py.File(save_path, mode="w") 48 | 49 | features = f.create_dataset('features', (70000, 1, 28, 28), dtype='uint8') 50 | features[...] = numpy.vstack([train_set.reshape((-1, 1, 28, 28)), 51 | valid_set.reshape((-1, 1, 28, 28)), 52 | test_set.reshape((-1, 1, 28, 28))]) 53 | f.attrs['train'] = [0, 50000] 54 | f.attrs['valid'] = [50000, 60000] 55 | f.attrs['test'] = [60000, 70000] 56 | 57 | f.flush() 58 | f.close() -------------------------------------------------------------------------------- /generative_models/blocks_models/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Philip-Bachman/NN-Python/e9a7619806c5ccbe2bd648b2a2e0af7967dc6996/generative_models/blocks_models/lib/__init__.py -------------------------------------------------------------------------------- /generative_models/blocks_models/lib/myutils.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | from abc import ABCMeta, abstractmethod 5 | 6 | import ipdb 7 | import numpy 8 | import six 9 | import theano 10 | 11 | from collections import OrderedDict 12 | 13 | from theano import tensor 14 | from blocks.initialization import NdarrayInitialization, Uniform 15 | 16 | 17 | def merge_gradients(*gradient_list): 18 | """Take and merge multiple ordered dicts 19 | """ 20 | merged = OrderedDict() 21 | for gradients in gradient_list: 22 | assert isinstance(gradients, (dict, OrderedDict)) 23 | for key, val in gradients.items(): 24 | if merged.has_key(key): 25 | merged[key] = merged[key] + val 26 | else: 27 | merged[key] = val 28 | return merged 29 | 30 | #----------------------------------------------------------------------------- 31 | 32 | 33 | class ShapeDependentInitialization(NdarrayInitialization): 34 | """Initialize 35 | 36 | Parameters 37 | ---------- 38 | weights_init : :class:`NdarrayInitialization` instance 39 | The unscaled initialization scheme to initialize the weights with. 40 | """ 41 | def __init__(self, weights_init): 42 | super(ShapeDependentInitialization, self).__init__() 43 | self.weights_init = weights_init 44 | 45 | def generate(self, rng, shape): 46 | weights = self.weights_init.generate(rng, shape) 47 | scale = self.scale_func(*shape) 48 | return scale*weights 49 | 50 | # TODO: Abstract 51 | def scale_func(self, *shape): 52 | pass 53 | 54 | 55 | class TanhInitialization(ShapeDependentInitialization): 56 | """Normalized initialization for tanh MLPs. 
57 | 58 | This class initializes parameters by drawing from the uniform 59 | distribution with the interval 60 | 61 | [- sqrt(6)/sqrt(dim_in+dim_out) .. sqrt(6)/sqrt(dim_in+dim_out)] 62 | """ 63 | def __init__(self): 64 | super(TanhInitialization, self).__init__(Uniform(mean=0., width=2.)) 65 | 66 | def scale_func(self, dim_in, dim_out): 67 | return numpy.sqrt(6)/numpy.sqrt(dim_in+dim_out) 68 | -------------------------------------------------------------------------------- /generative_models/blocks_models/plot-log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division, print_function 4 | 5 | import logging 6 | import argparse 7 | import numpy as np 8 | import pylab 9 | import matplotlib as mpl 10 | import matplotlib.pyplot as plt 11 | import cPickle as pickle 12 | 13 | from mpl_toolkits.mplot3d import Axes3D 14 | 15 | from blocks.main_loop import MainLoop 16 | from blocks.log import TrainingLog 17 | 18 | FORMAT = '[%(asctime)s] %(name)-15s %(message)s' 19 | DATEFMT = "%H:%M:%S" 20 | logging.basicConfig(format=FORMAT, datefmt=DATEFMT, level=logging.INFO) 21 | 22 | 23 | if __name__ == "__main__": 24 | from argparse import ArgumentParser 25 | 26 | parser = ArgumentParser() 27 | parser.add_argument("model_file", help="filename the log to plot from") 28 | args = parser.parse_args() 29 | 30 | logging.info("Loading file %s..." % args.model_file) 31 | with open(args.model_file, "rb") as f: 32 | p = pickle.load(f) 33 | 34 | if isinstance(p, MainLoop): 35 | print("GOOD LUCK, BUT PLEASE USE A LOG!") 36 | assert(False) 37 | elif isinstance(p, TrainingLog): 38 | log = p 39 | 40 | plot_tag = args.model_file[0:-8] 41 | df = log.to_dataframe() 42 | df_keys = df.keys() 43 | 44 | ################################ 45 | # PLOT VARIATIONAL FREE-ENERGY # 46 | ################################ 47 | nll_bound_types = [k for k in df_keys if (k.find('nll_bound') > -1)] 48 | nll_bound_idx = df[nll_bound_types[0]].keys()[1:-5] 49 | #nll_bound_idx = [i for i in nll_bound_idx if i < 40000] 50 | nll_bound_map = {} 51 | for k in nll_bound_types: 52 | idx = np.asarray(nll_bound_idx) 53 | vals = np.asarray(df[k][nll_bound_idx]) 54 | nll_bound_map[k] = [idx, vals] 55 | 56 | nll_plot_name = "NLL_BOUNDS_{}.png".format(plot_tag) 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111) 59 | ax.hold(True) 60 | min_map = {} 61 | for k, v in nll_bound_map.items(): 62 | x, y = v 63 | y_min = np.min(y) 64 | ax.plot(x, y, label=k) 65 | ax.plot(x, ((0.0*y) + y_min), label="min({0:s})={1:.4f}".format(k,y_min)) 66 | ax.legend() 67 | fig.savefig(nll_plot_name, dpi=None, facecolor='w', edgecolor='w', \ 68 | orientation='portrait', papertype=None, format=None, \ 69 | transparent=False, bbox_inches=None, pad_inches=0.1, \ 70 | frameon=None) 71 | plt.close(fig) 72 | ##################### 73 | # PLOT PER-STEP KLD # 74 | ##################### 75 | valid_kl_keys = [k for k in df_keys if (k.find('valid_kl_') > -1)] 76 | valid_kl_idx = df[valid_kl_keys[0]].keys()[1:-5] 77 | valid_kl_map = {} 78 | for k in valid_kl_keys: 79 | idx = np.asarray(valid_kl_idx) 80 | vals = np.asarray(df[k][valid_kl_idx]) 81 | valid_kl_map[k] = [idx, vals] 82 | 83 | kl_plot_name = "KL_TERMS_{}.png".format(plot_tag) 84 | fig = plt.figure() 85 | ax = fig.add_subplot(111) 86 | ax.hold(True) 87 | for k, v in valid_kl_map.items(): 88 | x, y = v 89 | ax.plot(x, y, label=k) 90 | ax.legend() 91 | fig.savefig(kl_plot_name, dpi=None, facecolor='w', edgecolor='w', \ 92 | 
orientation='portrait', papertype=None, format=None, \ 93 | transparent=False, bbox_inches=None, pad_inches=0.1, \ 94 | frameon=None) 95 | plt.close(fig) 96 | -------------------------------------------------------------------------------- /generative_models/blocks_models/run-att-rw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import division, print_function 4 | 5 | import logging 6 | 7 | FORMAT = '[%(asctime)s] %(name)-15s %(message)s' 8 | DATEFMT = "%H:%M:%S" 9 | logging.basicConfig(format=FORMAT, datefmt=DATEFMT, level=logging.INFO) 10 | 11 | import theano 12 | import theano.tensor as T 13 | import ipdb 14 | import fuel 15 | 16 | from argparse import ArgumentParser 17 | from collections import OrderedDict 18 | from theano import tensor 19 | 20 | from fuel.streams import DataStream, ForceFloatX 21 | from fuel.schemes import SequentialScheme 22 | from fuel.datasets.binarized_mnist import BinarizedMNIST 23 | 24 | from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, RMSProp, Adam, RemoveNotFinite 25 | from blocks.initialization import Constant, IsotropicGaussian, Orthogonal 26 | from blocks.filter import VariableFilter 27 | from blocks.graph import ComputationGraph 28 | from blocks.roles import WEIGHTS, BIASES, PARAMETER 29 | from blocks.model import Model 30 | from blocks.monitoring import aggregation 31 | from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar 32 | from blocks.extensions.plot import Plot 33 | from blocks.extensions.saveload import SerializeMainLoop 34 | from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring 35 | from blocks.main_loop import MainLoop 36 | 37 | from blocks.bricks import Tanh, MLP 38 | from blocks.bricks.cost import BinaryCrossEntropy 39 | from blocks.bricks.recurrent import SimpleRecurrent, LSTM 40 | 41 | from models import * 42 | from attention import ZoomableAttentionWindow 43 | 44 | fuel.config.floatX = theano.config.floatX 45 | 46 | 47 | #---------------------------------------------------------------------------- 48 | def main(name, epochs, batch_size, learning_rate): 49 | if name is None: 50 | name = "att-rw" 51 | 52 | print("\nRunning experiment %s" % name) 53 | print(" learning rate: %5.3f" % learning_rate) 54 | print() 55 | 56 | 57 | #------------------------------------------------------------------------ 58 | 59 | img_height, img_width = 28, 28 60 | 61 | read_N = 12 62 | write_N = 14 63 | 64 | inits = { 65 | #'weights_init': Orthogonal(), 66 | 'weights_init': IsotropicGaussian(0.001), 67 | 'biases_init': Constant(0.), 68 | } 69 | 70 | x_dim = img_height * img_width 71 | 72 | reader = ZoomableAttentionWindow(img_height, img_width, read_N) 73 | writer = ZoomableAttentionWindow(img_height, img_width, write_N) 74 | 75 | # Parameterize the attention reader and writer 76 | mlpr = MLP(activations=[Tanh(), Identity()], 77 | dims=[x_dim, 50, 5], 78 | name="RMLP", 79 | **inits) 80 | mlpw = MLP(activations=[Tanh(), Identity()], 81 | dims=[x_dim, 50, 5], 82 | name="WMLP", 83 | **inits) 84 | 85 | # MLP between the reader and writer 86 | mlp = MLP(activations=[Tanh(), Identity()], 87 | dims=[read_N**2, 300, write_N**2], 88 | name="MLP", 89 | **inits) 90 | 91 | for brick in [mlpr, mlpw, mlp]: 92 | brick.allocate() 93 | brick.initialize() 94 | 95 | #------------------------------------------------------------------------ 96 | x = tensor.matrix('features') 97 | 98 | hr = mlpr.apply(x) 99 | hw = 
mlpw.apply(x) 100 | 101 | center_y, center_x, delta, sigma, gamma = reader.nn2att(hr) 102 | r = reader.read(x, center_y, center_x, delta, sigma) 103 | 104 | h = mlp.apply(r) 105 | 106 | center_y, center_x, delta, sigma, gamma = writer.nn2att(hw) 107 | c = writer.write(h, center_y, center_x, delta, sigma) / gamma 108 | x_recons = T.nnet.sigmoid(c) 109 | 110 | cost = BinaryCrossEntropy().apply(x, x_recons) 111 | cost.name = "cost" 112 | 113 | #------------------------------------------------------------ 114 | cg = ComputationGraph([cost]) 115 | params = VariableFilter(roles=[PARAMETER])(cg.variables) 116 | 117 | algorithm = GradientDescent( 118 | cost=cost, 119 | params=params, 120 | step_rule=CompositeRule([ 121 | RemoveNotFinite(), 122 | Adam(learning_rate), 123 | StepClipping(3.), 124 | ]) 125 | #step_rule=RMSProp(learning_rate), 126 | #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) 127 | ) 128 | 129 | 130 | #------------------------------------------------------------------------ 131 | # Setup monitors 132 | monitors = [cost] 133 | #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]: 134 | # v_mean = v.mean() 135 | # v_mean.name = v.name 136 | # monitors += [v_mean] 137 | # monitors += [aggregation.mean(v)] 138 | 139 | train_monitors = monitors[:] 140 | train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] 141 | train_monitors += [aggregation.mean(algorithm.total_step_norm)] 142 | 143 | # Live plotting... 144 | plot_channels = [ 145 | ["cost"], 146 | ] 147 | 148 | #------------------------------------------------------------ 149 | 150 | mnist_train = BinarizedMNIST("train", sources=['features']) 151 | mnist_test = BinarizedMNIST("test", sources=['features']) 152 | #mnist_train = MNIST("train", binary=True, sources=['features']) 153 | #mnist_test = MNIST("test", binary=True, sources=['features']) 154 | 155 | main_loop = MainLoop( 156 | model=Model(cost), 157 | data_stream=ForceFloatX(DataStream(mnist_train, 158 | iteration_scheme=SequentialScheme( 159 | mnist_train.num_examples, batch_size))), 160 | algorithm=algorithm, 161 | extensions=[ 162 | Timing(), 163 | FinishAfter(after_n_epochs=epochs), 164 | DataStreamMonitoring( 165 | monitors, 166 | ForceFloatX(DataStream(mnist_test, 167 | iteration_scheme=SequentialScheme( 168 | mnist_test.num_examples, batch_size))), 169 | prefix="test"), 170 | TrainingDataMonitoring( 171 | train_monitors, 172 | prefix="train", 173 | after_every_epoch=True), 174 | SerializeMainLoop(name+".pkl"), 175 | #Plot(name, channels=plot_channels), 176 | ProgressBar(), 177 | Printing()]) 178 | main_loop.run() 179 | 180 | #----------------------------------------------------------------------------- 181 | 182 | if __name__ == "__main__": 183 | parser = ArgumentParser() 184 | parser.add_argument("--name", type=str, dest="name", 185 | default=None, help="Name for this experiment") 186 | parser.add_argument("--epochs", type=int, dest="epochs", 187 | default=25, help="Number of training epochs to do") 188 | parser.add_argument("--bs", "--batch-size", type=int, dest="batch_size", 189 | default=100, help="Size of each mini-batch") 190 | parser.add_argument("--lr", "--learning-rate", type=float, dest="learning_rate", 191 | default=1e-3, help="Learning rate") 192 | args = parser.parse_args() 193 | 194 | main(**vars(args)) 195 | 196 | -------------------------------------------------------------------------------- /generative_models/blocks_models/sample.py: -------------------------------------------------------------------------------- 
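
sample.py below reuses the same `scale_norm` / `img_grid` helpers as TestBlocksDDModels.py to tile model samples into a single PNG. One thing to watch: `scale_norm` returns `scale * arr` after subtracting the minimum, which rescales by the square of the value range instead of mapping into [0, 1], so low-contrast grids come out dark and ranges above one can wrap around in the `(255 * I).astype(np.uint8)` conversion. A sketch of the divide-by-range variant, for comparison (`scale_to_unit` is an illustrative name, not the repository's function):

```python
import numpy as np

def scale_to_unit(arr):
    """Shift and scale arr into [0, 1] by its own range (no scaling is
    applied when the array is constant)."""
    arr = arr - arr.min()
    scale = arr.max()
    if scale > 0:
        arr = arr / scale
    return arr

grid = np.random.rand(16, 28, 28) * 7.0 - 3.0    # values well outside [0, 1]
unit = scale_to_unit(grid)
assert unit.min() == 0.0 and unit.max() == 1.0
img_bytes = (255 * unit).astype(np.uint8)        # stays within uint8 range
```
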
1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function, division 4 | 5 | import logging 6 | import theano 7 | import theano.tensor as T 8 | import cPickle as pickle 9 | 10 | import numpy as np 11 | 12 | 13 | from PIL import Image 14 | from blocks.main_loop import MainLoop 15 | from blocks.model import AbstractModel 16 | from blocks import config 17 | 18 | FORMAT = '[%(asctime)s] %(name)-15s %(message)s' 19 | DATEFMT = "%H:%M:%S" 20 | logging.basicConfig(format=FORMAT, datefmt=DATEFMT, level=logging.INFO) 21 | 22 | def scale_norm(arr): 23 | arr = arr - arr.min() 24 | scale = (arr.max() - arr.min()) 25 | return scale * arr 26 | 27 | def img_grid(arr, global_scale=True): 28 | N, height, width = arr.shape 29 | 30 | rows = int(np.sqrt(N)) 31 | cols = int(np.sqrt(N)) 32 | 33 | if rows*cols < N: 34 | cols = cols + 1 35 | 36 | if rows*cols < N: 37 | rows = rows + 1 38 | 39 | total_height = rows * height 40 | total_width = cols * width 41 | 42 | if global_scale: 43 | arr = scale_norm(arr) 44 | 45 | I = np.zeros((total_height, total_width)) 46 | 47 | for i in xrange(N): 48 | r = i // cols 49 | c = i % cols 50 | 51 | if global_scale: 52 | this = arr[i] 53 | else: 54 | this = scale_norm(arr[i]) 55 | 56 | offset_y, offset_x = r*height, c*width 57 | I[offset_y:(offset_y+height), offset_x:(offset_x+width)] = this 58 | 59 | I = (255*I).astype(np.uint8) 60 | return Image.fromarray(I) 61 | 62 | 63 | if __name__ == "__main__": 64 | from argparse import ArgumentParser 65 | 66 | parser = ArgumentParser() 67 | parser.add_argument("model_file", help="filename of a pickled DRAW model") 68 | parser.add_argument("--size", type=int, 69 | default=28, help="Output image size (width and height)") 70 | args = parser.parse_args() 71 | 72 | logging.info("Loading file %s..." 
% args.model_file) 73 | with open(args.model_file, "rb") as f: 74 | p = pickle.load(f) 75 | 76 | if isinstance(p, MainLoop): 77 | model = p.model 78 | elif isinstance(p, AbstractModel): 79 | model = p 80 | else: 81 | print("Don't know how to handle unpickled %s" % type(p)) 82 | exit(1) 83 | 84 | draw = model.get_top_bricks()[0] 85 | # reset the random generator 86 | del draw._theano_rng 87 | del draw._theano_seed 88 | draw.seed_rng = np.random.RandomState(config.default_seed) 89 | 90 | #------------------------------------------------------------ 91 | logging.info("Compiling sample function...") 92 | 93 | n_samples = T.iscalar("n_samples") 94 | samples = draw.sample(n_samples) 95 | 96 | do_sample = theano.function([n_samples], outputs=samples, allow_input_downcast=True) 97 | 98 | #------------------------------------------------------------ 99 | logging.info("Sampling and saving images...") 100 | 101 | samples = do_sample(16*16) 102 | #samples = np.random.normal(size=(16, 100, 28*28)) 103 | 104 | if (len(samples.shape) == 2): 105 | # there was only one iter of sampling, so fake more iters 106 | temp = np.zeros((3, samples.shape[0], samples.shape[1])) 107 | temp[0,:,:] = samples.copy() 108 | temp[1,:,:] = samples.copy() 109 | samples = temp 110 | 111 | n_iter, N, D = samples.shape 112 | 113 | samples = samples.reshape( (n_iter, N, args.size, args.size) ) 114 | 115 | for i in xrange(n_iter): 116 | img = img_grid(samples[i,:,:,:]) 117 | img.save("samples-%03d.png" % i) 118 | 119 | #with open("centers.pkl", "wb") as f: 120 | # pikle.dump(f, (center_y, center_x, delta)) 121 | 122 | 123 | -------------------------------------------------------------------------------- /generative_models/blocks_models/simple_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "RUNNING SCRIPT" 3 | tar -czf imod_bikld_results.tar *.pkl 4 | aws s3 cp imod_bikld_results.tar s3://nipsmodels/imod_bikld_results.tar 5 | echo "FINISHED SCRIPT" 6 | date '+%A %W %Y %X' 7 | -------------------------------------------------------------------------------- /generative_models/output_losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | class LogisticRegression(object): 6 | """Multi-class Logistic Regression loss dangler.""" 7 | 8 | def __init__(self, linear_layer): 9 | """Dangle a logistic regression from the given linear layer. 10 | 11 | The given linear layer should be a HiddenLayer (or subclass) object, 12 | for HiddenLayer as defined in LayerNet.py.""" 13 | self.input_layer = linear_layer 14 | 15 | def loss_func(self, y): 16 | """Return the multiclass logistic regression loss for y. 17 | 18 | The class labels in y are assumed to be in correspondence with the 19 | set of column indices for self.input_layer.linear_output. 20 | """ 21 | p_y_given_x = T.nnet.softmax(self.input_layer.linear_output) 22 | loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]),y]) 23 | return loss 24 | 25 | def errors(self, y): 26 | """Compute the number of wrong predictions by self.input_layer. 27 | 28 | Predicted class labels are computed as the indices of the columns of 29 | self.input_layer.linear_output which are maximal. Wrong predictions are 30 | those for which max indices do not match their corresponding y values. 
31 | """ 32 | # Compute class memberships predicted by self.input_layer 33 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 34 | errs = 0 35 | # check if y has same dimension of y_pred 36 | if y.ndim != y_pred.ndim: 37 | raise TypeError('y should have the same shape as self.y_pred', 38 | ('y', y.type, 'y_pred', y_pred.type)) 39 | # check if y is of the correct datatype 40 | if y.dtype.startswith('int'): 41 | # the T.neq operator returns a vector of 0s and 1s, where 1 42 | # represents a mistake in prediction 43 | errs = T.sum(T.neq(y_pred, y)) 44 | else: 45 | raise NotImplementedError() 46 | return errs 47 | 48 | class LogRegSS(object): 49 | """Multi-class semi-supervised Logistic Regression loss dangler.""" 50 | 51 | def __init__(self, linear_layer): 52 | """Dangle a logistic regression from the given linear layer. 53 | 54 | The given linear layer should be a HiddenLayer (or subclass) object, 55 | for HiddenLayer as defined in LayerNet.py.""" 56 | self.input_layer = linear_layer 57 | 58 | def safe_softmax_ss(self, x): 59 | """Softmax that shouldn't overflow.""" 60 | e_x = T.exp(x - T.max(x, axis=1, keepdims=True)) 61 | x_sm = e_x / T.sum(e_x, axis=1, keepdims=True) 62 | return x_sm 63 | 64 | def loss_func(self, y): 65 | """Return the multiclass logistic regression loss for y. 66 | 67 | The class labels in y are assumed to be in correspondence with the 68 | set of column indices for self.input_layer.linear_output. 69 | """ 70 | row_idx = T.arange(y.shape[0]) 71 | row_mask = T.neq(y, 0).reshape((y.shape[0], 1)) 72 | p_y_given_x = self.safe_softmax_ss(self.input_layer.linear_output) 73 | wacky_mat = (p_y_given_x * row_mask) + (1. - row_mask) 74 | loss = -T.sum(T.log(wacky_mat[row_idx,y])) / T.sum(row_mask) 75 | return loss 76 | 77 | def errors(self, y): 78 | """Compute the number of wrong predictions by self.input_layer. 79 | 80 | Predicted class labels are computed as the indices of the columns of 81 | self.input_layer.linear_output which are maximal. Wrong predictions are 82 | those for which max indices do not match their corresponding y values. 83 | """ 84 | # Compute class memberships predicted by self.input_layer 85 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 86 | y_pred = y_pred + 1 87 | errs = 0 88 | # check if y has same dimension of y_pred 89 | if y.ndim != y_pred.ndim: 90 | raise TypeError('y should have the same shape as self.y_pred', 91 | ('y', y.type, 'y_pred', y_pred.type)) 92 | # check if y is of the correct datatype 93 | if y.dtype.startswith('int'): 94 | # the T.neq operator returns a vector of 0s and 1s, where 1 95 | # represents a mistake in prediction 96 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 97 | else: 98 | raise NotImplementedError() 99 | return errs 100 | 101 | class MCL2Hinge(object): 102 | """Multi-class one-vs-all L2 hinge loss dangler.""" 103 | 104 | def __init__(self, linear_layer): 105 | """Dangle a squred hinge loss from the given linear layer. 106 | 107 | The given linear layer should be a HiddenLayer (or subclass) object, 108 | for HiddenLayer as defined in LayerNet.py.""" 109 | self.input_layer = linear_layer 110 | 111 | def loss_func(self, y): 112 | """Return the multiclass squared hinge loss for y. 113 | 114 | The class labels in y are assumed to be in correspondence with the 115 | set of column indices for self.input_layer.linear_output. 
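
LogRegSS above (and MCL2HingeSS below) squeeze semi-supervised training into a single Theano graph by reserving class index 0 for "unlabeled" rows: those rows are masked so they contribute log(1) = 0 to the loss sum, and the normalizer counts only labeled rows, so only labeled rows carry gradient. A numpy sketch of the same masking (`masked_nll` is an illustrative name):

```python
import numpy as np

def masked_nll(p_y_given_x, y):
    """Mean negative log-likelihood over labeled rows only, where label 0
    means "unlabeled": masked rows add log(1) = 0 to the sum and are
    excluded from the normalizer, mirroring LogRegSS.loss_func."""
    row_idx = np.arange(y.shape[0])
    row_mask = (y != 0).astype(float).reshape((-1, 1))
    masked_p = (p_y_given_x * row_mask) + (1.0 - row_mask)
    return -np.sum(np.log(masked_p[row_idx, y])) / np.sum(row_mask)

# two labeled rows (classes 1 and 2) and one unlabeled row (class 0)
p = np.array([[0.1, 0.8, 0.1],
              [0.2, 0.3, 0.5],
              [0.4, 0.4, 0.2]])
y = np.array([1, 2, 0])
expected = -(np.log(0.8) + np.log(0.5)) / 2.0
assert np.isclose(masked_nll(p, y), expected)
```
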
116 | """ 117 | y_hat = self.input_layer.linear_output 118 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) 119 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) 120 | obs_idx = T.arange(y.shape[0]) 121 | loss_pos = T.sum(margin_pos[obs_idx,y]**2.0) 122 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[obs_idx,y]**2.0) 123 | loss = (loss_pos + loss_neg) / y.shape[0] 124 | return loss 125 | 126 | def errors(self, y): 127 | """Compute the number of wrong predictions by self.input_layer. 128 | 129 | Predicted class labels are computed as the indices of the columns of 130 | self.input_layer.linear_output which are maximal. Wrong predictions are 131 | those for which max indices do not match their corresponding y values. 132 | """ 133 | # Compute class memberships predicted by self.input_layer 134 | y_pred = T.argmax(self.input_layer.linear_output, axis=1) 135 | errs = 0 136 | # check if y has same dimension of y_pred 137 | if y.ndim != y_pred.ndim: 138 | raise TypeError('y should have the same shape as self.y_pred', 139 | ('y', y.type, 'y_pred', y_pred.type)) 140 | # check if y is of the correct datatype 141 | if y.dtype.startswith('int'): 142 | # the T.neq operator returns a vector of 0s and 1s, where 1 143 | # represents a mistake in prediction 144 | errs = T.sum(T.neq(y_pred, y)) 145 | else: 146 | raise NotImplementedError() 147 | return errs 148 | 149 | class MCL2HingeSS(object): 150 | """Multi-class one-vs-all L2 hinge loss dangler. 151 | 152 | For this loss, class index 0 is never penalized, and errors for inputs 153 | with class index 0 are similarly ignored. This is for semi-supervised 154 | training, constrained by Theano's programming model.""" 155 | 156 | def __init__(self, linear_layer): 157 | """Dangle a squred hinge loss from the given linear layer. 158 | 159 | The given linear layer should be a HiddenLayer (or subclass) object, 160 | for HiddenLayer as defined in LayerNet.py.""" 161 | self.input_layer = linear_layer 162 | 163 | def loss_func(self, y): 164 | """Return the multiclass squared hinge loss for y. 165 | 166 | The class labels in y are assumed to be in correspondence with the 167 | set of column indices for self.input_layer.linear_output. 168 | """ 169 | y_hat = self.input_layer.linear_output 170 | row_idx = T.arange(y.shape[0]) 171 | row_mask = T.neq(y, 0).reshape((y_hat.shape[0], 1)) 172 | margin_pos = T.maximum(0.0, (1.0 - y_hat)) * row_mask 173 | margin_neg = T.maximum(0.0, (1.0 + y_hat)) * row_mask 174 | loss_pos = T.sum(margin_pos[row_idx,y]**2.0) 175 | loss_neg = T.sum(margin_neg**2.0) - T.sum(margin_neg[row_idx,y]**2.0) 176 | loss = (loss_pos + loss_neg) / T.sum(row_mask) 177 | return loss 178 | 179 | def errors(self, y): 180 | """Compute the number of wrong predictions by self.input_layer. 181 | 182 | Predicted class labels are computed as the indices of the columns of 183 | self.input_layer.linear_output which are maximal. Wrong predictions are 184 | those for which max indices do not match their corresponding y values. 
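        Class 0 marks unlabeled examples: predictions are taken as the argmax over
        columns 1 and up, and rows with y equal to 0 are excluded from the error count.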
185 | """ 186 | # Compute class memberships predicted by self.input_layer 187 | y_pred = T.argmax(self.input_layer.linear_output[:,1:], axis=1) 188 | y_pred = y_pred + 1 189 | errs = 0 190 | # check if y has same dimension of y_pred 191 | if y.ndim != y_pred.ndim: 192 | raise TypeError('y should have the same shape as self.y_pred', 193 | ('y', y.type, 'y_pred', y_pred.type)) 194 | # check if y is of the correct datatype 195 | if y.dtype.startswith('int'): 196 | # the T.neq operator returns a vector of 0s and 1s, where 1 197 | # represents a mistake in prediction 198 | errs = T.sum(T.neq(y_pred, y) * T.neq(y, 0)) 199 | else: 200 | raise NotImplementedError() 201 | return errs 202 | -------------------------------------------------------------------------------- /generative_models/result_parsing_script.py: -------------------------------------------------------------------------------- 1 | import os as os 2 | import sys as sys 3 | import numpy as np 4 | import numpy.random as npr 5 | 6 | def print_res(res): 7 | print("err: {0:.4f}".format(res['err'])) 8 | for h_param in ['learn_rate', 'lam_cat', 'lam_pea', 'lam_ent', 'lam_l2w']: 9 | print(" {0:s}: {1:.4f}".format(h_param, res[h_param])) 10 | return 1 11 | 12 | def parse_file(f_name): 13 | f_lines = [l for l in open(f_name).readlines()] 14 | f_dict = {} 15 | for i in [1, 2, 3, 4, 5]: 16 | f_dict[f_lines[i].split()[0].strip(':')] = float(f_lines[i].split()[1]) 17 | e_lines = [l for l in f_lines if ('va_err:' in l)] 18 | e_vals = [float(l.split()[-1]) for l in e_lines] 19 | mean_err = sum(e_vals[-10:]) / len(e_vals[-10:]) 20 | f_dict['err'] = mean_err 21 | return f_dict 22 | 23 | comp_func = lambda x, y: 1 if (x['err'] > y['err']) else -1 24 | 25 | if __name__=="__main__": 26 | if (len(sys.argv) < 2): 27 | print("FILE TAG REQUIRED!") 28 | assert(False) 29 | res_dicts = [parse_file(f) for f in os.listdir(os.getcwd()) if (sys.argv[1] in f)] 30 | res_dicts.sort(cmp=comp_func) 31 | print("**RESULTS**") 32 | for rd in res_dicts: 33 | print("========================================") 34 | print_res(rd) 35 | -------------------------------------------------------------------------------- /nlp/CythonFuncs.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | # try to compile and use the faster cython version 4 | import os 5 | from numpy import get_include 6 | import pyximport 7 | models_dir = os.path.dirname(__file__) or os.getcwd() 8 | pyximport.install(setup_args={"include_dirs": [models_dir, get_include()]}) 9 | from CythonFuncsPyx import w2v_ff_bp_pyx, ag_update_2d_pyx, ag_update_1d_pyx, \ 10 | lut_bp_pyx, nsl_ff_bp_pyx, acl_ff_bp_pyx, DO_INIT 11 | 12 | import numpy as np 13 | import numpy.random as npr 14 | import threading 15 | from ctypes import pythonapi, c_void_p 16 | 17 | ######################################## 18 | # MULTITHREADING HELPER-FUNC AND DEFNS # 19 | ######################################## 20 | 21 | THREAD_NUM = 4 22 | 23 | def make_multithread(inner_func, numthreads): 24 | def func_mt(*args): 25 | length = len(args[0]) 26 | sp_idx = np.arange(0,length).astype(np.uint32) 27 | chunklen = (length + (numthreads-1)) // numthreads 28 | chunkargs = [(sp_idx[i*chunklen:(i+1)*chunklen],)+args for i in range(numthreads)] 29 | # Start a thread for all but the last chunk of work 30 | threads = [threading.Thread(target=inner_func, args=cargs) 31 | for cargs in chunkargs[:-1]] 32 | for thread in threads: 33 | thread.start() 34 | # Give the last chunk of work to 
the main thread
35 |         inner_func(*chunkargs[-1])
36 |         for thread in threads:
37 |             thread.join()
38 |         return 1
39 |     def func_st(*args):
40 |         length = len(args[0])
41 |         sp_idx = np.arange(0,length).astype(np.uint32)
42 |         sp_args = (sp_idx,) + args
43 |         inner_func(*sp_args)
44 |     func = None
45 |     if numthreads == 1:
46 |         func = func_st
47 |     else:
48 |         func = func_mt
49 |     return func
50 | 
51 | ##############################
52 | # NUMBA FUNCTION DEFINITIONS #
53 | ##############################
54 | 
55 | w2v_ff_bp = make_multithread(w2v_ff_bp_pyx, THREAD_NUM)
56 | hsm_ff_bp = make_multithread(nsl_ff_bp_pyx, THREAD_NUM)
57 | nsl_ff_bp = make_multithread(nsl_ff_bp_pyx, THREAD_NUM)
58 | lut_bp = make_multithread(lut_bp_pyx, THREAD_NUM)
59 | 
60 | ag_update_2d = make_multithread(ag_update_2d_pyx, THREAD_NUM)
61 | ag_update_1d = make_multithread(ag_update_1d_pyx, 1)
62 | 
63 | 
64 | ##############
65 | # EYE BUFFER #
66 | ##############
67 | -------------------------------------------------------------------------------- /nlp/GPULayers.py: --------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | # Imports of public stuff
4 | import numpy as np
5 | import numpy.random as npr
6 | import gnumpy as gp
7 | import numexpr as ne
8 | 
9 | # Imports of my stuff
10 | from HelperFuncs import randn, ones, zeros
11 | 
12 | # UH OH, GLOBAL PARAMS (TODO: GET RID OF THESE!)
13 | ADA_EPS = 1e-3
14 | MAX_HSM_KEY = 12345678
15 | 
16 | #################################
17 | # FULLY-CONNECTED SOFTMAX LAYER #
18 | #################################
19 | 
20 | class FullLayer:
21 |     def __init__(self, in_dim=0, max_out_key=0):
22 |         # Set dimension of incoming vectors and the number of outcomes for
23 |         # which to perform prediction. Increment the requested prediction size
24 |         # by 1, to accommodate 0 indexing.
25 |         out_dim = max_out_key + 1
26 |         self.dim_input = in_dim
27 |         self.dim_output = out_dim
28 |         # Initialize parameters, gradients, and adagrad "momentums"
29 |         self.params = {}
30 |         self.params['W'] = 0.01 * gp.randn((in_dim, out_dim))
31 |         self.params['b'] = gp.zeros((1, out_dim))
32 |         self.grads = {}
33 |         self.grads['W'] = gp.zeros((in_dim, out_dim))
34 |         self.grads['b'] = gp.zeros((1, out_dim))
35 |         self.moms = {}
36 |         self.moms['W'] = gp.zeros((in_dim, out_dim))
37 |         self.moms['b'] = gp.zeros((1, out_dim))
38 |         # Initialize temp vars to use during feedforward/backpropagation
39 |         self.X = []
40 |         self.Y = []
41 |         self.Y_cat = []
42 |         return
43 | 
44 |     def init_params(self, w_scale=0.01, b_scale=0.0):
45 |         """Randomly initialize the weights in this layer."""
46 |         self.params['W'] = w_scale * gp.randn((self.dim_input, self.dim_output))
47 |         self.grads['W'] = gp.zeros((self.dim_input, self.dim_output))
48 |         self.params['b'] = gp.zeros((1, self.dim_output))
49 |         self.grads['b'] = gp.zeros((1, self.dim_output))
50 |         return
51 | 
52 |     def clip_params(self, max_norm=10.0):
53 |         """Bound L2 (row-wise) norm of W by max_norm."""
54 |         M = self.params['W']
55 |         m_scales = max_norm / gp.sqrt(gp.sum(M**2.0,axis=1) + 1e-5)
56 |         mask = (m_scales < 1.0) # with gnumpy, this already comes as float32
57 |         m_scales = (m_scales * mask) + (1.0 - mask)
58 |         self.params['W'] = M * m_scales[:,gp.newaxis]
59 |         return
60 | 
61 |     def feedforward(self, X):
62 |         """Run feedforward for this layer."""
63 |         # Cleanup debris from any previous feedforward
64 |         self._cleanup()
65 |         # Do new feedforward...
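        # (Note: X may arrive as a numpy ndarray; it is pushed to the GPU as a
        #  gnumpy garray, and both the input and the linear scores X.dot(W) + b
        #  are cached on self so that backprop() can reuse them.)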
66 | self.X = gp.garray(X) 67 | self.Y = gp.dot(self.X, self.params['W']) + self.params['b'] 68 | return self.Y 69 | 70 | def backprop(self, Y_cat, L_ary=None, return_on_gpu=False): 71 | """Backprop through softmax using the given target predictions.""" 72 | # Compute gradient of cross-entropy objective, based on the given 73 | # target predictions and the most recent feedforward information. 74 | L, dLdY = self.xent_loss_and_grad(self.Y, Y_cat.astype(np.uint32)) 75 | # Backprop cross-ent grads to get grads w.r.t. layer parameters 76 | dLdW = gp.dot(self.X.T, dLdY) 77 | dLdb = gp.sum(dLdY, axis=0) 78 | dLdb = dLdb[gp.newaxis,:] 79 | self.grads['W'] += dLdW 80 | self.grads['b'] += dLdb 81 | # Backprop cross-ent grads to get grads w.r.t. layer input 82 | dLdX = gp.dot(dLdY, self.params['W'].T) 83 | # Return gradients w.r.t. to input, either on or off the GPU 84 | if not return_on_gpu: 85 | dLdX = gp.as_numpy_array(dLdX).astype(np.float32) 86 | # Write loss into L_ary if it was given 87 | L_ary[0] = L 88 | return dLdX 89 | 90 | def safe_softmax(self, Y): 91 | """Compute a reasonably (numerically) safe softmax.""" 92 | Y_max = gp.max(Y, axis=1) 93 | Y_max = Y_max[:,gp.newaxis] 94 | Y_exp = gp.exp(Y - Y_max) 95 | Y_sum = gp.sum(Y_exp, axis=1) 96 | Y_sum = Y_sum[:,gp.newaxis] 97 | Y_sm = Y_exp / Y_sum 98 | return Y_sm 99 | 100 | def xent_loss_and_grad(self, Yh, Y_cat): 101 | """Cross-entropy loss for predictions Yh given targets Y_cat.""" 102 | # Convert from categorical classes to "one-hot" target vectors 103 | Y_ind = zeros(Yh.shape) 104 | Y_ind[np.arange(Y_ind.shape[0]), Y_cat] = 1.0 105 | # Push one-hot targets vectors to the GPU 106 | Y_ind = gp.garray(Y_ind) 107 | # Compute softmax and then cross-entropy loss 108 | Yh_sm = self.safe_softmax(Yh) 109 | L = -gp.sum((Y_ind * gp.log(Yh_sm))) 110 | dLdYh = Yh_sm - Y_ind 111 | return [L, dLdYh] 112 | 113 | def l2_regularize(self, lam_l2=1e-5): 114 | """Apply some amount of l2 "shrinkage" to weights and biases.""" 115 | self.params['W'] -= lam_l2 * self.params['W'] 116 | self.params['b'] -= lam_l2 * self.params['b'] 117 | return 118 | 119 | def apply_grad(self, learn_rate=1e-2,): 120 | """Apply the current accumulated gradients, with adagrad.""" 121 | # Update the adagrad "momentums" 122 | self.moms['W'] = (0.95 * self.moms['W']) + (0.05 * self.grads['W']**2.0) 123 | self.moms['b'] = (0.95 * self.moms['b']) + (0.05 * self.grads['b']**2.0) 124 | # Apply adagrad-style updates using current grads and moms 125 | self.params['W'] -= learn_rate * (self.grads['W'] / \ 126 | (gp.sqrt(self.moms['W']) + ADA_EPS)) 127 | self.params['b'] -= learn_rate * (self.grads['b'] / \ 128 | (gp.sqrt(self.moms['b']) + ADA_EPS)) 129 | # Reset gradient accumulators 130 | self.reset_grads() 131 | return 132 | 133 | def reset_grads(self): 134 | """Reset the gradient accumulators for this layer.""" 135 | self.grads['W'] = 0.0 * self.grads['W'] 136 | self.grads['b'] = 0.0 * self.grads['b'] 137 | return 138 | 139 | def reset_moms(self, ada_init=1e-3): 140 | """Reset the adagrad "momentums" for this layer.""" 141 | self.moms['W'] = (0.0 * self.moms['W']) + ada_init 142 | self.moms['b'] = (0.0 * self.moms['b']) + ada_init 143 | return 144 | 145 | def _cleanup(self): 146 | """Cleanup temp vars used during feedforward/backprop.""" 147 | self.X = [] 148 | self.Y = [] 149 | self.Y_cat = [] 150 | return 151 | 152 | ########################## 153 | # NOISE INJECTION LAYERS # 154 | ########################## 155 | 156 | class NoiseLayer: 157 | def __init__(self, drop_rate=0.0, 
fuzz_scale=0.0): 158 | # Set stuff required for managing this type of layer 159 | self.dYdX = [] 160 | self.drop_rate = drop_rate 161 | self.drop_scale = 1.0 / (1.0 - drop_rate) 162 | self.fuzz_scale = fuzz_scale 163 | # Set stuff common to all layer types 164 | self.X = [] 165 | self.Y = [] 166 | self.dLdY = [] 167 | return 168 | 169 | def set_noise_params(self, drop_rate=0.0, fuzz_scale=0.0): 170 | """Set the drop rate for this drop layer.""" 171 | self.drop_rate = drop_rate 172 | self.drop_scale = 1.0 / (1.0 - drop_rate) 173 | self.fuzz_scale = fuzz_scale 174 | return 175 | 176 | def feedforward(self, X, return_on_gpu=False): 177 | """Perform feedforward through this layer. 178 | """ 179 | # Cleanup debris from any previous feedforward 180 | self._cleanup() 181 | # Record (a pointer to) the passed input 182 | self.X = gp.garray(X) 183 | # Generate and apply a dropout mask to the input 184 | if (self.drop_rate > 1e-4): 185 | drop_mask = self.drop_scale * \ 186 | (gp.rand((self.X.shape[0], self.X.shape[1])) > self.drop_rate) 187 | else: 188 | drop_mask = gp.ones((self.X.shape[0], self.X.shape[1])) 189 | self.dYdX = drop_mask 190 | if (self.fuzz_scale > 1e-4): 191 | fuzz_bump = (self.fuzz_scale / self.drop_scale) * \ 192 | gp.randn((self.X.shape[0], self.X.shape[1])) 193 | self.Y = drop_mask * (self.X + fuzz_bump) 194 | else: 195 | self.Y = drop_mask * self.X 196 | if not return_on_gpu: 197 | self.Y = gp.as_numpy_array(self.Y) 198 | return self.Y 199 | 200 | def backprop(self, dLdY, return_on_gpu=False): 201 | """Perform backprop through this layer. 202 | """ 203 | # Backprop is just multiplication by the mask from feedforward 204 | dLdX = gp.garray(dLdY) * self.dYdX 205 | if not return_on_gpu: 206 | dLdX = gp.as_numpy_array(dLdX).astype(np.float32) 207 | return dLdX 208 | 209 | def _cleanup(self): 210 | """Clear all temp variables for this layer.""" 211 | self.X = [] 212 | self.Y = [] 213 | self.dYdX = [] 214 | return 215 | 216 | ################################### 217 | # TEST BASIC MODULE FUNCTIONALITY # 218 | ################################### 219 | 220 | def run_test(): 221 | ##################### 222 | # TODO: write tests # 223 | ##################### 224 | print("TODO: WRITE TEST FOR GPULayers.py") 225 | 226 | 227 | if __name__ == '__main__': 228 | run_test() 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | ############## 240 | # EYE BUFFER # 241 | ############## 242 | -------------------------------------------------------------------------------- /nlp/NumbaFuncs.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import numpy as np 4 | import numpy.random as npr 5 | import threading 6 | import numba 7 | from math import exp, log, sqrt 8 | from numba import jit, void, i4, f4, u4 9 | from ctypes import pythonapi, c_void_p 10 | 11 | ADA_EPS = 0.001 12 | 13 | ######################################## 14 | # MULTITHREADING HELPER-FUNC AND DEFNS # 15 | ######################################## 16 | 17 | THREAD_NUM = 4 18 | 19 | savethread = pythonapi.PyEval_SaveThread 20 | savethread.argtypes = [] 21 | savethread.restype = c_void_p 22 | 23 | restorethread = pythonapi.PyEval_RestoreThread 24 | restorethread.argtypes = [c_void_p] 25 | restorethread.restype = None 26 | 27 | def make_multithread(inner_func, numthreads): 28 | def func_mt(*args): 29 | length = len(args[0]) 30 | sp_idx = np.arange(0,length).astype(np.int32) 31 | chunklen = (length + (numthreads-1)) // numthreads 32 | chunkargs = 
[(sp_idx[i*chunklen:(i+1)*chunklen],)+args for i in range(numthreads)] 33 | # Start a thread for all but the last chunk of work 34 | threads = [threading.Thread(target=inner_func, args=cargs) 35 | for cargs in chunkargs[:-1]] 36 | for thread in threads: 37 | thread.start() 38 | # Give the last chunk of work to the main thread 39 | inner_func(*chunkargs[-1]) 40 | for thread in threads: 41 | thread.join() 42 | return 1 43 | def func_st(*args): 44 | length = len(args[0]) 45 | sp_idx = np.arange(0,length).astype(np.int32) 46 | sp_args = (sp_idx,) + args 47 | inner_fun(*sp_args) 48 | func = None 49 | if numthreads == 1: 50 | func = func_st 51 | else: 52 | func = func_mt 53 | return func_mt 54 | 55 | ############################## 56 | # NUMBA FUNCTION DEFINITIONS # 57 | ############################## 58 | 59 | def w2v_ff_bp_sp(sp_idx, anc_idx, pn_idx, pn_sign, Wa, Wc, b, dWa, dWc, db, L, do_grad): 60 | """Feedforward and backprop for unified (neg-sample) word-2-vec layer.""" 61 | threadstate = savethread() 62 | sp_size = sp_idx.shape[0] 63 | cols = pn_idx.shape[1] 64 | vec_dim = Wa.shape[1] 65 | for sp_i in range(sp_size): 66 | i = sp_idx[sp_i] 67 | ai = anc_idx[i] 68 | for j in range(cols): 69 | ci = pn_idx[i,j] 70 | y = b[ci] 71 | for k in range(vec_dim): 72 | y += (Wa[ai,k] * Wc[ci,k]) 73 | exp_pns_y = exp(pn_sign[i,j] * y) 74 | L[0] += log(1.0 + exp_pns_y) 75 | if (do_grad == 1): 76 | dLdy = pn_sign[i,j] * (exp_pns_y / (1.0 + exp_pns_y)) 77 | db[ci] = db[ci] + dLdy 78 | for k in range(vec_dim): 79 | dWa[ai,k] += (dLdy * Wc[ci,k]) 80 | dWc[ci,k] += (dLdy * Wa[ai,k]) 81 | restorethread(threadstate) 82 | return 83 | fn_sig_1 = void(i4[:], i4[:], i4[:,:], f4[:,:], f4[:,:], f4[:,:], f4[:], f4[:,:], f4[:,:], f4[:], f4[:], i4) 84 | w2v_ff_bp_st = jit(fn_sig_1, nopython=True)(w2v_ff_bp_sp) 85 | w2v_ff_bp = make_multithread(w2v_ff_bp_st, THREAD_NUM) 86 | 87 | def nsl_bp_sp(sp_idx, table_idx, X, W, dLdY, dLdX, dW, db): 88 | """Backprop for NSLayer: main loop in Numba-friendly form.""" 89 | threadstate = savethread() 90 | rows = sp_idx.shape[0] 91 | cols = dLdY.shape[1] 92 | vec_dim = X.shape[1] 93 | for spi in range(rows): 94 | i = sp_idx[spi] 95 | for j in range(cols): 96 | dldy = dLdY[i,j] 97 | idx = table_idx[i,j] 98 | db[idx] += dldy 99 | for k in range(vec_dim): 100 | dW[idx,k] += dldy * X[i,k] 101 | dLdX[i,k] += dldy * W[idx,k] 102 | restorethread(threadstate) 103 | return 104 | fn_sig_2 = void(i4[:], i4[:,:], f4[:,:], f4[:,:], f4[:,:], f4[:,:], f4[:,:], f4[:]) 105 | nsl_bp_st = jit(fn_sig_2, nopython=True)(nsl_bp_sp) 106 | nsl_bp = make_multithread(nsl_bp_st, THREAD_NUM) 107 | 108 | def nsl_ff_sp(sp_idx, table_idx, X, W, b, Y): 109 | """Feedforward for NSLayer: main loop in Numba-friendly form.""" 110 | threadstate = savethread() 111 | rows = sp_idx.shape[0] 112 | cols = table_idx.shape[1] 113 | vec_dim = X.shape[1] 114 | for spi in range(rows): 115 | i = sp_idx[spi] 116 | for j in range(cols): 117 | idx = table_idx[i,j] 118 | Y[i,j] = b[idx] 119 | for k in range(vec_dim): 120 | Y[i,j] += X[i,k] * W[idx,k] 121 | restorethread(threadstate) 122 | return 123 | fn_sig_3 = void(i4[:], i4[:,:], f4[:,:], f4[:,:], f4[:], f4[:,:]) 124 | nsl_ff_st = jit(fn_sig_3, nopython=True)(nsl_ff_sp) 125 | nsl_ff = make_multithread(nsl_ff_st, THREAD_NUM) 126 | 127 | def ag_update_2d_sp(sp_idx, row_idx, W, dW, mW, learn_rate): 128 | """Element-wise partial update ala adagrad. 
129 | 130 | For the entries indicated by row_idx, this first updates the adagrad sums 131 | of squares in mW, then updates the params in W, and finally sets the 132 | grads in dW back to 0. 133 | """ 134 | threadstate = savethread() 135 | row_count = sp_idx.shape[0] 136 | vec_dim = W.shape[1] 137 | for spi in range(row_count): 138 | idx = row_idx[sp_idx[spi]] 139 | for j in range(vec_dim): 140 | mW[idx,j] = (0.95 * mW[idx,j]) + (0.05 * dW[idx,j] * dW[idx,j]) 141 | W[idx,j] -= (learn_rate * (dW[idx,j] / (sqrt(mW[idx,j]) + ADA_EPS))) 142 | dW[idx,j] = 0.0 143 | restorethread(threadstate) 144 | return 145 | fn_sig_4 = void(i4[:], i4[:], f4[:,:], f4[:,:], f4[:,:], f4) 146 | ag_update_2d_st = jit(fn_sig_4, nopython=True)(ag_update_2d_sp) 147 | ag_update_2d = make_multithread(ag_update_2d_st, THREAD_NUM) 148 | 149 | @numba.jit("void(i4[:], f4[:], f4[:], f4[:], f4)") 150 | def ag_update_1d(row_idx, W, dW, mW, learn_rate): 151 | """Element-wise partial update ala adagrad. 152 | 153 | For the entries indicated by row_idx, this first updates the adagrad sums 154 | of squares in mW, then updates the params in W, and finally sets the 155 | grads in dW back to 0. 156 | """ 157 | row_count = row_idx.shape[0] 158 | for i in range(row_count): 159 | idx = row_idx[i] 160 | mW[idx] = (0.95 * mW[idx]) + (0.05 * dW[idx] * dW[idx]) 161 | W[idx] -= learn_rate * (dW[idx] / (sqrt(mW[idx]) + ADA_EPS)) 162 | dW[idx] = 0.0 163 | return 164 | 165 | def lut_sp(sp_idx, row_idx, dLdY, dW): 166 | """Simple row-wise updates for adjusting dW with dLdY. 167 | 168 | This adds each row of dLdY to some row of dW. The row of dW to adjust 169 | is given by the corresponding item in row_idx.""" 170 | threadstate = savethread() 171 | row_count = sp_idx.shape[0] 172 | vec_dim = dW.shape[1] 173 | for i in range(row_count): 174 | idx = row_idx[sp_idx[i]] 175 | for j in range(vec_dim): 176 | dW[idx,j] += dLdY[i,j] 177 | restorethread(threadstate) 178 | return 179 | fn_sig_5 = void(i4[:], i4[:], f4[:,:], f4[:,:]) 180 | lut_st = jit(fn_sig_5, nopython=True)(lut_sp) 181 | lut_bp = make_multithread(lut_st, THREAD_NUM) 182 | 183 | 184 | def hsm_ff_bp_sp(sp_idx, X, code_keys, code_signs, W, b, dLdX, dLdW, dLdb, L): 185 | threadstate = savethread() 186 | obs_count = sp_idx.shape[0] 187 | code_len = code_keys.shape[1] 188 | vec_dim = X.shape[1] 189 | for spi in range(obs_count): 190 | i = sp_idx[spi] 191 | for j in range(code_len): 192 | code_key = code_keys[i,j] 193 | if code_key < 1234567: 194 | y = b[code_key] 195 | # for speed, this needs to change to sdot via BLAS 196 | for k in range(vec_dim): 197 | y += X[i,k] * W[code_key,k] 198 | neg_label = -1.0 * code_signs[i,j] 199 | exp_y = exp(neg_label * y) 200 | L[i,j] = log(1.0 + exp_y) 201 | g = neg_label * (exp_y / (1.0 + exp_y)) 202 | dLdb[code_key] += g 203 | # for speed, this needs to change to saxpy via BLAS 204 | for k in range(vec_dim): 205 | dLdX[i,k] += g * W[code_key,k] 206 | dLdW[code_key,k] += g * X[i,k] 207 | restorethread(threadstate) 208 | return 209 | fn_sig_6 = void(i4[:], f4[:,:], u4[:,:], f4[:,:], f4[:,:], f4[:], f4[:,:], f4[:,:], f4[:], f4[:,:]) 210 | hsm_ff_bp_st = jit(fn_sig_6, nopython=True)(hsm_ff_bp_sp) 211 | hsm_ff_bp = make_multithread(hsm_ff_bp_st, THREAD_NUM) 212 | 213 | ############## 214 | # EYE BUFFER # 215 | ############## 216 | -------------------------------------------------------------------------------- /nlp/TestCuBlas.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This example uses cuBLAS gemm routine to 
perform matrix-matrix multiplication. 3 | Please refer to the documentation for details of how to use the gemm routine 4 | http://docs.continuum.io/numbapro/cudalib.html#blas-level-2 5 | 6 | Note: cuBLAS uses Fortran layout 7 | ''' 8 | 9 | import numbapro.cudalib.cublas as cublas 10 | from numbapro import cuda 11 | import numpy as np 12 | import numpy.random as npr 13 | from timeit import default_timer as timer 14 | import gnumpy as gp 15 | 16 | N = 5000 # no. of rows/cols 17 | 18 | def gemm_v1(): 19 | ''' 20 | Note that all arrays are in Fortran order. 21 | ''' 22 | print("Version 1".center(80, '=')) 23 | # Prepare arrays for input 24 | A = np.array(np.arange(N ** 2, dtype=np.float32).reshape(N, N), order='F') 25 | B = np.array(np.arange(N) + 10, dtype=A.dtype, order='F') 26 | D = np.zeros_like(A, order='F') 27 | 28 | # NumPy 29 | start = timer() 30 | E = np.dot(A, np.diag(B)) 31 | numpy_time = timer() - start 32 | print("Numpy took %f seconds" % numpy_time) 33 | 34 | # cuBLAS 35 | blas = cublas.Blas() 36 | 37 | stream = cuda.stream() 38 | cuda.to_device(A, stream=stream) 39 | stream.synchronize() 40 | 41 | start = timer() 42 | blas.gemm('N', 'N', N, N, N, 1.0, A, np.diag(B), 1.0, D) 43 | cuda_time = timer() - start 44 | 45 | print("CUBLAS took %f seconds" % cuda_time) 46 | diff = np.abs(D - E) 47 | print("Maximum error %f" % np.max(diff)) 48 | 49 | 50 | def gemm_v2(): 51 | """ 52 | Let GEMM transpose the input matrices so that they can be in C order, 53 | originally. Note that the output matrix is still in Fortran array. 54 | The string arguments in gemm tells it to apply transformation on the input 55 | matrices. 56 | 57 | See argument description in: 58 | http://docs.continuum.io/numbapro/cudalib.html#blas-level-2 59 | """ 60 | print("Version 2".center(80, '=')) 61 | # Prepare arrays for input 62 | A = np.array(np.arange(N ** 2, dtype=np.float32).reshape(N, N)) 63 | B = np.array(np.arange(N) + 10, dtype=A.dtype) 64 | D = np.zeros_like(A, order='F') 65 | 66 | # NumPy 67 | start = timer() 68 | E = np.dot(A, np.diag(B)) 69 | numpy_time = timer() - start 70 | print("Numpy took %f seconds" % numpy_time) 71 | 72 | # cuBLAS 73 | blas = cublas.Blas() 74 | 75 | stream = cuda.stream() 76 | cuda.to_device(A, stream=stream) 77 | stream.synchronize() 78 | 79 | start = timer() 80 | blas.gemm('T', 'T', N, N, N, 1.0, A, np.diag(B), 1.0, D) 81 | cuda_time = timer() - start 82 | 83 | print("CUBLAS took %f seconds" % cuda_time) 84 | diff = np.abs(D - E) 85 | print("Maximum error %f" % np.max(diff)) 86 | 87 | 88 | def main(): 89 | gemm_v1() 90 | gemm_v2() 91 | 92 | if __name__ == '__main__': 93 | main() 94 | start = timer() 95 | A = npr.randn(256, 1500) 96 | for i in range(1000): 97 | B = gp.garray(A) 98 | B = B + B 99 | A = gp.randn((256, 1500)).as_numpy_array() 100 | berk_time = timer() - start 101 | print("Berk time: {0:.4f}".format(berk_time)) 102 | print(" @ {0:.4f} transfers/second".format(1000.0 / berk_time)) -------------------------------------------------------------------------------- /nlp/gensim_code/GensimUtils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2010 Radim Rehurek 5 | # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html 6 | 7 | """ 8 | This module contains various general utility functions. 
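They cover simple tokenization and preprocessing, transparent opening of .bz2/.gz/plain
files (smart_open), pickling helpers, and dictionary reversal (revdict).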
9 | """ 10 | 11 | try: 12 | import cPickle as _pickle 13 | except ImportError: 14 | import pickle as _pickle 15 | 16 | import re 17 | import os 18 | import sys 19 | import itertools 20 | import traceback 21 | import unicodedata 22 | 23 | if sys.version_info[0] >= 3: 24 | unicode = str 25 | 26 | from six import iteritems, u 27 | 28 | 29 | PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE) 30 | RE_HTML_ENTITY = re.compile(r'&(#?)(x?)(\w+);', re.UNICODE) 31 | 32 | 33 | def tokenize(text, errors="strict", to_lower=False): 34 | """ 35 | Iteratively yield tokens as unicode strings, optionally also lowercasing them. 36 | 37 | Input text may be either unicode or utf8-encoded byte string. 38 | 39 | The tokens on output are maximal contiguous sequences of alphabetic 40 | characters (no digits!). 41 | 42 | """ 43 | text = to_unicode(text, errors=errors) 44 | if to_lower: 45 | text = text.lower() 46 | for match in PAT_ALPHABETIC.finditer(text): 47 | yield match.group() 48 | 49 | 50 | def simple_preprocess(doc, min_len=2, max_len=15): 51 | """ 52 | Convert a document into a list of tokens. 53 | 54 | This lowercases, tokenizes, stems, normalizes etc. -- the output are final 55 | tokens = unicode strings, that won't be processed any further. 56 | 57 | """ 58 | tokens = [token for token in tokenize(doc, to_lower=True, errors='ignore') 59 | if min_len <= len(token) <= max_len and not token.startswith('_')] 60 | return tokens 61 | 62 | 63 | def to_unicode(text, encoding='utf8', errors='strict'): 64 | """Convert a string (bytestring in `encoding` or unicode), to unicode.""" 65 | if isinstance(text, unicode): 66 | return text 67 | return unicode(text, encoding, errors=errors) 68 | 69 | 70 | def make_closing(base, **attrs): 71 | """ 72 | Add support for `with Base(attrs) as fout:` to the base class if it's missing. 73 | The base class' `close()` method will be called on context exit, to always close the file properly. 74 | 75 | This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise 76 | raise "AttributeError: GzipFile instance has no attribute '__exit__'". 77 | 78 | """ 79 | if not hasattr(base, '__enter__'): 80 | attrs['__enter__'] = lambda self: self 81 | if not hasattr(base, '__exit__'): 82 | attrs['__exit__'] = lambda self, type, value, traceback: self.close() 83 | return type('Closing' + base.__name__, (base, object), attrs) 84 | 85 | 86 | def smart_open(fname, mode='rb'): 87 | _, ext = os.path.splitext(fname) 88 | if ext == '.bz2': 89 | from bz2 import BZ2File 90 | return make_closing(BZ2File)(fname, mode) 91 | if ext == '.gz': 92 | from gzip import GzipFile 93 | return make_closing(GzipFile)(fname, mode) 94 | return open(fname, mode) 95 | 96 | 97 | def pickle(obj, fname, protocol=-1): 98 | """Pickle object `obj` to file `fname`.""" 99 | with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows 100 | _pickle.dump(obj, fout, protocol=protocol) 101 | 102 | 103 | def unpickle(fname): 104 | """Load pickled object from `fname`""" 105 | with smart_open(fname) as f: 106 | return _pickle.load(f) 107 | 108 | 109 | def revdict(d): 110 | """ 111 | Reverse a dictionary mapping. 112 | 113 | When two keys map to the same value, only one of them will be kept in the 114 | result (which one is kept is arbitrary). 
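    For example, revdict({'a': 1, 'b': 2}) returns {1: 'a', 2: 'b'}.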
115 | 116 | """ 117 | return dict((v, k) for (k, v) in iteritems(d)) 118 | 119 | 120 | -------------------------------------------------------------------------------- /nlp/gensim_code/TestGensim.py: -------------------------------------------------------------------------------- 1 | import logging, os 2 | import numpy as np 3 | import numpy.random as npr 4 | import W2VSimple as w2vs 5 | 6 | logging.basicConfig(format='%(message)s', level=logging.INFO) 7 | 8 | class MySentences(object): 9 | def __init__(self, dirname): 10 | self.dirname = dirname 11 | def __iter__(self): 12 | for fname in os.listdir(self.dirname): 13 | for line in open(os.path.join(self.dirname, fname)): 14 | yield line.split() 15 | 16 | def some_nearest_words(keys_to_words, sample_count, W): 17 | norms = np.sqrt(np.sum(W**2.0,axis=1,keepdims=1)) 18 | W = W / (norms + 1e-5) 19 | source_keys = np.zeros((sample_count,)).astype(np.int32) 20 | neighbor_keys = np.zeros((sample_count, 10)).astype(np.int32) 21 | all_keys = np.asarray(keys_to_words.keys()).astype(np.int32) 22 | for s in range(sample_count): 23 | i = npr.randint(0,all_keys.size) 24 | source_k = all_keys[i] 25 | neg_cos_sims = -1.0 * np.sum(W * W[source_k], axis=1) 26 | sorted_k = np.argsort(neg_cos_sims) 27 | source_keys[s] = source_k 28 | neighbor_keys[s,:] = sorted_k[1:11] 29 | source_words = [] 30 | neighbor_words = [] 31 | for s in range(sample_count): 32 | source_words.append(keys_to_words[source_keys[s]]) 33 | neighbor_words.append([keys_to_words[k] for k in neighbor_keys[s]]) 34 | return [source_keys, neighbor_keys, source_words, neighbor_words] 35 | 36 | sentences = MySentences('./training_text') 37 | 38 | model = w2vs.W2VSimple(sentences, alpha=0.002, size=152, window=6, \ 39 | min_count=1, workers=4, hs=1) 40 | k2w = {} 41 | w2k = {} 42 | for w in model.vocab: 43 | k = model.vocab[w].index 44 | k2w[k] = w 45 | w2k[w] = k 46 | 47 | for i in range(1001): 48 | print("ROUND {0:d}".format(i)) 49 | sentences = MySentences('./training_text') 50 | model.train(sentences, chunksize=200) 51 | if ((i > 1) and ((i % 50) == 0)): 52 | print("============================================================") 53 | [s_keys, n_keys, s_words, n_words] = some_nearest_words(k2w, 10, model.syn0) 54 | for w in range(10): 55 | print("{0:s}: {1:s}".format(s_words[w],", ".join(n_words[w]))) 56 | 57 | -------------------------------------------------------------------------------- /nlp/nlp_convnet/STBTests.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.random as npr 3 | import StanfordTrees as st 4 | import LNLayers as lnl 5 | import LayerNets as ln 6 | import random as random 7 | from time import clock 8 | from sys import stdout as stdout 9 | 10 | def simple_stb_test(tree_dir='./trees'): 11 | stb_data = st.SimpleLoad(tree_dir) 12 | return 13 | 14 | if __name__ == '__main__': 15 | tree_dir = './trees' 16 | stb_data = st.SimpleLoad(tree_dir) 17 | max_lut_idx = max(stb_data['lut_keys'].values()) 18 | basic_opts = {} 19 | basic_opts['class_count'] = 5 20 | lut_opts = {} 21 | lut_opts['max_key'] = max_lut_idx 22 | lut_opts['embed_dim'] = 30 23 | lut_opts['max_norm'] = 2.0 24 | basic_opts['lut_layer'] = lut_opts 25 | 26 | # Initialize a network 27 | KMN = ln.KMaxNet(basic_opts) 28 | KMN.init_weights(w_scale=0.05, b_shift=0.1) 29 | 30 | # Get a "flattened" list of training phrases and classes 31 | train_phrases = [] 32 | train_labels = [] 33 | for (phrases, labels) in zip(stb_data['train_phrases'], 
stb_data['train_labels']): 34 | train_phrases.extend(phrases) 35 | train_labels.extend(labels) 36 | 37 | batch_size = 50 38 | epoch_batches = 2500 39 | learn_rate = 0.01 40 | train_pairs = [(phrase, label) for (phrase, label) in zip(train_phrases, train_labels)] 41 | train_phrases = [] 42 | train_labels = [] 43 | for e in range(500): 44 | print("Starting epoch {0:d}, {1:d} batches".format(e, len(train_pairs)/batch_size)) 45 | stdout.flush() 46 | # Reset batch extraction indices and completed batch counter 47 | batch_start = 0 48 | batch_end = batch_start + batch_size 49 | completed_batches = 0 50 | # Perform batch updates for the current epoch 51 | L = 0.0 52 | acc = 0.0 53 | t1 = clock() 54 | random.shuffle(train_pairs) 55 | if ((e % 5) == 0): 56 | KMN.reset_moms(ada_init=0.0, clear_moms=False) 57 | while ((batch_end < len(train_pairs)) and (completed_batches < epoch_batches)): 58 | # Extract the current training phrase/label batch 59 | batch_pairs = train_pairs[batch_start:batch_end] 60 | # Train on this batch, and count its completion 61 | Xb = [pair[0] for pair in batch_pairs] 62 | Yb = [pair[1] for pair in batch_pairs] 63 | res = KMN.process_training_batch(Xb, Yb, learn_rate, use_dropout=True) 64 | L += res[0] 65 | acc += res[1] 66 | completed_batches += 1 67 | # Advance batch extraction indices 68 | batch_start = batch_start + batch_size 69 | batch_end = batch_start + batch_size 70 | # Print diagnostic info from time-to-time 71 | if ((completed_batches % 50) == 0): 72 | print("completed {0:d} updates, with loss {1:.4f} and acc {2:.4f}".format( \ 73 | completed_batches, (L / 50.0), (acc / 50.0))) 74 | L = 0.0 75 | acc = 0.0 76 | t2 = clock() 77 | print("-- time: {0:.2f}".format(t2-t1)) 78 | t1 = clock() 79 | stdout.flush() 80 | 81 | 82 | 83 | ############## 84 | # EYE BUFFER # 85 | ############## 86 | -------------------------------------------------------------------------------- /nlp/voidptr.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #if PY_VERSION_HEX >= 0x03020000 4 | 5 | /* 6 | ** compatibility with python >= 3.2, which doesn't have CObject anymore 7 | */ 8 | static void * PyCObject_AsVoidPtr(PyObject *obj) 9 | { 10 | void *ret = PyCapsule_GetPointer(obj, NULL); 11 | if (ret == NULL) { 12 | PyErr_Clear(); 13 | } 14 | return ret; 15 | } 16 | 17 | #endif --------------------------------------------------------------------------------
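The sketch below (not part of the repository) illustrates the chunked-threading pattern used by
make_multithread() in nlp/CythonFuncs.py and nlp/NumbaFuncs.py: the index range of the first
argument is split into per-thread chunks, worker threads take all but the last chunk, and the
calling thread runs the final chunk before joining the workers. The names toy_inner and
run_multithread are hypothetical stand-ins, and run_multithread applies the splitting directly
rather than returning a wrapped function as make_multithread does; in the repository the inner
functions are Cython/Numba kernels that release the GIL, which is what lets the threads run in
parallel.

import threading
import numpy as np

def toy_inner(sp_idx, x, out):
    # square the entries of x assigned to this thread
    for i in sp_idx:
        out[i] = x[i] * x[i]

def run_multithread(inner_func, numthreads, *args):
    # split the index range of the first argument into per-thread chunks
    length = len(args[0])
    sp_idx = np.arange(length).astype(np.int32)
    chunklen = (length + (numthreads - 1)) // numthreads
    chunkargs = [(sp_idx[i*chunklen:(i+1)*chunklen],) + args
                 for i in range(numthreads)]
    # worker threads handle all but the last chunk...
    threads = [threading.Thread(target=inner_func, args=cargs)
               for cargs in chunkargs[:-1]]
    for t in threads:
        t.start()
    # ...and the calling thread handles the final chunk
    inner_func(*chunkargs[-1])
    for t in threads:
        t.join()

if __name__ == '__main__':
    x = np.arange(8, dtype=np.float32)
    out = np.zeros_like(x)
    run_multithread(toy_inner, 4, x, out)
    print(out)  # squares of 0..7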