├── README.md ├── __init__.py ├── ae.py ├── basic_layer.py ├── config_yaml ├── train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml └── trainLenet_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml ├── data ├── __init__.py ├── cifar.py ├── mnist.pkl.zip ├── mnist_bin17.pkl.zip └── out.pkl ├── dataset.py ├── exps └── .readme.md ├── filterit.py ├── generate_exps.py ├── generate_exps_lenet.py ├── generate_exps_search.py ├── init_params └── .readme.md ├── job0.sh ├── joblenet.sh ├── jobs ├── .readme.md ├── 110_1000_4_trainLenet_0_1000_3_0_0_0_0_4_False_False_False_False_False_110.sl └── 20_1000_0_train3_new_dup_0_1000_3_0_0_1_0_0_True_False_False_False_False_20.sl ├── k80.sl ├── layer.py ├── layers.py ├── learning_rate.py ├── learning_rule.py ├── mnist_manip.py ├── non_linearities.py ├── normalization.py ├── outputjobs └── .readme.md ├── p100.sl ├── plot_paper.py ├── submit.sh ├── tools.py ├── train3_bin.py ├── train3_new_dup.py └── trainLenet.py /README.md: -------------------------------------------------------------------------------- 1 | ### Neural Networks Regularization Through Class-wise Invariant Representation Learning. 2 | 3 | This repository contains the code of the paper `Neural Networks Regularization Through Class-wise Invariant Representation Learning. S.Belharbi, C.Chatelain, R.Hérault, S.Adam. 2017.`[ArXiv](https://arxiv.org/abs/1709.01867). 4 | 5 | *Please cite this paper if you use the code in this repository as part of a published research project.* 6 | 7 | Requirements: 8 | - Python (2.7). 9 | - Theano (0.9). 10 | - Numpy (1.13). 11 | - Keras (2.0). 12 | - Matplotlib (1.2) 13 | - Yaml (3.10). 14 | 15 | To run this code, you need to uncompress the MNIST dataset: 16 | ```sh 17 | $ unzip data/mnist.pkl.zip -d data/ 18 | $ unzip data/mnist_bin17.pkl.zip -d data/ 19 | ``` 20 | 21 | To generate *mnist-noise* and *mnist-img*, please see the file `mnist_manip.py`. 22 | 23 | The folder `config_yaml` contains [yaml](http://www.yaml.org/start.html) files to configure an experiment. For instance, this is the content of the yaml file to run an experiment using an mlp with 3 hidden layers: 24 | ```yaml 25 | corrupt_input_l: 0.0 26 | debug_code: false 27 | extreme_random: true 28 | h_ind: [false, false, true, false] 29 | h_w: 0.0 30 | hint: true 31 | max_epochs: 400 32 | model: train3_new_dup 33 | nbr_sup: 1000 34 | norm_gh: false 35 | norm_gsup: false 36 | repet: 0 37 | run: 0 38 | start_corrupting: 0 39 | start_hint: 110 40 | use_batch_normalization: [false, false, false, false] 41 | use_sparsity: false 42 | use_sparsity_in_pred: false 43 | use_unsupervised: false 44 | ``` 45 | To run this experiment on a GPU: 46 | ```sh 47 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train3_new_dup.py train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml 48 | ``` 49 | 50 | To use [Slurm](https://slurm.schedmd.com/), see the folder `jobs`. 
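As a quick sanity check of a configuration and of the uncompressed data, the following minimal sketch (not one of the training scripts) loads a yaml file from `config_yaml` and the MNIST pickle. It assumes the archive extracts to `data/mnist.pkl` and that the pickle follows the usual `(train, valid, test)` layout of `(images, labels)` pairs; the keys actually consumed are defined in `train3_new_dup.py` and `trainLenet.py`:

```python
# Minimal sketch (Python 2.7, as required above); not one of the training
# scripts. Assumes data/mnist.pkl exists after unzipping and holds the
# standard (train, valid, test) split of (images, labels) pairs.
import yaml
import cPickle as pkl

cfg_path = ("config_yaml/train3_new_dup_0_1000_3_0_0_0_0_0_"
            "False_False_False_False_False_110.yaml")
with open(cfg_path) as f:
    config = yaml.load(f)

print "model:", config["model"]
print "supervised samples:", config["nbr_sup"]
print "hint applied to layers (h_ind):", config["h_ind"]
print "epoch at which the hint starts:", config["start_hint"]

with open("data/mnist.pkl", "rb") as f:
    (x_tr, y_tr), (x_vl, y_vl), (x_ts, y_ts) = pkl.load(f)
print "train/valid/test sizes:", x_tr.shape[0], x_vl.shape[0], x_ts.shape[0]
```

The actual training entry points remain `train3_new_dup.py`, `train3_bin.py` and `trainLenet.py`, launched with the `THEANO_FLAGS` command shown above.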
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/__init__.py -------------------------------------------------------------------------------- /ae.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | import theano 4 | import theano.tensor as T 5 | from theano.tensor.shared_randomstreams import RandomStreams 6 | 7 | from basic_layer import Layer 8 | from non_linearities import NonLinearity, CostType, relu 9 | import numpy as np 10 | import cPickle as pkl 11 | 12 | from collections import OrderedDict 13 | 14 | theano.config.warn.subtensor_merge_bug = False 15 | 16 | 17 | class AEHiddenLayer(Layer): 18 | def __init__(self, 19 | input, 20 | n_in, 21 | n_out, 22 | n_in_dec=None, 23 | n_out_dec=None, 24 | W=None, 25 | b=None, 26 | num_pieces=1, 27 | bhid=None, 28 | activation=T.nnet.sigmoid, 29 | sparse_initialize=False, 30 | tied_weights=True, 31 | rng=None): 32 | """ 33 | Typical hidden layer for an auto-encoder: The units are fully connected 34 | and have sigmoidal activation function. Weight matrix (W) is of shape 35 | (n_in, n_out) and the bias vector (b) is of shape(n_out,). 36 | 37 | Hidden units activation is given by: sigmoid(dot(input, w)+ b) 38 | 39 | :type rng: numpy.random.RandomState 40 | :param rng: a random number generator used to initiaze the weights. 41 | 42 | :type input: theano.tensor.dmatrix 43 | :param input: a symbolic tensor of shape (n_examples, n_in) 44 | 45 | :type n_in: int 46 | :param n_in: dimension of the input 47 | 48 | :type n_out: int 49 | :param n_out: number of hidden units 50 | 51 | :type activation: theano.Op or function 52 | :param activation: Non linearity to be applied in the hidden layer. 53 | """ 54 | if rng is None: 55 | rng = np.random.RandomState() 56 | 57 | super(AEHiddenLayer, self).__init__( 58 | input=input, 59 | input1=None, 60 | input2=None, 61 | input_vl=None, 62 | n_in=n_in, 63 | n_out=n_out, 64 | num_pieces=num_pieces, 65 | activation=activation, 66 | sparse_initialize=sparse_initialize, 67 | rng=rng) 68 | 69 | self.reset_layer() 70 | 71 | if W is not None: 72 | self.W = W 73 | 74 | if b is not None: 75 | self.b = b 76 | 77 | if bhid is not None: 78 | self.b_prime = bhid 79 | else: 80 | if n_in_dec is not None: 81 | b_values = np.zeros((n_out_dec), dtype=theano.config.floatX) 82 | else: 83 | b_values = np.zeros( 84 | (self.n_in/num_pieces), dtype=theano.config.floatX) 85 | 86 | self.b_prime = theano.shared(value=b_values, name="b_prime") 87 | 88 | if tied_weights: 89 | self.W_prime = self.W.T 90 | else: 91 | if n_in_dec is not None and n_out_dec is not None: 92 | W_values = np.asarray( 93 | self.rng.normal(loc=0., 94 | scale=0.005, 95 | size=(n_out_dec, n_in_dec)), 96 | dtype=theano.config.floatX) 97 | else: 98 | if self.activation == theano.tensor.tanh: 99 | born = np.sqrt(6. / (self.n_in + self.n_out)) 100 | else: 101 | born = 4 * np.sqrt(6. 
/ (self.n_in + self.n_out)) 102 | W_values = np.asarray( 103 | self.rng.uniform( 104 | low=-born, 105 | high=born, 106 | size=(self.n_out, self.n_in)), 107 | dtype=theano.config.floatX) 108 | 109 | self.W_prime = theano.shared(value=W_values, name='W_prime', 110 | borrow=True) 111 | self.params += [self.W_prime] 112 | 113 | self.params += [self.b_prime] 114 | self.setup_outputs(input) 115 | 116 | def setup_outputs(self, input): 117 | lin_output = T.dot(input, self.W) + self.b 118 | self.output = ( 119 | lin_output if self.activation is None 120 | else self.activation(lin_output)) 121 | 122 | def get_outputs(self, input): 123 | self.setup_outputs(input) 124 | return self.output 125 | 126 | 127 | class Autoencoder(object): 128 | """ 129 | Typical implementation of an autoencoder. 130 | """ 131 | def __init__( 132 | self, 133 | input, 134 | nvis, 135 | nhid=None, 136 | nvis_dec=None, 137 | nhid_dec=None, 138 | rnd=None, 139 | bhid=None, 140 | cost_type=CostType.MeanSquared, 141 | momentum=1, 142 | num_pieces=1, 143 | L2_reg=-1, 144 | L1_reg=-1, 145 | sparse_initialize=False, 146 | nonlinearity=NonLinearity.TANH, 147 | W=None, 148 | b=None, 149 | bvis=None, 150 | tied_weights=True, 151 | reverse=False): 152 | 153 | assert reverse is False 154 | self.input = input 155 | self.nvis = nvis 156 | self.nhid = nhid 157 | self.bhid = bhid 158 | self.bvis = bvis 159 | self.momentum = momentum 160 | self.nonlinearity = nonlinearity 161 | self.tied_weights = tied_weights 162 | self.gparams = None 163 | self.reverse = reverse 164 | self.activation = self.get_non_linearity_fn() 165 | self.catched_params = {} 166 | 167 | if cost_type == CostType.MeanSquared: 168 | self.cost_type = CostType.MeanSquared 169 | elif cost_type == CostType.CrossEntropy: 170 | self.cost_type = CostType.CrossEntropy 171 | 172 | if rnd is None: 173 | self.rnd = np.random.RandomState(1231) 174 | else: 175 | self.rnd = rnd 176 | 177 | self.srng = RandomStreams(seed=1231) 178 | 179 | self.hidden = AEHiddenLayer(input=input, 180 | n_in=nvis, 181 | n_out=nhid, 182 | num_pieces=num_pieces, 183 | n_in_dec=nvis_dec, 184 | W=W, 185 | b=b, 186 | n_out_dec=nhid_dec, 187 | activation=self.activation, 188 | tied_weights=tied_weights, 189 | sparse_initialize=sparse_initialize, 190 | rng=rnd) 191 | 192 | self.params = self.hidden.params 193 | 194 | self.sparse_initialize = sparse_initialize 195 | 196 | self.L1_reg = L1_reg 197 | self.L2_reg = L2_reg 198 | 199 | self.L1 = 0 200 | self.L2 = 0 201 | 202 | if input is not None: 203 | self.x = input 204 | else: 205 | self.x = T.matrix('x_input', dtype=theano.config.floatX) 206 | 207 | def set_regularization_l1(self, L1_reg): 208 | if L1_reg != -1: 209 | self.L1 += abs(self.hidden.W).sum() 210 | if not self.tied_weights: 211 | self.L1 += abs(self.hidden.W_prime).sum() 212 | 213 | def set_regularization_l2(self, L2_reg): 214 | if L2_reg != -1: 215 | self.L2 += (self.hidden.W_prime**2).sum() 216 | if not self.tied_weights: 217 | self.L2 += (self.hidden.W**2).sum() 218 | 219 | def catch_params(self): 220 | for param in self.params: 221 | self.catched_params[param.name] = param.get_value() 222 | 223 | def nonlinearity_fn(self, d_in=None, recons=False): 224 | if self.nonlinearity == NonLinearity.SIGMOID: 225 | return T.nnet.sigmoid(d_in) 226 | elif self.nonlinearity == NonLinearity.RELU and not recons: 227 | return T.maximum(d_in, 0) 228 | elif self.nonlinearity == NonLinearity.RELU and recons: 229 | return T.nnet.softplus(d_in) 230 | elif self.nonlinearity == NonLinearity.TANH: 231 | return T.tanh(d_in) 232 
| elif self.nonlinearity is None: 233 | return d_in 234 | 235 | def get_non_linearity_fn(self): 236 | if self.nonlinearity == NonLinearity.SIGMOID: 237 | return T.nnet.sigmoid 238 | elif self.nonlinearity == NonLinearity.RELU: 239 | return relu 240 | elif self.nonlinearity == NonLinearity.TANH: 241 | return T.tanh 242 | elif self.nonlinearity is None: 243 | return None 244 | 245 | def encode(self, x_in=None, center=True): 246 | if x_in is None: 247 | x_in = self.x 248 | 249 | act = self.nonlinearity_fn(T.dot(x_in, self.hidden.W) + self.hidden.b) 250 | if center: 251 | act = act - act.mean(0) 252 | return act 253 | 254 | def encode_linear(self, x_in=None): 255 | if x_in is None: 256 | x_in = self.x_in 257 | 258 | lin_out = T.dot(x_in, self.hidden.W) + self.hidden.b 259 | return self.nonlinearity_fn(lin_out), lin_out 260 | 261 | def decode(self, h): 262 | return self.nonlinearity_fn( 263 | T.dot(h, self.hidden.W_prime) + self.hidden.b_prime) 264 | 265 | def get_rec_cost(self, x_rec, eyes=False): 266 | """ 267 | Returns the reconstruction cost. 268 | """ 269 | if self.cost_type == CostType.MeanSquared: 270 | return T.mean(((self.x - x_rec)**2).sum(axis=1)) 271 | elif self.cost_type == CostType.CrossEntropy: 272 | return T.mean( 273 | (T.nnet.binary_crossentropy(x_rec, self.x)).mean(axis=1)) 274 | 275 | def get_rec_cost_face(self, x_rec): 276 | """ 277 | Returns the reconstruction cost. 278 | """ 279 | d_eyes = ( 280 | (self.x[:, 37] - self.x[:, 46])**2 + 281 | (self.x[:, 37] - self.x[:, 46])**2).T 282 | if self.cost_type == CostType.MeanSquared: 283 | return T.mean(((self.x - x_rec)**2).sum(axis=1) / d_eyes) 284 | elif self.cost_type == CostType.CrossEntropy: 285 | return T.mean( 286 | (T.nnet.binary_crossentropy( 287 | x_rec, self.x)).mean(axis=1) / d_eyes) 288 | 289 | def kl_divergence(self, p, p_hat): 290 | return p * T.log(p) - T.log(p_hat) + (1-p) * T.log(1-p) -\ 291 | (1-p_hat) * T.log(1-p_hat) 292 | 293 | def sparsity_penality(self, h, sparsity_level=0.05, sparse_reg=1e-3, 294 | batch_size=-1): 295 | if batch_size == -1 or batch_size == 0: 296 | raise Exception("Invalid batch size") 297 | 298 | sparsity_level = T.extra_ops.repeat(sparsity_level, self.nhid) 299 | sparsity_penality = 0 300 | avg_act = h.mean(axis=0) 301 | kl_div = self.kl_divergence(sparsity_level, avg_act) 302 | sparsity_penality = sparse_reg * kl_div.sum() 303 | return sparsity_penality 304 | 305 | def act_grads(self, inputs): 306 | h, acts = self.encode_linear(inputs) 307 | h_grad = T.grad(h.sum(), acts) 308 | return (h, h_grad) 309 | 310 | def jacobian_h_x(self, inputs): 311 | h, act_grad = self.act_grads(inputs) 312 | jacobian = self.hidden.W * act_grad.dimshuffle(0, 'x', 1) 313 | return (h, T.reshape(jacobian, newshape=(self.nhid, self.nvis))) 314 | 315 | def compute_jacobian_h_x(self, inputs): 316 | inputs = theano.shared(inputs.flatten()) 317 | h = self.encode(inputs) 318 | # see later 319 | # h = h.faltten() 320 | # inputs = inputs.flatten() 321 | # inputs = T.reshape(inputs, newshape=(self.nvis)) 322 | J = theano.gradient.jacobian(h, inputs) 323 | return h, J 324 | 325 | def sample_one_step(self, x, sigma): 326 | # h, J_t = self.jacobian_h_x(x) 327 | h, J_t = self.compute_jacobian_h_x(x) 328 | eps = self.srng.normal(avg=0, size=(self.nhid, 1), std=sigma) 329 | jacob_w_eps = T.dot(J_t.T, eps) 330 | delta_h = T.dot(J_t, jacob_w_eps) 331 | perturbed_h = h + delta_h.T 332 | x = self.decode(perturbed_h) 333 | return x 334 | 335 | def sample_scan(self, x, sigma, n_steps, samples): 336 | # Enable on-the-fly graph 
computations 337 | # theano.config.compute_test_value = "raise" 338 | in_val = T.fmatrix("input_values") 339 | # in_val.tag.test_value = np.asarray( 340 | # np.random.rand(1, 784), dtype=theano.config.floatX) 341 | s_sigma = T.fscalr("sigma_values") 342 | # s_sigma = np.asarray( 343 | # np.random.rand(1), dtype=theano.config.floatX) 344 | mode = "FAST_RUN" 345 | values, updates = theano.scan(fn=self.sample_one_step, 346 | outputs_info=in_val, 347 | non_sequences=s_sigma, 348 | n_steps=n_steps, 349 | mode=mode) 350 | ae_sampler = theano.function(inputs=[in_val, s_sigma], 351 | outputs=values[-1], 352 | updates=updates) 353 | samples = ae_sampler(x, sigma) 354 | return samples 355 | 356 | def sample_old(self, x, sigma, n_steps): 357 | # Enable on-the-fly graph computations 358 | # theano.config.compute_test_value = "raise" 359 | # in_val = T.fmatrix('input_values") 360 | # in_val.tag.test_value = np.asarray( 361 | # np.random.rand(1, 784), dtype=theano.config.floatX) 362 | # s_sigma = T.fscalar("sigma_value") 363 | # s_sigma = np.asarray( 364 | # np.random.rand(1), dtype=theano.config.floatX) 365 | # mode = "FAST_RUN" 366 | samples = [] 367 | sample = x 368 | samples.append(x) 369 | for i in xrange(n_steps): 370 | print "Sample %d ..." % i 371 | sampler = self.sample_one_step(sample, sigma) 372 | sample = sampler.eval() 373 | samples.append(sample) 374 | return samples 375 | 376 | def get_sgd_updates(self, learning_rate, lr_scaler=1.0, batch_size=1, 377 | sparsity_level=-1, sparse_reg=-1, x_in=None): 378 | h = self.encode(x_in) 379 | x_rec = self.decode(h) 380 | cost = self.get_rec_cost(x_rec) 381 | 382 | if self.L1_reg != -1 and self.L1_reg is not None: 383 | cost += self.L1_reg * self.L1 384 | 385 | if self.L2_reg != -1 and self.L2_reg is not None: 386 | cost += self.L2_reg * self.L2 387 | 388 | if sparsity_level != -1 and sparse_reg != -1: 389 | sparsity_penal = self.sparsity_penality( 390 | h, sparsity_level, sparse_reg, batch_size) 391 | cost += sparsity_penal 392 | 393 | self.gparams = T.grad(cost, self.params) 394 | updates = OrderedDict({}) 395 | for param, gparam in zip(self.params, self.gparams): 396 | updates[param] = self.momentum * param - lr_scaler * \ 397 | learning_rate * gparam 398 | return (cost, updates, h, x_rec) 399 | 400 | def get_train_cost(self, batch_size=1, sparsity_level=-1, sparse_reg=-1, 401 | x_in=None, face=False): 402 | h = self.encode(x_in) 403 | x_rec = self.decode(h) 404 | cost = self.get_rec_cost(x_rec) 405 | 406 | if self.L1_reg != -1 and self.L1_reg is not None: 407 | cost += self.L1_reg * self.L1 408 | 409 | if self.L2_reg != -1 and self.L2_reg is not None: 410 | cost += self.L2_reg * self.L2 411 | 412 | if sparsity_level != -1 and sparse_reg != -1: 413 | sparsity_penal = self.sparsity_penality( 414 | h, sparsity_level, sparse_reg, batch_size) 415 | cost += sparsity_penal 416 | 417 | return (cost, h, x_rec) 418 | 419 | def get_train_cost_clean(self): 420 | h = self.encode(self.x) 421 | x_rec = self.decode(h) 422 | cost = self.get_rec_cost(x_rec) 423 | 424 | cost += self.L1 425 | cost += self.L2 426 | 427 | return cost 428 | 429 | def save_params(self, weights_file, catched=False): 430 | """Save the model's parameters.""" 431 | f_dump = open(weights_file, "w") 432 | params_vls = {} 433 | if catched: 434 | if self.catched_params != {}: 435 | params_vls = self.catched_params 436 | else: 437 | raise ValueError( 438 | "You asked to save catched params," + 439 | "but you didn't catch any!!!!!!!") 440 | else: 441 | for param in self.params: 442 | 
params_vls[param.name] = param.get_value() 443 | pkl.dump(params_vls, f_dump, protocol=pkl.HIGHEST_PROTOCOL) 444 | f_dump.close() 445 | 446 | def set_params_vals(self, weights_file): 447 | """Set the values of the parameters.""" 448 | with open(weights_file, 'r') as f: 449 | params_vls = pkl.load(f) 450 | for param in self.params: 451 | param.set_value(params_vls[param.name]) 452 | 453 | def fit(self, 454 | data=None, 455 | learning_rate=0.1, 456 | batch_size=100, 457 | n_epochs=20, 458 | lr_scalar=0.998, 459 | weights_file="out/ae_weights_mnist.npy"): 460 | """ 461 | Fit the data to the autoencoder (training). 462 | """ 463 | if data is None: 464 | raise Exception("Data can't be empty.") 465 | 466 | index = T.lscalar("index") 467 | data_shared = theano.shared( 468 | np.asarray(data, dtype=theano.config.floatX)) 469 | n_batches = data.shape[0] / batch_size 470 | (cost, updates) = self.get_sgd_updates( 471 | learning_rate, lr_scalar, batch_size) 472 | train_ae = theano.function( 473 | [index], cost, updates=updates, 474 | givens={ 475 | self.x: data_shared[index*batch_size: (index+1)*batch_size]}) 476 | 477 | print "Start training the ae." 478 | ae_costs = [] 479 | 480 | for epoch in xrange(n_epochs): 481 | print "Training at epoch %d" % epoch 482 | cost_one_epoch = [] 483 | for batch_index in xrange(n_batches): 484 | cost_one_epoch.append(train_ae(batch_index)) 485 | print "Training at epoch %d, %f" % (epoch, np.mean(cost_one_epoch)) 486 | ae_costs.append(np.mean(cost_one_epoch)) 487 | 488 | print "Saving files ..." 489 | self.save_params(weights_file) 490 | return ae_costs 491 | -------------------------------------------------------------------------------- /basic_layer.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | from __future__ import division 4 | import numpy as np 5 | import theano 6 | from theano import tensor as T 7 | import warnings 8 | 9 | 10 | from theano.tensor.signal import pool 11 | from theano.tensor.nnet import conv2d 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | from non_linearities import NonLinearity, CostType, relu, get_non_linearity_str 14 | 15 | 16 | def sharedX_value(value, name=None, borrow=None, dtype=None): 17 | """Share a single value after transforming it to floatX type. 18 | 19 | value: a value 20 | name: variable name (str) 21 | borrow: boolean 22 | dtype: the type of the value when shared. default: theano.config.floatX 23 | """ 24 | if dtype is None: 25 | dtype = theano.config.floatX 26 | return theano.shared( 27 | theano._asarray(value, dtype=dtype), name=name, borrow=borrow) 28 | 29 | 30 | class Layer(object): 31 | """ 32 | A general base layer class for neural network. 33 | for training, the layer takes a pair of samples (input1, input2). 34 | input1 and input2 belong to the same class. 35 | input: sample for the supervised part. 36 | input1: first samples 37 | input2: second sample 38 | intended_to_be_corrupted: boolean. If True, we create a corruptor 39 | for the input. This indicates that may be at some point in the 40 | future the inputs of this layer may be corrupted. 41 | corrupt_input_l: Float. If !=0., only the input1 and input2 will be 42 | corrupted. 43 | NOTE: 44 | Basically, only the input of the first layer is corrupted. There is 45 | no interest/reason in corrupting the intermediate inputs. 
46 | """ 47 | def __init__(self, 48 | input, 49 | input1, 50 | input2, 51 | input_vl, 52 | n_in, 53 | n_out, 54 | activation=T.nnet.sigmoid, 55 | sparse_initialize=False, 56 | num_pieces=1, 57 | non_zero_units=25, 58 | rng=None, 59 | hint="l1mean", 60 | use_hint=False, 61 | intended_to_be_corrupted=False, 62 | corrupt_input_l=0., 63 | use_sparsity=False, 64 | use_sparsity_in_pred=False, 65 | use_batch_normalization=False): 66 | 67 | assert hint is not None 68 | self.num_pieces = num_pieces 69 | self.corrupt_input_l = sharedX_value(corrupt_input_l, name="cor_l") 70 | self.intended_to_be_corrupted = intended_to_be_corrupted 71 | self.rng = np.random.RandomState(123) 72 | self.theano_rng = RandomStreams(self.rng.randint(2 ** 30)) 73 | self.input = input 74 | self.input1 = input1 # x1 75 | self.input2 = input2 # x2 76 | self.input_vl = input_vl # bn input used for validation. 77 | self.n_in = n_in 78 | self.n_out = n_out 79 | self.rng = rng 80 | self.sparse_initialize = sparse_initialize 81 | self.non_zero_units = non_zero_units 82 | self.W = None 83 | self.b = None 84 | self.sparser = None 85 | self.activation = activation 86 | self.hint = hint 87 | self.use_hint = use_hint 88 | self.use_sparsity = use_sparsity 89 | self.use_sparsity_in_pred = use_sparsity_in_pred 90 | self.use_batch_normalization = use_batch_normalization 91 | self.bn = None 92 | 93 | def reset_layer(self): 94 | """ 95 | initailize the layer's parameters to random. 96 | """ 97 | if self.W is None: 98 | if self.sparse_initialize: 99 | W_values = self.sparse_initialize_weights() 100 | else: 101 | if self.activation == theano.tensor.tanh: 102 | born = np.sqrt(6. / (self.n_in + self.n_out)) 103 | else: 104 | born = 4 * np.sqrt(6. / (self.n_in + self.n_out)) 105 | W_values = np.asarray(self.rng.uniform( 106 | low=-born, 107 | high=born, 108 | size=(self.n_in, self.n_out)), 109 | dtype=theano.config.floatX) 110 | 111 | self.W = theano.shared(value=W_values, name='W', borrow=True) 112 | 113 | if self.b is None: 114 | b_values = np.zeros(int(self.n_out/self.num_pieces), 115 | dtype=theano.config.floatX) 116 | self.b = theano.shared(value=b_values, name='b', borrow=True) 117 | 118 | if self.sparser is None: 119 | s_values = np.ones( 120 | int(self.n_out/self.num_pieces), dtype=theano.config.floatX) 121 | self.sparser = theano.shared(value=s_values, name='sparser', 122 | borrow=True) 123 | # The layer parameters 124 | self.params = [self.W, self.b] 125 | 126 | def get_corrupted_input(self, input): 127 | """This function keeps 1-self.corruption_input_l entries of the inputs 128 | the same and zero-out randomly selected subset of size 129 | self.coruption_input_l. 130 | 131 | """ 132 | return self.theano_rng.binomial(size=input.shape, n=1, 133 | p=1 - self.corrupt_input_l, 134 | dtype=theano.config.floatX) * input 135 | 136 | def sparse_initialization_weights(self): 137 | """ 138 | Implement the sparse initialization technique as described in 139 | J. Marten, 'Deep learning via Hessian-free optimization', ICML, 2010. 
140 | http://icml2010.haifa.il.ibm.com/papers/458.pdf 141 | """ 142 | W = [] 143 | mu, sigma = 0, 1/self.non_zero_units 144 | 145 | for i in xrange(self.n_in): 146 | row = np.zeros(self.n_out) 147 | non_zeros = self.rng.normal(mu, sigma, self.non_zero_units) 148 | # non_zeros /= non_zeros.sum() 149 | non_zero_idxs = self.rng.permutation( 150 | self.n_out)[0:self.non_zero_units] 151 | for j in xrange(self.non_zero_units): 152 | row[non_zero_idxs[j]] = non_zeros[j] 153 | W.append(row) 154 | W = np.asarray(W, dtype=theano.config.floatX) 155 | return W 156 | -------------------------------------------------------------------------------- /config_yaml/train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml: -------------------------------------------------------------------------------- 1 | corrupt_input_l: 0.0 2 | debug_code: false 3 | extreme_random: true 4 | h_ind: [false, false, true, false] 5 | h_w: 0.0 6 | hint: true 7 | max_epochs: 400 8 | model: train3_new_dup 9 | nbr_sup: 1000 10 | norm_gh: false 11 | norm_gsup: false 12 | repet: 0 13 | run: 0 14 | start_corrupting: 0 15 | start_hint: 110 16 | use_batch_normalization: [false, false, false, false] 17 | use_sparsity: false 18 | use_sparsity_in_pred: false 19 | use_unsupervised: false 20 | -------------------------------------------------------------------------------- /config_yaml/trainLenet_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml: -------------------------------------------------------------------------------- 1 | corrupt_input_l: 0.0 2 | debug_code: false 3 | extreme_random: true 4 | h_ind: [false, false, true, false] 5 | h_w: 0.0 6 | hint: true 7 | max_epochs: 400 8 | model: trainLenet 9 | nbr_sup: 1000 10 | norm_gh: false 11 | norm_gsup: false 12 | repet: 0 13 | run: 0 14 | start_corrupting: 0 15 | start_hint: 110 16 | use_batch_normalization: [false, false, false, false] 17 | use_sparsity: false 18 | use_sparsity_in_pred: false 19 | use_unsupervised: false 20 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/data/__init__.py -------------------------------------------------------------------------------- /data/cifar.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.datasets import cifar10 3 | import numpy as np 4 | import os 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import cPickle as pkl 8 | 9 | 10 | (x_train, y_train), (x_test, y_test) = cifar10.load_data() 11 | y_test = y_test.reshape(y_test.size) 12 | y_train = y_train.reshape(y_train.size) 13 | cl = np.unique(y_train) 14 | vl_ind = [] 15 | for i in cl: 16 | ind = np.argwhere(y_train == i) 17 | ind = ind.reshape(ind.size) 18 | for k in range(1000): 19 | np.random.shuffle(ind) 20 | vl_ind.extend(ind[:int(len(ind)/10)]) 21 | # debug 22 | # path = "./CIFAR10/" + str(i) + '/' 23 | # if not os.path.exists(path): 24 | # os.makedirs(path) 25 | # for k in ind: 26 | # fig = plt.figure() 27 | # plt.imshow(x_train[k]) 28 | # fig.savefig(path + str(k) + ".png") 29 | 30 | x_vl = x_train[vl_ind] 31 | y_vl = y_train[vl_ind] 32 | ind_tr = [] 33 | for i in range(x_train.shape[0]): 34 | if i not in vl_ind: 35 | ind_tr.append(i) 36 | for i in range(10000): 37 | np.random.shuffle(ind_tr) 38 | 39 | new_x_train = x_train[ind_tr] 40 | 
new_y_train = y_train[ind_tr] 41 | 42 | stuff = [(x_train, y_train), (x_vl, y_vl), (x_test, y_test)] 43 | for e in stuff: 44 | print e[0].shape, e[1].shape 45 | 46 | with open("cifar10.pkl", "w") as f: 47 | pkl.dump(stuff, f) 48 | -------------------------------------------------------------------------------- /data/mnist.pkl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/data/mnist.pkl.zip -------------------------------------------------------------------------------- /data/mnist_bin17.pkl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/data/mnist_bin17.pkl.zip -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | from __future__ import division 4 | 5 | import pickle as pkl 6 | import math 7 | import numpy as np 8 | 9 | 10 | class Dataset(object): 11 | def __init__(self, is_binary=False): 12 | self.is_binary = is_binary 13 | 14 | # Examples 15 | self.Xtrain = None 16 | self.Xtest = None 17 | 18 | # Labels 19 | self.Ytrain = None 20 | self.Ytest = None 21 | 22 | self.Xtrain_pres = None 23 | self.Xtest_pres = None 24 | 25 | self.sparsity = 0.0 26 | self.n_examples = 0 27 | 28 | def _get_data(self, data_path): 29 | if data_path.endswith("pkl") or data_path.endswith("pickle"): 30 | data = pkl.load(open(data_path, "rb")) 31 | else: 32 | data = np.load(data_path) 33 | return data 34 | 35 | def binarize_labels(self, labels=None): 36 | # Largest label is for the images without different objects. 37 | last_lbl = np.max(labels) 38 | binarized_lbls = [] 39 | if self.is_binary: 40 | for label in labels: 41 | if label == last_lbl: 42 | binarized_lbls.append(0) 43 | else: 44 | binarized_lbls.append(1) 45 | return binarized_lbls 46 | 47 | def setup_dataset(self, data_path=None, train_split_scale=0.0): 48 | data = self._get_data(data_path) 49 | self.n_examples = data[0].shape[0] 50 | ntrain = math.floor(self.n_examples * train_split_scale) 51 | 52 | self.Xtrain = data[0][:ntrain] 53 | self.Xtrain_pres = data[2][:ntrain] 54 | self.Xtest = data[0][ntrain:] 55 | self.Xtest_pres = data[2][ntrain:] 56 | 57 | if train_split_scale != 0.0: 58 | self.Ytrain = np.array( 59 | self.binarize_labels(data[1][:ntrain].flatten()) 60 | if self.is_binary else data[1][:ntrain].flatten()) 61 | 62 | if train_split_scale != 1.0: 63 | self.Ytest = np.array( 64 | self.binarize_labels(data[1][ntrain:].flatten()) 65 | if self.is_binary else data[1][ntrain:].flatten()) 66 | 67 | def comp_sparsity(self): 68 | num_sparse_els = 0 69 | for el in self.Xtrain.flatten(): 70 | if el == 0: 71 | num_sparse_els += 1 72 | for el in self.Xtest.flatten(): 73 | if el == 0: 74 | num_sparse_els += 1 75 | self.sparsity = (num_sparse_els / self.n_examples) 76 | return self.sparsity 77 | -------------------------------------------------------------------------------- /exps/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the output of the experiments. 
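Once runs have finished, the results written here can be aggregated with `filterit.py` (next file), which takes the experiments folder and the number of supervised samples as arguments, e.g. `python filterit.py exps/ 1000`; it averages the validation and test errors over the matching runs after discarding the best and the worst one.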
-------------------------------------------------------------------------------- /filterit.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | from operator import eq 4 | import numpy as np 5 | import sys 6 | 7 | 8 | def get_vl_tst(f): 9 | with open(f, 'r') as ff: 10 | cont = ff.readlines() 11 | cont = [x.strip() for x in cont] 12 | # vl: 13 | vl = float(cont[1].split(":")[1].split(" ")[0]) 14 | tst = float(cont[2].split(":")[1].split(" ")[0]) 15 | return [vl, tst] 16 | 17 | 18 | def get_all_yamls_perfs(folder, h_ind=[1, 1, 1, 1], hint=[True, False], 19 | norm_gh=True, 20 | norm_gsup=True, nbr_sup=1000): 21 | """Collect all the yaml files and the performance files.""" 22 | h_ind_n = [str(i) for i in h_ind] 23 | h_ind = [bool(k) for k in h_ind] 24 | path_to_exps = folder 25 | list_exps = next(os.walk(path_to_exps))[1] 26 | list_exps = [e for e in list_exps if e.startswith("hint") or 27 | e.startswith("no")] 28 | list_exps = [path_to_exps + e for e in list_exps] 29 | # Start filtering 30 | filtered_list = [] 31 | list_start_hint = [] 32 | for d in list_exps: 33 | # Get the yaml file 34 | for file in os.listdir(d): 35 | if file.endswith(".yaml"): 36 | yaml_file = os.path.join(d, file) 37 | # print yaml_file 38 | # Satrt filtering ... 39 | # Read the yaml file 40 | with open(yaml_file, 'r') as y: 41 | yaml_cont = yaml.load(y) 42 | if yaml_cont["hint"] not in hint: 43 | continue 44 | if yaml_cont["norm_gh"] != norm_gh: 45 | continue 46 | if yaml_cont["norm_gsup"] != norm_gsup: 47 | continue 48 | if yaml_cont["nbr_sup"] != nbr_sup: 49 | continue 50 | if not all(map(eq, yaml_cont["h_ind"], h_ind)): 51 | continue 52 | # Get the per file. 53 | for file in os.listdir(d): 54 | if file.endswith(".txt"): 55 | perf_file = os.path.join(d, file) 56 | filtered_list.append(perf_file) 57 | list_start_hint.append(yaml_cont["start_hint"]) 58 | # No that you are done collecting the appropriate files. 59 | # COmpute the mean+-std 60 | vl, tst = [], [] 61 | for file in filtered_list: 62 | [v, t] = get_vl_tst(file) 63 | vl.append(v) 64 | tst.append(t) 65 | # remove the largest and smallest value (test error) 66 | comb = zip(vl, tst, list_start_hint) 67 | sorted_comb = sorted(comb, key=lambda tup: tup[1]) 68 | print "(vl, tst, start_hint)", len(comb) 69 | for el in sorted_comb: 70 | print el 71 | # remove the best and the worst. 72 | sorted_comb.pop(0) 73 | sorted_comb.pop(-1) 74 | vl, tst, list_start_hint = zip(*sorted_comb) 75 | # back to original lists. 
76 | m_vl = np.mean(vl) 77 | std_vl = np.std(vl) 78 | m_tst = np.mean(tst) 79 | std_tst = np.std(tst) 80 | print str(len(filtered_list)), "_".join(h_ind_n), " norm_gh:",\ 81 | str(norm_gh),\ 82 | " norm_gsup:", str(norm_gsup),\ 83 | " vl:", str(m_vl), "+-", str(std_vl), " tst:", str(m_tst), "+-",\ 84 | str(std_tst), "\n" 85 | 86 | inds = [[0, 0, 1, 0]] 87 | norm_gh = False 88 | norm_gsup = False 89 | hint = [True, False] 90 | nbr_sup = int(sys.argv[2]) 91 | path_exps = str(sys.argv[1]) 92 | 93 | for e in inds: 94 | get_all_yamls_perfs(path_exps, h_ind=e, hint=hint, norm_gh=norm_gh, 95 | norm_gsup=norm_gsup, nbr_sup=nbr_sup) 96 | -------------------------------------------------------------------------------- /generate_exps.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def get_name_exp_from_yaml(d): 6 | name = "" 7 | name = str(d["run"]) + "_" 8 | name += str(d["nbr_sup"]) + "_" 9 | name += str(len(d["h_ind"]) - 1) + "_" 10 | name += "_".join([str(int(k)) for k in d["h_ind"]]) + "_" 11 | name += str(d["repet"]) + "_" 12 | name += str(d["hint"]) + "_" 13 | name += str(d["norm_gsup"]) + "_" 14 | name += str(d["norm_gh"]) + "_" 15 | name += str(d["debug_code"]) + "_" 16 | name += str(d["use_unsupervised"]) 17 | 18 | return name 19 | 20 | 21 | def save_file(exp, rep, max_rep): 22 | # grad normalization 23 | # conf_norm = [(1, 0), (0, 1), (1, 1)] 24 | conf_norm = [(0, 0)] 25 | for c in conf_norm: 26 | exp["norm_gh"] = bool(c[0]) 27 | exp["norm_gsup"] = bool(c[1]) 28 | if rep == max_rep - 1: 29 | exp["debug_code"] = True 30 | print "Just forces debuge to TRUE *********" 31 | exp["debug_code"] = False 32 | name = get_name_exp_from_yaml(exp) 33 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 34 | yaml.dump(exp, fyaml) 35 | f.write("python " + runner + " " + name + ".yaml \n") 36 | # Default 37 | nbr_layers = 3 38 | use_unsupervised = False 39 | exp = {"debug_code": False, 40 | "nbr_sup": 1000, 41 | "run": 45, 42 | "h_ind": [False for i in range(nbr_layers+1)], 43 | "use_batch_normalization": [False for i in range(nbr_layers+1)], 44 | "corrupt_input_l": 0., 45 | "start_corrupting": 0, 46 | "use_sparsity": False, 47 | "use_sparsity_in_pred": False, 48 | "max_epochs": 400, 49 | "hint": False, 50 | "extreme_random": True, 51 | "norm_gsup": False, 52 | "norm_gh": False, 53 | "repet": 0, 54 | "use_unsupervised": use_unsupervised, 55 | "h_w": 1., 56 | "start_hint": 5 57 | } 58 | nbr_sup_ = [1000, 3000, 5000, 50000] 59 | h_w_vls = [.0, .0, .0, .0] 60 | start_hint_vl = [2, 2, 1, 1] 61 | run = 0 62 | fold_exps = "config_yaml" 63 | bash_name = "job0.sh" 64 | f = open(bash_name, "w") 65 | f.write("#!/usr/bin/env bash \n") 66 | runner = "train3_new_dup.py" 67 | max_rep = 7 68 | for nbr, h_w, start_hint in zip(nbr_sup_, h_w_vls, start_hint_vl): 69 | for rep in range(max_rep): 70 | print rep 71 | exp["nbr_sup"] = nbr 72 | # we need one run for an MLP without hint. 
73 | if rep == 0: 74 | exp["debug_code"] = False 75 | else: 76 | exp["debug_code"] = False 77 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 78 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 79 | exp["use_batch_normalization"][-2] = False 80 | exp["hint"] = False 81 | exp["run"] = run 82 | exp["repet"] = rep 83 | exp["norm_gh"] = False 84 | exp["norm_gsup"] = False 85 | exp["max_epochs"] = 2000 86 | exp["start_hint"] = 0 87 | exp["h_w"] = h_w 88 | name = get_name_exp_from_yaml(exp) 89 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 90 | yaml.dump(exp, fyaml) 91 | f.write("python " + runner + " " + name + ".yaml \n") 92 | 93 | exp["max_epochs"] = 400 94 | exp["debug_code"] = False 95 | # ******* Train inly the layer before the output. 96 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 97 | exp["h_ind"][-2] = True 98 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 99 | exp["use_batch_normalization"][-2] = True 100 | exp["hint"] = True 101 | exp["run"] = run 102 | exp["repet"] = rep 103 | exp["start_hint"] = start_hint 104 | save_file(exp, rep, max_rep) 105 | continue 106 | # ***** 107 | # Exclusive layers 108 | for i in range(nbr_layers+1): 109 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 110 | exp["h_ind"][i] = True 111 | exp["hint"] = True 112 | exp["run"] = run 113 | exp["repet"] = rep 114 | save_file(exp, rep, max_rep) 115 | 116 | # From input to output 117 | # exp["h_ind"] = [False for k in range(nbr_layers+1)] 118 | # exp["h_ind"][0] = True 119 | # for kk in range(1, nbr_layers+1): 120 | # exp["h_ind"][kk] = True 121 | # save_file(exp, rep, max_rep) 122 | # From output to input 123 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 124 | exp["h_ind"][-1] = True 125 | for kk in range(-2, -(nbr_layers+2), -1): 126 | exp["h_ind"][kk] = True 127 | save_file(exp, rep, max_rep) 128 | f.close() 129 | os.system("chmod +x " + bash_name) 130 | -------------------------------------------------------------------------------- /generate_exps_lenet.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def get_name_exp_from_yaml(d): 6 | name = "lenet_" 7 | name += str(d["run"]) + "_" 8 | name += str(d["nbr_sup"]) + "_" 9 | name += str(len(d["h_ind"]) - 1) + "_" 10 | name += "_".join([str(int(k)) for k in d["h_ind"]]) + "_" 11 | name += str(d["repet"]) + "_" 12 | name += str(d["hint"]) + "_" 13 | name += str(d["norm_gsup"]) + "_" 14 | name += str(d["norm_gh"]) + "_" 15 | name += str(d["debug_code"]) + "_" 16 | name += str(d["use_unsupervised"]) 17 | 18 | return name 19 | 20 | 21 | def save_file(exp, rep, max_rep): 22 | # grad normalization 23 | # conf_norm = [(1, 0), (0, 1), (1, 1)] 24 | conf_norm = [(0, 0)] 25 | for c in conf_norm: 26 | exp["norm_gh"] = bool(c[0]) 27 | exp["norm_gsup"] = bool(c[1]) 28 | if rep == max_rep - 1: 29 | exp["debug_code"] = True 30 | print "Just forces debuge to TRUE *********" 31 | exp["debug_code"] = False 32 | name = get_name_exp_from_yaml(exp) 33 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 34 | yaml.dump(exp, fyaml) 35 | f.write("python " + runner + " " + name + ".yaml \n") 36 | # Default 37 | nbr_layers = 3 38 | use_unsupervised = False 39 | exp = {"debug_code": False, 40 | "nbr_sup": 1000, 41 | "run": 45, 42 | "h_ind": [False for i in range(nbr_layers+1)], 43 | "use_batch_normalization": [False for i in range(nbr_layers+1)], 44 | "corrupt_input_l": 0., 45 | "start_corrupting": 0, 46 | "use_sparsity": False, 
47 | "use_sparsity_in_pred": False, 48 | "max_epochs": 400, 49 | "hint": False, 50 | "extreme_random": True, 51 | "norm_gsup": False, 52 | "norm_gh": False, 53 | "repet": 0, 54 | "use_unsupervised": use_unsupervised, 55 | "h_w": 1., 56 | "start_hint": 5 57 | } 58 | nbr_sup_ = [1000, 3000, 5000, 50000] 59 | h_w_vls = [.0, .0, .0, .0] 60 | start_hint_vl = [1, 1, 1, 1] 61 | run = 0 62 | fold_exps = "config_yaml" 63 | bash_name = "joblenet.sh" 64 | f = open(bash_name, "w") 65 | f.write("#!/usr/bin/env bash \n") 66 | runner = "trainLenet.py" 67 | max_rep = 7 68 | for nbr, h_w, start_hint in zip(nbr_sup_, h_w_vls, start_hint_vl): 69 | for rep in range(max_rep): 70 | print rep 71 | exp["nbr_sup"] = nbr 72 | # we need one run for an MLP without hint. 73 | if rep == 0: 74 | exp["debug_code"] = False 75 | else: 76 | exp["debug_code"] = False 77 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 78 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 79 | exp["use_batch_normalization"][-2] = False 80 | exp["hint"] = False 81 | exp["run"] = run 82 | exp["repet"] = rep 83 | exp["norm_gh"] = False 84 | exp["norm_gsup"] = False 85 | exp["max_epochs"] = 2000 86 | exp["start_hint"] = 0 87 | print h_w 88 | exp["h_w"] = h_w 89 | name = get_name_exp_from_yaml(exp) 90 | # with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 91 | # yaml.dump(exp, fyaml) 92 | # f.write("python " + runner + " " + name + ".yaml \n") 93 | 94 | # The layer just before the softmax. 95 | exp["max_epochs"] = 400 96 | exp["debug_code"] = False 97 | # ******* Train inly the layer before the output. 98 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 99 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 100 | exp["use_batch_normalization"][-2] = True 101 | exp["h_ind"][-2] = True 102 | exp["hint"] = True 103 | exp["run"] = run 104 | exp["repet"] = rep 105 | exp["start_hint"] = start_hint 106 | save_file(exp, rep, max_rep) 107 | continue 108 | 109 | # The output of the last cnn layer. 110 | exp["max_epochs"] = 400 111 | exp["debug_code"] = False 112 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 113 | exp["h_ind"][-3] = True 114 | exp["hint"] = True 115 | exp["run"] = run 116 | exp["repet"] = rep 117 | save_file(exp, rep, max_rep) 118 | 119 | # The last two layers beofre the softmax. 
120 | exp["max_epochs"] = 400 121 | exp["debug_code"] = False 122 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 123 | exp["h_ind"][-2] = True 124 | exp["h_ind"][-3] = True 125 | exp["hint"] = True 126 | exp["run"] = run 127 | exp["repet"] = rep 128 | save_file(exp, rep, max_rep) 129 | continue 130 | # ***** 131 | # Exclusive layers 132 | for i in range(nbr_layers+1): 133 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 134 | exp["h_ind"][i] = True 135 | exp["hint"] = True 136 | exp["run"] = run 137 | exp["repet"] = rep 138 | save_file(exp, rep, max_rep) 139 | 140 | # From input to output 141 | # exp["h_ind"] = [False for k in range(nbr_layers+1)] 142 | # exp["h_ind"][0] = True 143 | # for kk in range(1, nbr_layers+1): 144 | # exp["h_ind"][kk] = True 145 | # save_file(exp, rep, max_rep) 146 | # From output to input 147 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 148 | exp["h_ind"][-1] = True 149 | for kk in range(-2, -(nbr_layers+2), -1): 150 | exp["h_ind"][kk] = True 151 | save_file(exp, rep, max_rep) 152 | f.close() 153 | os.system("chmod +x " + bash_name) 154 | -------------------------------------------------------------------------------- /generate_exps_search.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def get_name_exp_from_yaml(d): 6 | name = "" 7 | name = str(d["run"]) + "_" 8 | name += str(d["nbr_sup"]) + "_" 9 | name += str(len(d["h_ind"]) - 1) + "_" 10 | name += "_".join([str(int(k)) for k in d["h_ind"]]) + "_" 11 | name += str(d["repet"]) + "_" 12 | name += str(d["hint"]) + "_" 13 | name += str(d["norm_gsup"]) + "_" 14 | name += str(d["norm_gh"]) + "_" 15 | name += str(d["debug_code"]) + "_" 16 | name += str(d["use_unsupervised"]) + "_" 17 | name += str(d["start_hint"]) 18 | 19 | return name 20 | 21 | 22 | def save_file(exp, rep, max_rep): 23 | # grad normalization 24 | # conf_norm = [(1, 0), (0, 1), (1, 1)] 25 | conf_norm = [(0, 0)] 26 | for c in conf_norm: 27 | exp["norm_gh"] = bool(c[0]) 28 | exp["norm_gsup"] = bool(c[1]) 29 | print "Just forces debuge to TRUE *********" 30 | exp["debug_code"] = False 31 | name = get_name_exp_from_yaml(exp) 32 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 33 | yaml.dump(exp, fyaml) 34 | f.write("python " + runner + " " + name + ".yaml \n") 35 | # Default 36 | nbr_layers = 3 37 | use_unsupervised = False 38 | exp = {"debug_code": False, 39 | "nbr_sup": 1000, 40 | "run": 45, 41 | "h_ind": [False for i in range(nbr_layers+1)], 42 | "use_batch_normalization": [False for i in range(nbr_layers+1)], 43 | "corrupt_input_l": 0., 44 | "start_corrupting": 0, 45 | "use_sparsity": False, 46 | "use_sparsity_in_pred": False, 47 | "max_epochs": 400, 48 | "hint": False, 49 | "extreme_random": True, 50 | "norm_gsup": False, 51 | "norm_gh": False, 52 | "repet": 0, 53 | "use_unsupervised": use_unsupervised, 54 | "h_w": 1., 55 | "start_hint": 5 56 | } 57 | gpu = "p100.sl" 58 | nbr_sup_ = 1000 59 | h_w_vls = .0 60 | # start_hint_vl = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150] 61 | start_hint_vl = [10, 10, 10, 10, 10, 10, 10] 62 | run = 0 63 | fold_exps = "config_yaml" 64 | folder_jobs = "jobs" 65 | bash_name = "submit.sh" 66 | f = open(bash_name, "w") 67 | f.write("#!/usr/bin/env bash \n") 68 | runner = "train3_new_dup.py" 69 | max_rep = 7 70 | flags = "THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 " 71 | rep = 0 72 | fgpu = open(gpu, "r") 73 | gpu_cont = fgpu.read() 74 | 75 | for start_hint in start_hint_vl: 76 | 
exp["nbr_sup"] = nbr_sup_ 77 | exp["run"] = run 78 | exp["norm_gh"] = False 79 | exp["norm_gsup"] = False 80 | exp["start_hint"] = 0 81 | exp["h_w"] = h_w_vls 82 | 83 | exp["max_epochs"] = 400 84 | exp["debug_code"] = False 85 | # ******* Train inly the layer before the output. 86 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 87 | exp["h_ind"][-2] = True 88 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 89 | exp["use_batch_normalization"][-2] = False 90 | exp["hint"] = True 91 | exp["run"] = run 92 | exp["repet"] = rep 93 | exp["start_hint"] = start_hint 94 | name = get_name_exp_from_yaml(exp) 95 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 96 | yaml.dump(exp, fyaml) 97 | name_job = str(start_hint) + "_" + str(nbr_sup_) + "_" + str(rep) + ".sl" 98 | with open(folder_jobs + "/" + name_job, "w") as fjob: 99 | fjob.write(gpu_cont + "\n") 100 | fjob.write(flags + " python " + runner + " " + name + ".yaml \n") 101 | # save_file(exp, rep, max_rep) 102 | f.write("sbatch ./" + folder_jobs + "/" + name_job + " \n") 103 | rep += 1 104 | 105 | f.close() 106 | fgpu.close() 107 | os.system("chmod +x " + bash_name) 108 | -------------------------------------------------------------------------------- /init_params/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the initial parameters of the models. -------------------------------------------------------------------------------- /job0.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | -------------------------------------------------------------------------------- /joblenet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python trainLenet.py lenet_0_1000_3_0_0_1_0_0_True_False_False_False_False.yaml 3 | python trainLenet.py lenet_0_1000_3_0_0_1_0_1_True_False_False_False_False.yaml 4 | python trainLenet.py lenet_0_1000_3_0_0_1_0_2_True_False_False_False_False.yaml 5 | python trainLenet.py lenet_0_1000_3_0_0_1_0_3_True_False_False_False_False.yaml 6 | python trainLenet.py lenet_0_1000_3_0_0_1_0_4_True_False_False_False_False.yaml 7 | python trainLenet.py lenet_0_1000_3_0_0_1_0_5_True_False_False_False_False.yaml 8 | python trainLenet.py lenet_0_1000_3_0_0_1_0_6_True_False_False_False_False.yaml 9 | python trainLenet.py lenet_0_3000_3_0_0_1_0_0_True_False_False_False_False.yaml 10 | python trainLenet.py lenet_0_3000_3_0_0_1_0_1_True_False_False_False_False.yaml 11 | python trainLenet.py lenet_0_3000_3_0_0_1_0_2_True_False_False_False_False.yaml 12 | python trainLenet.py lenet_0_3000_3_0_0_1_0_3_True_False_False_False_False.yaml 13 | python trainLenet.py lenet_0_3000_3_0_0_1_0_4_True_False_False_False_False.yaml 14 | python trainLenet.py lenet_0_3000_3_0_0_1_0_5_True_False_False_False_False.yaml 15 | python trainLenet.py lenet_0_3000_3_0_0_1_0_6_True_False_False_False_False.yaml 16 | python trainLenet.py lenet_0_5000_3_0_0_1_0_0_True_False_False_False_False.yaml 17 | python trainLenet.py lenet_0_5000_3_0_0_1_0_1_True_False_False_False_False.yaml 18 | python trainLenet.py lenet_0_5000_3_0_0_1_0_2_True_False_False_False_False.yaml 19 | python trainLenet.py lenet_0_5000_3_0_0_1_0_3_True_False_False_False_False.yaml 20 | python trainLenet.py lenet_0_5000_3_0_0_1_0_4_True_False_False_False_False.yaml 21 | python trainLenet.py lenet_0_5000_3_0_0_1_0_5_True_False_False_False_False.yaml 22 | python trainLenet.py 
lenet_0_5000_3_0_0_1_0_6_True_False_False_False_False.yaml 23 | python trainLenet.py lenet_0_50000_3_0_0_1_0_0_True_False_False_False_False.yaml 24 | python trainLenet.py lenet_0_50000_3_0_0_1_0_1_True_False_False_False_False.yaml 25 | python trainLenet.py lenet_0_50000_3_0_0_1_0_2_True_False_False_False_False.yaml 26 | python trainLenet.py lenet_0_50000_3_0_0_1_0_3_True_False_False_False_False.yaml 27 | python trainLenet.py lenet_0_50000_3_0_0_1_0_4_True_False_False_False_False.yaml 28 | python trainLenet.py lenet_0_50000_3_0_0_1_0_5_True_False_False_False_False.yaml 29 | python trainLenet.py lenet_0_50000_3_0_0_1_0_6_True_False_False_False_False.yaml 30 | -------------------------------------------------------------------------------- /jobs/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the [Slurm](https://slurm.schedmd.com/) jobs. -------------------------------------------------------------------------------- /jobs/110_1000_4_trainLenet_0_1000_3_0_0_0_0_4_False_False_False_False_False_110.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_p100 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory (MB) 35 | #SBATCH --mem-per-cpu 32000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | 51 | 52 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python trainLenet.py trainLenet_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml 53 | -------------------------------------------------------------------------------- /jobs/20_1000_0_train3_new_dup_0_1000_3_0_0_1_0_0_True_False_False_False_False_20.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_p100 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory 
(MB) 35 | #SBATCH --mem-per-cpu 32000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | 51 | 52 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train3_new_dup.py train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml 53 | -------------------------------------------------------------------------------- /k80.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_k80 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory (MB) 35 | #SBATCH --mem-per-cpu 3000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | # THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python trainLenet.py lenet_0_1000_3_0_0_1_0_0_True_False_False_False_False.yaml 51 | 52 | -------------------------------------------------------------------------------- /layer.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | from __future__ import division 4 | import numpy as np 5 | import theano 6 | from theano import tensor as T 7 | import warnings 8 | 9 | 10 | from theano.tensor.signal import pool 11 | from theano.tensor.nnet import conv2d 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | from basic_layer import Layer 14 | from ae import Autoencoder 15 | from non_linearities import NonLinearity, CostType, relu, get_non_linearity_str 16 | from normalization import BatchNormLayer 17 | 18 | 19 | def sharedX_value(value, name=None, borrow=None, dtype=None): 20 | """Share a single value after transforming it to floatX type. 21 | 22 | value: a value 23 | name: variable name (str) 24 | borrow: boolean 25 | dtype: the type of the value when shared. 
default: theano.config.floatX 26 | """ 27 | if dtype is None: 28 | dtype = theano.config.floatX 29 | return theano.shared( 30 | theano._asarray(value, dtype=dtype), name=name, borrow=borrow) 31 | 32 | 33 | class HiddenLayer(Layer): 34 | def __init__(self, input, input1, input2, input_vl, n_in, n_out, W=None, 35 | b=None, 36 | activation=T.tanh, rng=None, hint=None, use_hint=False, 37 | intended_to_be_corrupted=False, corrupt_input_l=0., 38 | use_sparsity=False, use_sparsity_in_pred=False, 39 | use_unsupervised=False, use_batch_normalization=False): 40 | """ 41 | Typical hidden layer of an MLP: units are fully connected and have 42 | tangente hyperbolic activation function. Weight matrix (W) is of shape 43 | (n_in, n_out) and the bias vector (b) is of shape (nout,). 44 | 45 | Hidden unit activation is given by: tanh(dot(input, w)+ b) 46 | 47 | :type rng: numpy.random.RandomState 48 | :param rng: a random number generator used to initiaze the weights. 49 | 50 | :type input: theano.tensor.dmatrix 51 | :param input: a symbolic tensor of shape (n_examples, n_in) 52 | 53 | :type n_in: int 54 | :param n_in: dimension of the input 55 | 56 | :type n_out: int 57 | :param n_out: number of hidden units 58 | 59 | :type activation: theano.Op or function 60 | :param activation: Non linearity to be applied in the hidden layer. 61 | """ 62 | if rng is None: 63 | rng = np.random.RandomState() 64 | 65 | super(HiddenLayer, self).__init__( 66 | input, input1, input2, input_vl, n_in, n_out, 67 | activation=activation, 68 | rng=rng, hint=hint, use_hint=use_hint, 69 | intended_to_be_corrupted=intended_to_be_corrupted, 70 | corrupt_input_l=corrupt_input_l, 71 | use_sparsity=use_sparsity, 72 | use_sparsity_in_pred=use_sparsity_in_pred, 73 | use_batch_normalization=use_batch_normalization) 74 | self.reset_layer() 75 | 76 | if W is not None: 77 | self.W = W 78 | 79 | if b is not None: 80 | self.b = b 81 | 82 | self.params = [self.W, self.b] 83 | if self.use_batch_normalization: 84 | # we normalize the output of the layer, not its input. 85 | # it does not matter the size of the minibatch (10). 86 | self.bn = BatchNormLayer([100, n_out]) 87 | 88 | self.setup_outputs(input) 89 | self.setup_outputs_vl(input_vl) 90 | self.setup_outputs1(input1) 91 | self.setup_outputs2(input2) 92 | # Create the associated auto-encoder: tied-wights AE. 93 | self.use_unsupervised = use_unsupervised 94 | self.ae = Autoencoder( 95 | input=input, nvis=n_in, nhid=n_out, cost_type=CostType.MeanSquared, 96 | nonlinearity=get_non_linearity_str(activation), W=self.W, b=self.b, 97 | tied_weights=True, reverse=False) 98 | 99 | def setup_outputs(self, input): 100 | # lin_output = T.dot(input, self.W) + self.b 101 | if self.intended_to_be_corrupted: 102 | warnings.warn("Input 1 Will be corrupted!!!!!!") 103 | lin_output = T.dot( 104 | self.get_corrupted_input(input), self.W) + self.b 105 | else: 106 | lin_output = T.dot(input, self.W) + self.b 107 | 108 | # Normalize the linear transformation, (if there is bn) 109 | if self.use_batch_normalization: 110 | assert self.bn is not None 111 | lin_output = self.bn.get_output_for( 112 | lin_output, deterministic=False, batch_norm_use_averages=False, 113 | batch_norm_update_averages=True) 114 | self.output = ( 115 | lin_output if self.activation is None 116 | else self.activation(lin_output)) 117 | if self.use_sparsity_in_pred: 118 | assert self.use_sparsity 119 | self.output = self.output * self.sparser 120 | 121 | def setup_outputs_vl(self, input): 122 | """Setup the outputs for the test. 
123 | Specifically for the batch normalization output. 124 | """ 125 | lin_output = T.dot(input, self.W) + self.b 126 | # Normalize the linear transformation. 127 | if self.use_batch_normalization: 128 | assert self.bn is not None 129 | lin_output = self.bn.get_output_for( 130 | lin_output, deterministic=False, batch_norm_use_averages=False, 131 | batch_norm_update_averages=True) 132 | self.output_vl = ( 133 | lin_output if self.activation is None 134 | else self.activation(lin_output)) 135 | if self.use_sparsity_in_pred: 136 | assert self.use_sparsity 137 | self.output = self.output * self.sparser 138 | 139 | def setup_outputs1(self, input): 140 | if self.intended_to_be_corrupted: 141 | warnings.warn("Input 1 Will be corrupted!!!!!!") 142 | lin_output = T.dot( 143 | self.get_corrupted_input(input), self.W) + self.b 144 | else: 145 | lin_output = T.dot(input, self.W) + self.b 146 | # Batch normalization 147 | if self.use_batch_normalization: 148 | assert self.bn is not None 149 | lin_output = self.bn.get_output_for( 150 | lin_output, deterministic=False, 151 | batch_norm_use_averages=False, 152 | batch_norm_update_averages=False) 153 | # We compute the distance over the linear transformation. 154 | # self.output1 = lin_output 155 | self.output1 = ( 156 | lin_output if self.activation is None 157 | else self.activation(lin_output)) 158 | if self.use_sparsity_in_pred: 159 | assert self.use_sparsity 160 | self.output1 = self.output1 * self.sparser 161 | 162 | def setup_outputs2(self, input): 163 | if self.intended_to_be_corrupted: 164 | warnings.warn("Input 2 Will be corrupted!!!!!!") 165 | lin_output = T.dot( 166 | self.get_corrupted_input(input), self.W) + self.b 167 | else: 168 | lin_output = T.dot(input, self.W) + self.b 169 | # Batch normalization 170 | if self.use_batch_normalization: 171 | assert self.bn is not None 172 | lin_output = self.bn.get_output_for( 173 | lin_output, deterministic=False, 174 | batch_norm_use_averages=False, 175 | batch_norm_update_averages=False) 176 | # We compute the distance over the linear transformation. 177 | # self.output2 = lin_output 178 | self.output2 = ( 179 | lin_output if self.activation is None 180 | else self.activation(lin_output)) 181 | if self.use_sparsity_in_pred: 182 | assert self.use_sparsity 183 | self.output2 = self.output2 * self.sparser 184 | 185 | def get_outputs(self, input): 186 | self.setup_outputs(input) 187 | return self.output 188 | 189 | def get_outputs1(self, input): 190 | self.setup_outputs1(input) 191 | return self.output1 192 | 193 | def get_outputs2(self, input): 194 | self.setup_outputs2(input) 195 | return self.output2 196 | 197 | def _squared_magn(self, x): 198 | """Returns the sum of the squared values of an array.""" 199 | return (x**2).sum(axis=1) 200 | 201 | def _magnitude(self, x): 202 | """Returns the magnitude of an array.""" 203 | return T.sqrt(T.maximum(self._squared_magn(x), 1e-7)) 204 | # np.finfo(theano.config.floatX).tiny)) 205 | 206 | def get_arc_cosine_penalty(self): 207 | """Calculate the arccosine distance in [0, 1]. 208 | 0: the two vectors are very similar. (have the same orientation) 209 | 1: the two vectors are very disimilar (have the opposite orientation). 210 | The cosine similarity does not take in consideration the magnitude 211 | of the vectors. It considers only thier orientation (angle). 212 | Therefore, two vectors are similar if they have the same angle. 
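        In symbols, for the two hidden representations h1 and h2 computed
        from the pair (x1, x2):
            cosine = <h1, h2> / (||h1|| * ||h2||)
            penalty = arccos(cosine) / pi, which lies in [0, 1].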
213 | See: https://en.wikipedia.org/wiki/Cosine_similarity 214 | """ 215 | # tiny value: 216 | # tiny = sharedX_value(np.finfo(dtype=theano.config.floatX).tiny, 217 | # name="tiny") 218 | # the gradient of sqrt at 0 is undefinded (nan). 219 | # use a tiny value instead of 0. 220 | # OLD SOLUTION 221 | # denom = T.sqrt( 222 | # T.sum(self.output1**2, axis=1) * T.sum(self.output2**2, axis=1)) 223 | # nomin = (self.output1 * self.output2).sum(1) 224 | # cosine = nomin/denom # the cosine betwen the two vectors 225 | # pi = sharedX_value(np.pi, name="pi") 226 | # minus1 = sharedX_value(-1., name="minus1") 227 | # plus1 = sharedX_value(1. - np.finfo(dtype=theano.config.floatX).eps, 228 | # name="plus1") 229 | # # Need to be clipped. accos() gives nan when sin is close to 1. 230 | # angle = T.arccos(T.clip( 231 | # cosine, minus1.get_value(), plus1.get_value()))/pi 232 | # OLD SOLUTION 233 | # plus1 = sharedX_value(1. - np.finfo(dtype=theano.config.floatX).eps, 234 | # name="plus1") 235 | pi = sharedX_value(np.pi, name="pi") 236 | cosine = T.clip(((self.output1 * self.output2).sum(axis=1) / ( 237 | self._magnitude(self.output1) * self._magnitude(self.output2))), 238 | -1, 1 - 1e-7) 239 | angle = T.clip(T.arccos(cosine) / pi, 0, 1) 240 | 241 | return angle 242 | 243 | def get_l2_penalty(self, ind=0): 244 | """calculate the Euclidean distance between the two outputs.""" 245 | dif = (self.output1 - self.output2) 246 | if self.use_sparsity: 247 | dif = dif * self.sparser 248 | if ind == 0: 249 | return (dif**2).sum(1) 250 | elif ind == 1: 251 | return (dif**2).mean(1) 252 | else: 253 | raise ValueError("ind error.") 254 | 255 | def get_l1_penalty(self, ind=0): 256 | """calculate the Manhattan distance between the two outputs.""" 257 | dif = (self.output1 - self.output2) 258 | if self.use_sparsity: 259 | dif = dif * self.sparser 260 | if ind == 0: 261 | return (abs(dif)).sum(1) 262 | elif ind == 1: 263 | return (abs(dif)).mean(1) 264 | else: 265 | raise ValueError("ind error.") 266 | 267 | def get_contrastive(self, sim, margin): 268 | distance = ((self.output1 - self.output2)**2).sum(1) 269 | converge = (1. - sim) * distance 270 | contraste = sim * T.maximum(0, margin - distance) 271 | 272 | return converge + contraste 273 | 274 | def get_divergence(self, sim, margin): 275 | distance = ((self.output1 - self.output2)**2).sum(1) ** (1/2.) 276 | contraste = sim * T.maximum(0, margin - distance) 277 | 278 | return contraste 279 | 280 | def insepct_get_l1_conv(self, sim, margin): 281 | return (1. - sim) * self.get_l1_penalty(ind=1) 282 | 283 | def inscpect_get_l1_div(self, sim, margin): 284 | distance = ((self.output1 - self.output2)**2).sum(1) 285 | contraste = sim * T.maximum(0, margin - distance) 286 | return contraste 287 | 288 | def inspect_get_l1_distance(self, sim, margin): 289 | distance = ((self.output1 - self.output2)**2).sum(1) 290 | d = sim * distance 291 | return d 292 | 293 | def get_penalty(self, sim, margin): 294 | if self.hint is "l1sum": 295 | return (1. - sim) * self.get_l1_penalty(ind=0) 296 | elif self.hint is "l1mean": 297 | return (1. - sim) * self.get_l1_penalty(ind=1) 298 | elif self.hint is "l2sum": 299 | return (1. - sim) * self.get_l2_penalty(ind=0) 300 | elif self.hint is "l2mean": 301 | return (1. - sim) * self.get_l2_penalty(ind=1) 302 | elif self.hint is "arccos": 303 | return (1. - sim) * self.get_arc_cosine_penalty() 304 | elif self.hint is "l1sumcos": 305 | return (1. 
- sim) * ( 306 | self.get_l1_penalty(ind=0) + self.get_arc_cosine_penalty()) 307 | elif self.hint is "l1meancos": 308 | return (1. - sim) * ( 309 | self.get_l1_penalty(ind=1) + self.get_arc_cosine_penalty()) 310 | elif self.hint is "l2sumcos": 311 | return (1. - sim) * ( 312 | self.get_l2_penalty(ind=0) + self.get_arc_cosine_penalty()) 313 | elif self.hint is "l2meancos": 314 | return (1. - sim) * ( 315 | self.get_l2_penalty(ind=0) + self.get_arc_cosine_penalty()) 316 | elif self.hint is "contrastive": 317 | return self.get_contrastive(sim, margin) 318 | elif self.hint is "divergence": 319 | return self.get_divergence(sim, margin) 320 | else: 321 | raise ValueError("self.hint uknonw!!!!") 322 | 323 | 324 | class LeNetConvPoolLayer_hint(HiddenLayer): 325 | """Pool Layer of a convolutional network """ 326 | 327 | def __init__(self, rng, input, input1, input2, input_vl, 328 | filter_shape, image_shape, poolsize=(2, 2), 329 | activation=T.tanh, hint="l1mean", 330 | use_hint=False, 331 | intended_to_be_corrupted=False, 332 | corrupt_input_l=0., 333 | use_sparsity=False, 334 | use_sparsity_in_pred=False, 335 | use_unsupervised=False, 336 | use_batch_normalization=False): 337 | """ 338 | Allocate a LeNetConvPoolLayer with shared variable internal parameters. 339 | 340 | :type rng: numpy.random.RandomState 341 | :param rng: a random number generator used to initialize weights 342 | 343 | :type input: theano.tensor.dtensor4 344 | :param input: symbolic image tensor, of shape image_shape 345 | 346 | :type filter_shape: tuple or list of length 4 347 | :param filter_shape: (number of filters, num input feature maps, 348 | filter height, filter width) 349 | 350 | :type image_shape: tuple or list of length 4 351 | :param image_shape: (batch size, num input feature maps, 352 | image height, image width) 353 | 354 | :type poolsize: tuple or list of length 2 355 | :param poolsize: the downsampling (pooling) factor (#rows, #cols) 356 | """ 357 | 358 | assert hint is not None 359 | assert image_shape[1] == filter_shape[1] 360 | self.corrupt_input_l = sharedX_value(corrupt_input_l, name="cor_l") 361 | self.intended_to_be_corrupted = intended_to_be_corrupted 362 | self.rng = np.random.RandomState(123) 363 | self.theano_rng = RandomStreams(self.rng.randint(2 ** 30)) 364 | self.input = input 365 | # keep track of model input 366 | self.input = input 367 | self.input1 = input1 # x1 368 | self.input2 = input2 # x2 369 | self.input_vl = input_vl # bn input used for validation. 370 | self.sparser = None 371 | self.activation = activation 372 | self.hint = hint 373 | self.use_hint = use_hint 374 | self.use_sparsity = use_sparsity 375 | self.use_sparsity_in_pred = use_sparsity_in_pred 376 | self.use_unsupervised = use_unsupervised 377 | self.ae = None # no need for cnn... for now. 378 | self.use_batch_normalization = use_batch_normalization 379 | self.bn = None 380 | # the bn is applied before the pooling. (and after the linear op.) 
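        # For a 'valid' convolution (no padding, unit stride), each output
        # feature map has spatial size (input_size - filter_size + 1) per
        # dimension; e.g. a 28x28 input with 5x5 filters gives 24x24 maps.
        # This is what map_size_h / map_size_w compute below.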
381 | # output_shape = [batch size, num output maps, img height, img width] 382 | map_size_h = (image_shape[2] - filter_shape[2] + 1) 383 | map_size_w = (image_shape[3] - filter_shape[3] + 1) 384 | output_shape = [image_shape[0], filter_shape[0], map_size_h, 385 | map_size_w] 386 | if self.use_batch_normalization: 387 | self.bn = BatchNormLayer(output_shape) 388 | # assert self.use_batch_normalization is False 389 | 390 | # there are "num input feature maps * filter height * filter width" 391 | # inputs to each hidden unit 392 | fan_in = np.prod(filter_shape[1:]) 393 | # each unit in the lower layer receives a gradient from: 394 | # "num output feature maps * filter height * filter width" / 395 | # pooling size 396 | fan_out = (filter_shape[0] * np.prod(filter_shape[2:]) // 397 | np.prod(poolsize)) 398 | # initialize weights with random weights 399 | W_bound = np.sqrt(6. / (fan_in + fan_out)) 400 | self.W = theano.shared( 401 | np.asarray( 402 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 403 | dtype=theano.config.floatX 404 | ), 405 | name="W", 406 | borrow=True 407 | ) 408 | 409 | # the bias is a 1D tensor -- one bias per output feature map 410 | b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX) 411 | self.b = theano.shared(value=b_values, name="b", borrow=True) 412 | 413 | # convolve input feature maps with filters 414 | conv_out = conv2d( 415 | input=self.input, 416 | filters=self.W, 417 | filter_shape=filter_shape, 418 | input_shape=image_shape 419 | ) 420 | 421 | conv_out1 = conv2d( 422 | input=self.input1, 423 | filters=self.W, 424 | filter_shape=filter_shape, 425 | input_shape=image_shape 426 | ) 427 | conv_out2 = conv2d( 428 | input=self.input2, 429 | filters=self.W, 430 | filter_shape=filter_shape, 431 | input_shape=image_shape 432 | ) 433 | conv_out_vl = conv2d( 434 | input=self.input_vl, 435 | filters=self.W, 436 | filter_shape=filter_shape, 437 | input_shape=image_shape 438 | ) 439 | # BN 440 | if self.bn is not None: 441 | conv_out = self.bn.get_output_for( 442 | conv_out, deterministic=False, 443 | batch_norm_use_averages=False, 444 | batch_norm_update_averages=True) 445 | conv_out1 = self.bn.get_output_for( 446 | conv_out1, deterministic=False, 447 | batch_norm_use_averages=False, 448 | batch_norm_update_averages=False) 449 | conv_out2 = self.bn.get_output_for( 450 | conv_out2, deterministic=False, 451 | batch_norm_use_averages=False, 452 | batch_norm_update_averages=False) 453 | conv_out_vl = self.bn.get_output_for( 454 | conv_out_vl, deterministic=False, 455 | batch_norm_use_averages=False, 456 | batch_norm_update_averages=True) 457 | # pool each feature map individually, using maxpooling 458 | pooled_out = pool.pool_2d( 459 | input=conv_out, 460 | ds=poolsize, 461 | ignore_border=True 462 | ) 463 | pooled_out1 = pool.pool_2d( 464 | input=conv_out1, 465 | ds=poolsize, 466 | ignore_border=True 467 | ) 468 | pooled_out2 = pool.pool_2d( 469 | input=conv_out2, 470 | ds=poolsize, 471 | ignore_border=True 472 | ) 473 | pooled_out_vl = pool.pool_2d( 474 | input=conv_out_vl, 475 | ds=poolsize, 476 | ignore_border=True 477 | ) 478 | # add the bias term. Since the bias is a vector (1D array), we first 479 | # reshape it to a tensor of shape (1, n_filters, 1, 1). 
Each bias will 480 | # thus be broadcasted across mini-batches and feature map 481 | # width & height 482 | self.output = activation( 483 | pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) 484 | 485 | self.output1_non_fl = activation( 486 | pooled_out1 + self.b.dimshuffle('x', 0, 'x', 'x')) 487 | self.output2_non_fl = activation( 488 | pooled_out2 + self.b.dimshuffle('x', 0, 'x', 'x')) 489 | self.output_vl = activation( 490 | pooled_out_vl + self.b.dimshuffle('x', 0, 'x', 'x')) 491 | 492 | self.output1 = self.output1_non_fl.flatten(2) 493 | self.output2 = self.output2_non_fl.flatten(2) 494 | if self.sparser is None: 495 | dim_h = int((image_shape[2] - filter_shape[1] + 1) / poolsize[0]) 496 | dim_w = int((image_shape[3] - filter_shape[1] + 1) / poolsize[1]) 497 | dim_out = filter_shape[0] * dim_h * dim_w 498 | s_values = np.ones( 499 | (dim_out), 500 | dtype=theano.config.floatX) 501 | self.sparser = theano.shared(value=s_values, name='sparser', 502 | borrow=True) 503 | 504 | # store parameters of this layer 505 | self.params = [self.W, self.b] 506 | 507 | 508 | class LogisticRegressionLayer(Layer): 509 | """ 510 | Multi-class logistic regression layer. 511 | The logistic regression is fully described by a weight matrix ::math:`W` 512 | and a bias vector ::math: `b`. Classification is done by projecting data 513 | points onto a set of hyperplanes, the distance to which is used to 514 | determine a class membership probablity. 515 | """ 516 | def __init__(self, input, n_in, n_out, is_binary=False, threshold=0.4, 517 | rng=None): 518 | """ 519 | Initialize the parameters of the logistic regression. 520 | :type input: theano.tensor.TensorType 521 | :param input: symbolic variable that describes the input of the 522 | architecture (one minibatch) 523 | :type n_in: int 524 | :param n_in: number of input units, the dimension of the space in which 525 | the datapoints lie 526 | :type n_out: int 527 | :param n_out: number of output units, the dimension of the space in 528 | which the labels lie (number of classes) 529 | """ 530 | self.activation = T.nnet.sigmoid 531 | self.threshold = threshold 532 | super(LogisticRegressionLayer, self).__init__( 533 | input, 534 | n_in, 535 | n_out, 536 | self.activation, 537 | rng) 538 | 539 | self.reset_layer() 540 | 541 | self.is_binary = is_binary 542 | if n_out == 1: 543 | self.is_binary = True 544 | # The number of classes 545 | self.n_classes_seen = np.zeros(n_out) 546 | # The number of the wrong classification madefor the class i 547 | self.n_wrong_classif_made = np.zeros(n_out) 548 | 549 | self.reset_conf_mat() 550 | 551 | # Compute vector class-membership probablities in symbolic form 552 | # self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+ self.b) 553 | self.p_y_given_x = self.get_class_memberships(self.input) 554 | 555 | if not self.is_binary: 556 | # Compute prediction as class whose probability is maximal 557 | # in symbolic form 558 | self.y_decision = T.argmax(self.p_y_given_x, axis=1) 559 | else: 560 | # If the probability is greater than the specified threshold 561 | # assign to the class 1, otherwise it is 0. Which alos can be 562 | # checked if p(y=1|x) > threshold. 563 | self.y_decision = T.gt(T.flatten(self.p_y_given_x), self.threshold) 564 | 565 | self.params = [self.W, self.b] 566 | 567 | def reset_conf_mat(self): 568 | """ 569 | Reset the confusion matrix. 
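        The confusion matrix is an (n_out, n_out) array of integers in
        which entry (i, j) counts the examples of true class i that were
        predicted as class j (see update_conf_mat).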
570 | """ 571 | self.conf_mat = np.zeros(shape=(self.n_out, self.n_out), 572 | dtype=np.dtype(int)) 573 | 574 | def negative_log_likelihood(self, y): 575 | """ 576 | Return the mean of the negative log-likelihood of the prediction 577 | of this model under a given target distribution. 578 | .. math:: 579 | \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = 580 | \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} 581 | \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ 582 | \ell (\theta=\{W,b\}, \mathcal{D}) 583 | 584 | :type y: theano.tensor.TensorType 585 | :param y: corresponds to a vector that gives for each example 586 | the correct label. 587 | Note: We use the mean instead of the sum so that the learning rate 588 | is less dependent of the batch size. 589 | """ 590 | if self.is_binary: 591 | return -T.mean(T.log(self.p_y_given_x)) 592 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 593 | 594 | def crossentropy_categorical(self, y): 595 | """ 596 | Find the categorical cross entropy. 597 | """ 598 | return T.mean(T.nnet.categorical_crossentropy(self.p_y_given_x, y)) 599 | 600 | def crossentropy(self, y): 601 | """ 602 | use the theano nnet cross entropy function. Return the mean. 603 | Note: self.p_y_given_x is (batch_size, 1) but y is (batch_size,). 604 | In order to establish the compliance, we should flatten the 605 | p_y_given_x. 606 | """ 607 | return T.mean( 608 | T.nnet.binary_crossentropy(T.flatten(self.p_y_given_x), y)) 609 | 610 | def get_class_memberships(self, x): 611 | lin_activation = T.dot(x, self.W) + self.b 612 | if self.is_binary: 613 | # return the sigmoid value 614 | return T.nnet.sigmoid(lin_activation) 615 | # else retunr the softmax 616 | return T.nnet.softmax(lin_activation) 617 | 618 | def update_conf_mat(self, y, p_y_given_x): 619 | """ 620 | Update the confusion matrix with the given true labels and estimated 621 | labels. 622 | """ 623 | if self.n_out == 1: 624 | y_decision = (p_y_given_x > self.threshold) 625 | else: 626 | y_decision = np.argmax(p_y_given_x, axis=1) 627 | for i in xrange(y.shape[0]): 628 | self.conf_mat[y[i]][y_decision[i]] += 1 629 | 630 | def errors(self, y): 631 | """ 632 | returns a float representing the number of errors in the minibatch 633 | over the total number of examples of the minibatch. Zero one loss 634 | over the size of the minibatch. 635 | 636 | :type y: theano.tensor.TensorType 637 | :param y: corresponds to a vector that gives for each example the 638 | correct label. 639 | """ 640 | if y.ndim != self.y_decision.ndim: 641 | raise TypeError("y should have the same shape as self.y_decision", 642 | ('y', y.type, "y_decision", self.y_decision.type)) 643 | if y.dtype.startswith('int') or y.dtype.startswith('uint'): 644 | # The T.neq operator returns a vector of 0s and 1s, where: 645 | # 1 represents a mistake in classification 646 | return T.mean(T.neq(self.y_decision, y)) 647 | else: 648 | raise NotImplementedError() 649 | 650 | def raw_prediction_errors(self, y): 651 | """ 652 | Returns a binary array where each each element indicates if the 653 | corresponding sample has been correctly classified (0) or not (1) in 654 | the minibatch. 655 | 656 | :type y: theano.tensor.TensorType 657 | :param y: corresponds to a vector that gives for each example the 658 | correct label. 
659 | """ 660 | if y.ndim != self.y_decision.ndim: 661 | raise TypeError("y should have the same shape as self.y_decision", 662 | ('y', y.type, "y_decision", self.y_decision.type)) 663 | if y.dtype.startswith('int') or y.dtype.startswith('uint'): 664 | # The T.neq operator returns a vector of 0s and 1s, where: 665 | # 1 represents a mistake in classification 666 | return T.neq(self.y_decision, y) 667 | else: 668 | raise NotImplementedError() 669 | 670 | def error_per_calss(self, y): 671 | """ 672 | Return an array where each value is the error for the corresponding 673 | classe in the minibatch. 674 | 675 | :type y: theano.tensor.TensorType 676 | :param y: corresponds to a vector that gives for each example the 677 | correct label. 678 | """ 679 | if y.ndim != self.y_decision.ndim: 680 | raise TypeError("y should have the same shape as self.y_decision", 681 | ('y', y.type, "y_decision", self.y_decision.type)) 682 | if y.dtype.startswith('int') or y.dtype.startswith('uint'): 683 | y_decision_res = T.neq(self.y_decision, y) 684 | for (i, y_decision_r) in enumerate(y_decision_res): 685 | self.n_classes_seen[y[i]] += 1 686 | if y_decision_r: 687 | self.n_wrong_classif_made[y[i]] += 1 688 | pred_per_class = self.n_wrong_classif_made / self.n_classes_seen 689 | return T.mean(y_decision_res), pred_per_class 690 | else: 691 | raise NotImplementedError() 692 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | from theano import tensor as T 2 | import theano 3 | import numpy 4 | from theano.tensor.signal import downsample 5 | from theano.tensor.nnet import conv 6 | 7 | from layer import HiddenLayer 8 | from layer import LeNetConvPoolLayer_hint 9 | 10 | 11 | def relu(x): 12 | return T.switch(x > 0, x, 0) 13 | 14 | 15 | def sharedX_value(value, name=None, borrow=None, dtype=None): 16 | """Share a single value after transforming it to floatX type. 17 | value: a value 18 | name: variable name (str) 19 | borrow: boolean 20 | dtype: the type of the value when shared. default: theano.config.floatX 21 | """ 22 | if dtype is None: 23 | dtype = theano.config.floatX 24 | return theano.shared( 25 | theano._asarray(value, dtype=dtype), name=name, borrow=borrow) 26 | 27 | 28 | class IdentityHiddenLayer(object): 29 | """ 30 | This is the identity layer. It takes the input and give it back as output. 31 | We will be using this layer just after the last convolution layer to applay 32 | a dropout. 33 | """ 34 | def __init__(self, rng, input): 35 | self.input = input 36 | self.W = None 37 | self.b = None 38 | self.params = [] 39 | self.output = input 40 | 41 | 42 | def dropout_from_layer(rng, layer_output, p): 43 | """ 44 | p: float. The probablity of dropping a unit. 45 | """ 46 | srng = theano.tensor.shared_randomstreams.RandomStreams( 47 | rng.randint(99999)) 48 | one = T.constant(1) 49 | retain_prob = one - p 50 | mask = srng.binomial(n=1, p=retain_prob, size=layer_output.shape, 51 | dtype=layer_output.dtype) 52 | output = layer_output * mask 53 | 54 | return output 55 | 56 | 57 | def localResponseNormalizationCrossChannel(incoming, alpha=1e-4, 58 | k=2, beta=0.75, n=5): 59 | """ 60 | Implement the local response normalization cross the channels described 61 | in , 62 | A.Krizhevsky et al. sec.3.3. 
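    Each activation a_i (at a fixed spatial position) is rescaled as
        a_i / (k + alpha * sum_j a_j ** 2) ** beta,
    where the sum runs over the n channels centred on channel i; this is
    the loop over `input_sqr` below.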
63 | Reference of the code: 64 | https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/ 65 | normalization.py 66 | https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/expr/normalize.py 67 | Parameters: 68 | incomping: The feature maps. (output of the convolution layer). 69 | alpha: float scalar 70 | k: float scalr 71 | beta: float scalar 72 | n: integer: number of adjacent channels. Must be odd. 73 | """ 74 | if n % 2 == 0: 75 | raise NotImplementedError("Works only with odd n") 76 | 77 | input_shape = incoming.shape 78 | half_n = n // 2 79 | input_sqr = T.sqr(incoming) 80 | b, ch, r, c = input_shape 81 | extra_channels = T.alloc(0., b, ch + 2*half_n, r, c) 82 | input_sqr = T.set_subtensor(extra_channels[:, half_n:half_n+ch, :, :], 83 | input_sqr) 84 | scale = k 85 | for i in range(n): 86 | scale += alpha * input_sqr[:, i:i+ch, :, :] 87 | scale = scale ** beta 88 | 89 | return incoming / scale 90 | 91 | 92 | class LRNCCIdentityLayer(IdentityHiddenLayer): 93 | def __init__(self, input, alpha=1e-4, k=2, beta=0.75, n=5): 94 | super(LRNCCIdentityLayer, self).__init__(rng=None, input=input) 95 | self.output = localResponseNormalizationCrossChannel( 96 | incoming=self.output, alpha=alpha, k=k, beta=beta, n=n) 97 | 98 | 99 | class DropoutIdentityHiddenLayer(IdentityHiddenLayer): 100 | def __init__(self, rng, input, dropout_rate, rescale): 101 | """ 102 | rescale: Boolean. Can be only used when applying dropout. 103 | """ 104 | if rescale: 105 | one = T.constant(1) 106 | retain_prob = one - dropout_rate 107 | input /= retain_prob 108 | 109 | super(DropoutIdentityHiddenLayer, self).__init__(rng=rng, input=input) 110 | if dropout_rate > 0.: 111 | self.output = dropout_from_layer(rng, self.output, p=dropout_rate) 112 | 113 | 114 | class DropoutHiddenLayer(HiddenLayer): 115 | def __init__(self, rng, input, n_in, n_out, dropout_rate, rescale, 116 | W=None, b=None, b_v=0., activation=None): 117 | """ 118 | rescale: Boolean. Can be only used when applying dropout. 119 | """ 120 | if rescale: 121 | one = T.constant(1) 122 | retain_prob = one - dropout_rate 123 | input /= retain_prob 124 | 125 | super(DropoutHiddenLayer, self).__init__( 126 | input=input, n_in=n_in, n_out=n_out, W=W, b=b, 127 | activation=activation, rng=rng) 128 | if dropout_rate > 0.: 129 | self.output = dropout_from_layer(rng, self.output, p=dropout_rate) 130 | 131 | 132 | class LeNetConvPoolLayer(object): 133 | def __init__(self, rng, input, filter_shape, image_shape, 134 | poolsize=(2, 2), maxout=False, poolmaxoutfactor=2, 135 | W=None, b=None, b_v=0., stride=(1, 1), LRN={ 136 | "app": False, "before": False, "alpha": 1e-4, "k": 2, 137 | "beta": 0.75, "n": 5}): 138 | """ 139 | Input: 140 | maxout: Boolean. Indicates if to do or not a maxout. 141 | poolmaxoutfactor: How many feature maps to maxout. The number of 142 | input feature maps must be a multiple of poolmaxoutfactor. 143 | allow_dropout_conv: Boolean. Allow or not the dropout in conv. 144 | layer. This maybe helpful when we want to use dropout only 145 | for fully connected layers. 146 | LRN: tuple (a, b) of booleans. a: apply or not the local response 147 | normalization. b: before (True) or after (False) the pooling. 148 | b_v: float. The initial value of the bias. 
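        Note: as used in this implementation, LRN is passed as a dict with
        the keys "app" (apply the normalization or not), "before" (apply
        it before (True) or after (False) the pooling) and the
        hyper-parameters "alpha", "k", "beta" and "n", as in the default
        value shown in the signature above.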
149 | """ 150 | self.LRNCCIdentityLayer = None 151 | if maxout: 152 | assert poolmaxoutfactor == 2 153 | assert image_shape[1] == filter_shape[1] 154 | self.input = input 155 | 156 | fan_in = numpy.prod(filter_shape[1:]) 157 | fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / 158 | numpy.prod(poolsize)) 159 | # initialize weights with random weights 160 | W_bound = numpy.sqrt(6. / (fan_in + fan_out)) 161 | if W is None: 162 | W = theano.shared( 163 | numpy.asarray( 164 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 165 | dtype=theano.config.floatX 166 | ), 167 | name="w_conv", 168 | borrow=True 169 | ) 170 | if b is None: 171 | b_v = ( 172 | numpy.ones( 173 | (filter_shape[0],)) * b_v).astype(theano.config.floatX) 174 | b = theano.shared(value=b_v, name="b_conv", borrow=True) 175 | 176 | self.W = W 177 | self.b = b 178 | conv_out = conv.conv2d( 179 | input=input, 180 | filters=self.W, 181 | filter_shape=filter_shape, 182 | image_shape=image_shape, 183 | subsample=stride 184 | ) 185 | # Local reponse normalization 186 | if LRN["app"] and LRN["before"]: 187 | self.LRNCCIdentityLayer = LRNCCIdentityLayer( 188 | conv_out, alpha=LRN["alpha"], k=LRN["k"], beta=LRN["beta"], 189 | n=LRN["n"]) 190 | conv_out = self.LRNCCIdentityLayer.output 191 | print "LRN BEFORE pooling ..." 192 | 193 | if maxout: 194 | z = T.add(conv_out, self.b.dimshuffle('x', 0, 'x', 'x')) 195 | s = None 196 | for i in range(filter_shape[0]/poolmaxoutfactor): 197 | t = z[:, i::poolmaxoutfactor, :, :] 198 | if s is None: 199 | s = t 200 | else: 201 | s = T.maximum(s, t) 202 | z = s 203 | if poolsize not in [None, (1, 1)]: 204 | pooled_out = downsample.max_pool_2d( 205 | input=z, 206 | ds=poolsize, 207 | ignore_border=True 208 | ) 209 | self.output = pooled_out 210 | else: 211 | self.output = z 212 | else: 213 | if poolsize not in [None, (1, 1)]: 214 | pooled_out = downsample.max_pool_2d( 215 | input=conv_out, 216 | ds=poolsize, 217 | ignore_border=True 218 | ) 219 | self.output = relu( 220 | pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) 221 | print "RELU..." 222 | else: 223 | # simple relu 224 | term = conv_out + self.b.dimshuffle('x', 0, 'x', 'x') 225 | self.output = T.switch(term > 0, term, 0 * term) 226 | print "RELU..." 227 | 228 | # Local reponse normalization 229 | if LRN["app"] and not LRN["before"]: 230 | self.LRNCCIdentityLayer = LRNCCIdentityLayer( 231 | self.output, alpha=LRN["alpha"], k=LRN["k"], beta=LRN["beta"], 232 | n=LRN["n"]) 233 | self.output = self.LRNCCIdentityLayer.output 234 | print "LRN AFTER activation(of pooling)..." 
235 | 236 | self.params = [self.W, self.b] 237 | 238 | 239 | class DropoutLeNetConvPoolLayer(LeNetConvPoolLayer): 240 | def __init__(self, rng, input, filter_shape, image_shape, dropout_rate, 241 | rescale, poolsize=(2, 2), stride=(1, 1), 242 | LRN={ 243 | "app": False, "before": False, "alpha": 1e-4, "k": 2, 244 | "beta": 0.75, "n": 5}, 245 | maxout=False, poolmaxoutfactor=2, W=None, b=None, b_v=0.): 246 | if rescale: 247 | one = T.constant(1) 248 | retain_prob = one - dropout_rate 249 | input /= retain_prob 250 | super(DropoutLeNetConvPoolLayer, self).__init__( 251 | rng=rng, input=input, filter_shape=filter_shape, 252 | image_shape=image_shape, poolsize=poolsize, stride=stride, 253 | LRN=LRN, maxout=maxout, poolmaxoutfactor=poolmaxoutfactor, 254 | W=W, b=b, b_v=b_v) 255 | if dropout_rate > 0.: 256 | self.output = dropout_from_layer(rng, self.output, p=dropout_rate) 257 | -------------------------------------------------------------------------------- /learning_rate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | 4 | 5 | class AnnealedLearningRate(object): 6 | """A callback to adjust the learning rate on each freq (batch or epoch). 7 | 8 | The learning rate will be annealed by 1/t at each freq. 9 | Parameters: 10 | anneal_start: int 11 | the epoch when to start annealing. 12 | """ 13 | def __init__(self, anneal_start, freq='epoch'): 14 | self._initialized = False 15 | self._count = 0. 16 | self._anneal_start = anneal_start 17 | self.freq = freq 18 | 19 | def __call__(self, learning_rate): 20 | """Updates the learning rate according to the annealing schedule. 21 | 22 | """ 23 | if not self._initialized: 24 | self._base = learning_rate.get_value() 25 | self._initialized = True 26 | self._count += 1 27 | learning_rate.set_value( 28 | np.cast[theano.config.floatX](self.get_current_learning_rate())) 29 | 30 | def get_current_learning_rate(self): 31 | """Calculate the current learning rate according to the annealing 32 | schedule. 33 | 34 | """ 35 | return self._base * min(1, self._anneal_start / self._count) 36 | 37 | 38 | class ExponentialDecayLearningRate(object): 39 | """ 40 | This anneals the learning rate by dviding it by decay_factor after 41 | each update (freq='batch'). 42 | 43 | lr = lr * decay_factor**(-t) 44 | Parameters: 45 | decay_factor: float 46 | de the decay factor 47 | min_lr: float 48 | The lr will be fixed to min_lr when it's reached. 49 | """ 50 | def __init__(self, decay_factor, min_lr): 51 | self._count = 0 52 | self._min_reached = False 53 | self.min_lr = min_lr 54 | self.decay_factor = decay_factor 55 | self.freq = 'batch' 56 | 57 | def __call__(self, learning_rate): 58 | """Update the learning rate according to the exponential decay 59 | schedule. 
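        For example, with decay_factor=1.02 and a base rate of 0.1, the
        rate after t updates is 0.1 * 1.02 ** (-t); once it falls to
        min_lr, it stays fixed at that value.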
60 | 
61 |         """
62 |         if self._count == 0.:
63 |             self._base_lr = learning_rate.get_value()
64 |         self._count += 1
65 | 
66 |         if not self._min_reached:
67 |             new_lr = self._base_lr * (self.decay_factor ** (-self._count))
68 |             if new_lr <= self.min_lr:
69 |                 self._min_reached = True
70 |                 new_lr = self.min_lr
71 |         else:
72 |             new_lr = self.min_lr
73 | 
74 |         learning_rate.set_value(np.cast[theano.config.floatX](new_lr))
75 | 
-------------------------------------------------------------------------------- /learning_rule.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jan 21 08:28:21 2015
4 | 
5 | @author: Soufiane Belharbi
6 | """
7 | import theano
8 | import theano.tensor as T
9 | import numpy as np
10 | from collections import OrderedDict
11 | 
12 | from tools import sharedX_value, sharedX_mtx
13 | from tools import floatX
14 | 
15 | 
16 | def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
17 |     """Max weight norm constraints and gradient clipping
18 | 
19 |     This takes a TensorVariable and rescales it so that incoming weight
20 |     norms are below a specified constraint value. Vectors violating the
21 |     constraint are rescaled so that they are within the allowed range.
22 | 
23 |     Parameters
24 |     ----------
25 |     tensor_var : TensorVariable
26 |         Theano expression for update, gradient, or other quantity.
27 |     max_norm : scalar
28 |         This value sets the maximum allowed value of any norm in
29 |         `tensor_var`.
30 |     norm_axes : sequence (list or tuple)
31 |         The axes over which to compute the norm. This overrides the
32 |         default norm axes defined for the number of dimensions
33 |         in `tensor_var`. When this is not specified and `tensor_var` is a
34 |         matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
35 |         5D tensor, it is set to a tuple listing all axes but axis 0. The
36 |         former default is useful for working with dense layers, the latter
37 |         is useful for 1D, 2D and 3D convolutional layers.
38 |         (Optional)
39 |     epsilon : scalar, optional
40 |         Value used to prevent numerical instability when dividing by
41 |         very small or zero norms.
42 | 
43 |     Credit:
44 |     https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
45 | 
46 |     Returns
47 |     -------
48 |     TensorVariable
49 |         Input `tensor_var` with rescaling applied to weight vectors
50 |         that violate the specified constraints.
51 | 
52 | 
53 |     Notes
54 |     -----
55 |     When `norm_axes` is not specified, the axes over which the norm is
56 |     computed depend on the dimensionality of the input variable. If it is
57 |     2D, it is assumed to come from a dense layer, and the norm is computed
58 |     over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
59 |     convolutional layer and the norm is computed over all trailing axes
60 |     beyond axis 0. For other uses, you should explicitly specify the axes
61 |     over which to compute the norm using `norm_axes`.
62 |     """
63 |     ndim = tensor_var.ndim
64 | 
65 |     if norm_axes is not None:
66 |         sum_over = tuple(norm_axes)
67 |     elif ndim == 2:  # DenseLayer
68 |         sum_over = (0,)
69 |     elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
70 |         sum_over = tuple(range(1, ndim))
71 |     else:
72 |         raise ValueError(
73 |             "Unsupported tensor dimensionality {}."
74 | "Must specify `norm_axes`".format(ndim) 75 | ) 76 | 77 | dtype = np.dtype(theano.config.floatX).type 78 | norms = T.sqrt(T.sum(T.sqr(tensor_var), axis=sum_over, keepdims=True)) 79 | target_norms = T.clip(norms, 0, dtype(max_norm)) 80 | constrained_output = \ 81 | (tensor_var * (target_norms / (dtype(epsilon) + norms))) 82 | 83 | return constrained_output 84 | 85 | 86 | class LearningRule(): 87 | """ A `LearningRule` is a class that calculates the new parameters value 88 | using: 89 | a learning rate, the current parameters value and the current gradient. 90 | 91 | """ 92 | def get_updates(self, learning_rate, params, grads, lr_scalers): 93 | """ Compute the current updates for the parameters. 94 | 95 | """ 96 | 97 | raise NotImplementedError( 98 | str(type(self)) + " does not implement get_updates.") 99 | 100 | 101 | class Momentum(LearningRule): 102 | """Implementation of the momentum as in the method described in section 103 | 9 of [1]:'A Practical Guide to Training Restricted Boltzmann Machines', 104 | bu Geoffrey Hinton.(https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf) 105 | We implemented alos the formula presented in Imagenet paper: 106 | , 107 | A.Krizhevsky et al. . 108 | More details in: 109 | [2]'On the importance of initialization and momentum in deep learning', 110 | I. Sutskever et al. 111 | [3]'Advances in optimizating recurrent networks', Y. Bengio et al. 112 | 113 | The model's parametes are updated such as: 114 | velocity_(t+1) := momentum * velocity_t - 115 | learning_rate * d cost / d param_t 116 | param_(t+1) := param_t + velocity_(t+1) 117 | 118 | Parameters: 119 | init_momentum: float 120 | Initial value of the momentum coefficient. It remains fisex unless 121 | used with 'MomentumAdjuster'. 122 | nesterov_momentum: boolean 123 | If True, uses the accelerated momentum technique described in [2,3] 124 | max_colm_norm: Boolean. The incoming weight vector corresponding to 125 | each hidden unit is constrained to have a maximum squared length of 126 | max_norm. 127 | max_norm: Float. The maximum norm. 128 | """ 129 | def __init__(self, init_momentum, nesterov_momentum=False, 130 | imagenet=False, imagenetDecay=5e-4, max_colm_norm=False, 131 | max_norm=15.0): 132 | assert init_momentum >= 0., 'The initial momentum should be >=0.' 133 | assert init_momentum < 1., 'The initial momentum should be < 1.' 134 | 135 | self.momentum = sharedX_value(value=init_momentum, name="momentum", 136 | borrow=True) 137 | self.nesterov_momentum = nesterov_momentum 138 | self._first_time = True 139 | self.velocity = None # tracks the velocity at the previous time 140 | self.imagenet = imagenet 141 | self.imagenetDecay = sharedX_value(value=imagenetDecay, 142 | name="imagenetDecay", 143 | borrow=True) 144 | self.max_colm_norm = max_colm_norm 145 | self.max_norm = max_norm 146 | 147 | def get_updates(self, learning_rate, params, grads, lr_scalers): 148 | """ 149 | get the updates (params, and velocity) 150 | """ 151 | # the initial velocity is zero. 
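        # The velocity buffers are allocated lazily, one shared variable
        # per parameter, the first time get_updates() is called.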
152 | if self._first_time: 153 | self.velocity = [ 154 | sharedX_mtx( 155 | param.get_value() * 0., 156 | name='vel_'+param.name, borrow=True) for param in params] 157 | 158 | updates = [] 159 | for (param, grad, vel, lr_sc) in zip( 160 | params, grads, self.velocity, lr_scalers): 161 | lr_scaled = learning_rate * lr_sc 162 | if self.imagenet: 163 | new_vel = self.momentum * vel -\ 164 | lr_scaled * self.imagenetDecay * param - lr_scaled * grad 165 | else: 166 | new_vel = self.momentum * vel - lr_scaled * grad 167 | 168 | updates.append((vel, new_vel)) 169 | inc = new_vel 170 | # this is the equivalence of NAG in [3].3.5, eq [7]. 171 | # It helps to avoid calculating the new grad(param+vel_(t-1)). 172 | # The only different from the paper is: momentum_(t) 173 | # which it's set to momentum_(t-1). If you develop the final inc, 174 | # you will find that it's equivalent to eq.[7] mentioned above. 175 | if self.nesterov_momentum: 176 | inc = self.momentum * new_vel - lr_scaled * grad 177 | 178 | new_param = param + inc 179 | if self.max_colm_norm and param.name in ["W", "w"]: 180 | new_param_final = norm_constraint(tensor_var=new_param, 181 | max_norm=self.max_norm) 182 | else: 183 | new_param_final = new_param 184 | updates.append((param, new_param_final)) 185 | 186 | # add the velocity updates to updates 187 | 188 | return updates 189 | 190 | 191 | class MomentumLinearAdjusterOverEpoch(object): 192 | """A callback to adjust linearly the momentum on each frequence (EPOCH). 193 | It adjusts the momentum based on the number of the epochs seen. 194 | 195 | Parameters: 196 | final_momentum: float 197 | The momentum coefficient to use at the end of the learning. 198 | start: int 199 | The epoch on wich to start growing the momentum. 200 | saturate: int 201 | The epoch on wich to momentum should reach its final value. 202 | 203 | """ 204 | def __init__(self, final_momentum, start, saturate): 205 | assert saturate >= start, "The momentum can not saturate before it "\ 206 | "starts increasing. Please set a saturation value higher than the"\ 207 | " start value." 208 | self._initialized = False 209 | self._count = 0 210 | self.saturate = saturate 211 | self.final_momentum = final_momentum 212 | self.start = start 213 | self.freq = 'epoch' # it works only on epochs 214 | self._first_time = True 215 | 216 | def __call__(self, learning_rule, seen_epochs): 217 | """Update the momentum according to the number of the epochs already 218 | seen. 219 | 220 | Parameters: 221 | trainingAlgorithm: instance of 222 | training_algorithm.trainingAlgorithm, 223 | the current algorithm used for training the model. 224 | """ 225 | # check 226 | if not hasattr(learning_rule, 'momentum'): 227 | raise ValueError( 228 | str(type(self))+' works only when the learning_rule ' 229 | 'specified in the training algorithm has the attribute ' 230 | '. For examples: "sarco.learning_rule.Momentum"') 231 | 232 | self._count = seen_epochs 233 | self._apply_momentum(learning_rule) 234 | 235 | def _apply_momentum(self, learning_rule): 236 | """Apply the momentum. 237 | """ 238 | 239 | momentum = learning_rule.momentum 240 | if not self._initialized: 241 | self._init_momentum = momentum.get_value() 242 | self._initialized = True 243 | momentum.set_value( 244 | np.cast[theano.config.floatX](self.get_current_momentum())) 245 | 246 | def get_current_momentum(self): 247 | """Return the current momentum with the desired schedule. 
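        The momentum grows linearly from its initial value at epoch
        `start` to `final_momentum` at epoch `saturate`:
            coef = clip((epoch - start) / (saturate - start), 0, 1)
            momentum = init_momentum * (1 - coef) + coef * final_momentum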
248 | 249 | """ 250 | w = self.saturate - self.start 251 | if w == 0: 252 | # saturate=start, jump straighforward to the final momentum value 253 | # if we exceeded the saturation, return the final momentum 254 | if self._count >= self.saturate: 255 | return self.final_momentum 256 | else: 257 | # else: (we didn't reach yet the saturation point), 258 | # return the initial momentum 259 | return self._init_momentum 260 | 261 | coef = float(self._count - self.start) / float(w) 262 | if coef < 0.: 263 | coef = 0. # no effect 264 | if coef > 1.: 265 | coef = 1. 266 | 267 | cu_m = self._init_momentum * (1 - coef) + coef * self.final_momentum 268 | 269 | return cu_m 270 | 271 | 272 | class AdaDelta(LearningRule): 273 | """Implement the ADADELTA algorithm of [1] to update the parameters 274 | of the model. 275 | Parameters: 276 | decay: float 277 | Decay rate in [1]. 278 | 279 | Caution: the parameter 'epsilon' in [1] is the learning rate. 280 | So It would be better to use a small learning rate 281 | (maybe fixed all the learning process [we will see] 282 | [1]:'AdaDelta: An Adaptive Learning Rate Method', Zeiler M. ) 283 | """ 284 | def __init__(self, decay=0.95, max_colm_norm=False, max_norm=15.0): 285 | assert decay >= 0., 'The decay parameter in ' + str(type(self)) +\ 286 | ' must be >= 0.' 287 | assert decay < 1., 'The decay parameter in ' + str(type(self)) +\ 288 | ' must be < 1.' 289 | self.decay = decay 290 | self._first_time = True 291 | self.mean_square_grad = None 292 | self.mean_squar_dx = None 293 | self.max_colm_norm = max_colm_norm 294 | self.max_norm = max_norm 295 | 296 | def get_updates(self, learning_rate, params, grads, lr_scalers): 297 | """Compute the AdaDelta updates of the model's parameters. 298 | 299 | param_t := param_(t-1) + AdaDelta_update_t 300 | """ 301 | if self._first_time: 302 | self.mean_square_grad = [ 303 | sharedX_mtx( 304 | param.get_value() * 0., 305 | name='mean_square_grad_'+param.name, 306 | borrow=True) for param in params] 307 | self.mean_squar_dx = [ 308 | sharedX_mtx( 309 | param.get_value() * 0., 310 | name='mean_square_dx_'+param.name, 311 | borrow=True) for param in params] 312 | self._first_time = False 313 | 314 | updates = [] 315 | for (param, grad, mean_square_grad, mean_squar_dx, lr_sc) in zip( 316 | params, grads, self.mean_square_grad, self.mean_squar_dx, 317 | lr_scalers): 318 | # Calculate the running average gradient: E[g^2]_t 319 | new_mean_square_grad = ( 320 | self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grad)) 321 | 322 | # The update: delta_x_t 323 | lr_scaled = learning_rate * lr_sc 324 | epsilon = lr_scaled 325 | rms_dx_t_1 = T.sqrt(mean_squar_dx + epsilon) 326 | rms_grad_t = T.sqrt(new_mean_square_grad + epsilon) 327 | delta_x_t = - (rms_dx_t_1 / rms_grad_t) * grad 328 | # Compute: E[delta_x^2]_t 329 | new_mean_square_dx = ( 330 | self.decay * mean_squar_dx + 331 | (1 - self.decay) * T.sqr(delta_x_t)) 332 | 333 | # update the params 334 | new_param = param + delta_x_t 335 | # Send for the update 336 | updates.append((mean_square_grad, new_mean_square_grad)) 337 | updates.append((mean_squar_dx, new_mean_square_dx)) 338 | if self.max_colm_norm and param.name in ["W", "w"]: 339 | new_param_final = norm_constraint(tensor_var=new_param, 340 | max_norm=self.max_norm) 341 | else: 342 | new_param_final = new_param 343 | updates.append((param, new_param_final)) 344 | 345 | return updates 346 | 347 | 348 | class AdaGrad(LearningRule): 349 | """Implement the AdaGrad algorithm of [1] to update the parameters of 350 | the model. 
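    Each parameter accumulates the sum of its squared gradients,
        G_t = G_{t-1} + g_t ** 2,
    and is updated with
        delta_t = -(lr / sqrt(G_t)) * g_t,
    as computed in get_updates below.

    A minimal usage sketch (the names `lr`, `params`, `grads`, `cost`,
    `x` and `y` stand for the caller's own shared learning rate,
    parameter list, gradient list and Theano graph variables; they are
    not defined here):
        rule = AdaGrad()
        updates = rule.get_updates(lr, params, grads, [1.] * len(params))
        train_fn = theano.function([x, y], cost, updates=updates)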
351 | 352 | For more details on how to implement AdGrad, see [2], §2. 353 | [1]:'Adaptive subgradient methods for online learning and 354 | stochastic optimization.', Duchi et al. 355 | [2]:'Notes on AdaDrad', Chris Dyer. 356 | (link: http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf) 357 | 358 | """ 359 | def __init__(self, max_colm_norm=False, max_norm=15.0): 360 | self._first_time = True 361 | self.sum_square_grad = None 362 | self.max_colm_norm = max_colm_norm 363 | self.max_norm = max_norm 364 | 365 | def get_updates(self, learning_rate, params, grads, lr_scalers): 366 | """Compute the AdaDelta updates of the model's parameters. 367 | 368 | param_t := param_(t-1) + AdaDelta_update_t 369 | """ 370 | if self._first_time: 371 | self.sum_square_grad = [ 372 | sharedX_mtx( 373 | param.get_value() * 0., 374 | name='sum_square_grad_'+param.name, 375 | borrow=True) for param in params] 376 | self._first_time = False 377 | 378 | updates = [] 379 | for (param, grad, sum_square_grad, lr_sc) in zip( 380 | params, grads, self.sum_square_grad, lr_scalers): 381 | # Calculate the running average gradient: E[g^2]_t 382 | new_sum_square_grad = sum_square_grad + T.sqr(grad) 383 | 384 | # The update: delta_x_t 385 | lr_scaled = learning_rate * lr_sc 386 | epsilon = lr_scaled 387 | sqrt_sum_grad_t = T.sqrt(new_sum_square_grad) 388 | delta_x_t = - (epsilon / sqrt_sum_grad_t) * grad 389 | 390 | # update the params 391 | new_param = param + delta_x_t 392 | # Send for the update 393 | updates.append((sum_square_grad, new_sum_square_grad)) 394 | if self.max_colm_norm and param.name in ["W", "w"]: 395 | new_param_final = norm_constraint(tensor_var=new_param, 396 | max_norm=self.max_norm) 397 | else: 398 | new_param_final = new_param 399 | updates.append((param, new_param_final)) 400 | 401 | return updates 402 | 403 | 404 | class RMSProp(LearningRule): 405 | """Implements the RMSProp learning rule as described in [1]. 406 | 407 | The RMSProp rule was described in [1]. The idea is similar to the 408 | AdaDelta, 409 | which consists of dividing the learning rate for a weight by a running 410 | average of the magintudes of recent graidients of that weight. 411 | Parameters: 412 | decay: float 413 | Decay constant similar to the one used in AdaDelta, and Momentum. 414 | max_scaling: float 415 | Restrict the RMSProp gradient scaling coefficient to values below 416 | 'max_scaling' to avoid a learning rate too small (almost zero). 417 | 418 | [1]: 'Neural Networks for Machine Learning, Lecture 6a Overview of 419 | mini-­‐batch gradient descent', a lecture by Hinton et al. 420 | (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) 421 | """ 422 | def __init__(self, decay=0.9, max_scaling=1e5, max_colm_norm=False, 423 | max_norm=15.0): 424 | assert 0. <= decay < 1., 'decay must be: 0. <= decay < 1' 425 | assert max_scaling > 0., 'max_scaling must be > 0.' 426 | self.decay = sharedX_value(decay, name='decay', borrow=True) 427 | self.epsilon = 1. / max_scaling 428 | self.mean_square_grads = None 429 | self._first_time = True 430 | self.max_colm_norm = max_colm_norm 431 | self.max_norm = max_norm 432 | 433 | def get_updates(self, learning_rate, params, grads, lr_scalers): 434 | """Compute the parameters' updates. 
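        Each parameter keeps a running average of its squared gradients,
            E[g^2]_t = decay * E[g^2]_{t-1} + (1 - decay) * g_t ** 2,
        and the update is
            delta_t = -lr * g_t / max(sqrt(E[g^2]_t), 1 / max_scaling).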
435 | 436 | """ 437 | if self._first_time: 438 | self.mean_square_grads = [ 439 | sharedX_mtx( 440 | param.get_value() * 0., 441 | name='mean_square_grad_'+param.name, 442 | borrow=True) for param in params] 443 | self._first_time = False 444 | updates = [] 445 | for (param, grad, mean_square_grad, lr_sc) in zip( 446 | params, grads, self.mean_square_grads, lr_scalers): 447 | new_mean_square_grad = ( 448 | self.decay * mean_square_grad + (1-self.decay) * T.sqr(grad)) 449 | # the update 450 | rms_grad_t = T.sqrt(new_mean_square_grad) 451 | rms_grad_t = T.maximum(rms_grad_t, self.epsilon) 452 | lr_scaled = learning_rate * lr_sc 453 | delta_x_t = - lr_scaled * grad / rms_grad_t 454 | 455 | new_param = param + delta_x_t 456 | # updates 457 | if self.max_colm_norm and param.name in ["W", "w"]: 458 | new_param_final = norm_constraint(tensor_var=new_param, 459 | max_norm=self.max_norm) 460 | else: 461 | new_param_final = new_param 462 | updates.append((param, new_param_final)) 463 | updates.append((mean_square_grad, new_mean_square_grad)) 464 | 465 | return updates 466 | 467 | 468 | class Adam(LearningRule): 469 | """ 470 | Implement Adaptive Moment Estimation. 471 | Adam updates implemented as in [1]_. 472 | Parameters: 473 | beta1 : float 474 | Exponential decay rate for the first moment estimates. 475 | beta2 : float 476 | Exponential decay rate for the second moment estimates. 477 | epsilon : float 478 | Constant for numerical stability. 479 | Credit: 480 | https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py 481 | References 482 | ---------- 483 | .. [1] Kingma, Diederik, and Jimmy Ba (2014): 484 | Adam: A Method for Stochastic Optimization. 485 | arXiv preprint arXiv:1412.6980. 486 | """ 487 | def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, 488 | max_colm_norm=False, max_norm=15.0): 489 | self.beta1 = sharedX_value(beta1, name='beta1', borrow=True) 490 | self.beta2 = sharedX_value(beta2, name='beta2', borrow=True) 491 | self.epsilon = sharedX_value(epsilon, name='epsilon', borrow=True) 492 | self.max_colm_norm = max_colm_norm 493 | self.max_norm = max_norm 494 | 495 | def get_updates(self, learning_rate, params, grads, lr_scalers): 496 | """Compute the parameters' updates. 497 | 498 | """ 499 | t_prev = theano.shared(floatX(0.)) 500 | updates = OrderedDict() 501 | 502 | # Using theano constant to prevent upcasting of float32 503 | one = T.constant(1) 504 | 505 | t = t_prev + 1 506 | a_t = learning_rate*T.sqrt(one-self.beta2**t)/(one-self.beta1**t) 507 | 508 | for param, g_t in zip(params, grads): 509 | value = param.get_value(borrow=True) 510 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 511 | broadcastable=param.broadcastable) 512 | v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 513 | broadcastable=param.broadcastable) 514 | 515 | m_t = self.beta1*m_prev + (one-self.beta1)*g_t 516 | v_t = self.beta2*v_prev + (one-self.beta2)*g_t**2 517 | step = a_t*m_t/(T.sqrt(v_t) + self.epsilon) 518 | 519 | updates[m_prev] = m_t 520 | updates[v_prev] = v_t 521 | new_param = param - step 522 | if self.max_colm_norm and param.name in ["W", "w"]: 523 | new_param_final = norm_constraint(tensor_var=new_param, 524 | max_norm=self.max_norm) 525 | else: 526 | new_param_final = new_param 527 | updates[param] = new_param_final 528 | 529 | updates[t_prev] = t 530 | 531 | return updates 532 | 533 | 534 | class Adamax(LearningRule): 535 | """ 536 | Adamax updates. 537 | Adamax updates implemented as in [1]_. 
This is a variant of of the Adam 538 | algorithm based on the infinity norm. 539 | Parameters: 540 | beta1 : float 541 | Exponential decay rate for the first moment estimates. 542 | beta2 : float 543 | Exponential decay rate for the weighted infinity norm estimates. 544 | epsilon : float 545 | Constant for numerical stability. 546 | Credit: 547 | https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py 548 | 549 | References 550 | ---------- 551 | .. [1] Kingma, Diederik, and Jimmy Ba (2014): 552 | Adam: A Method for Stochastic Optimization. 553 | arXiv preprint arXiv:1412.6980. 554 | """ 555 | def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, 556 | max_colm_norm=False, max_norm=15.0): 557 | self.beta1 = sharedX_value(beta1, name='beta1', borrow=True) 558 | self.beta2 = sharedX_value(beta2, name='beta2', borrow=True) 559 | self.epsilon = sharedX_value(epsilon, name='epsilon', borrow=True) 560 | self.max_colm_norm = max_colm_norm 561 | self.max_norm = max_norm 562 | 563 | def get_updates(self, learning_rate, params, grads, lr_scalers): 564 | """Compute the parameters' updates. 565 | 566 | """ 567 | t_prev = theano.shared(floatX(0.)) 568 | updates = OrderedDict() 569 | 570 | # Using theano constant to prevent upcasting of float32 571 | one = T.constant(1) 572 | 573 | t = t_prev + 1 574 | a_t = learning_rate/(one-self.beta1**t) 575 | 576 | for param, g_t in zip(params, grads): 577 | value = param.get_value(borrow=True) 578 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 579 | broadcastable=param.broadcastable) 580 | u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 581 | broadcastable=param.broadcastable) 582 | 583 | m_t = self.beta1*m_prev + (one-self.beta1)*g_t 584 | u_t = T.maximum(self.beta2*u_prev, abs(g_t)) 585 | step = a_t*m_t/(u_t + self.epsilon) 586 | 587 | updates[m_prev] = m_t 588 | updates[u_prev] = u_t 589 | new_param = param - step 590 | if self.max_colm_norm and param.name in ["W", "w"]: 591 | new_param_final = norm_constraint(tensor_var=new_param, 592 | max_norm=self.max_norm) 593 | else: 594 | new_param_final = new_param 595 | updates[param] = new_param_final 596 | 597 | updates[t_prev] = t 598 | 599 | return updates 600 | -------------------------------------------------------------------------------- /mnist_manip.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.datasets import cifar10 3 | import numpy as np 4 | import os 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import cPickle as pkl 8 | from tools import add_noise 9 | from tools import add_cifar_10 10 | from scipy import ndimage 11 | 12 | 13 | def repeat_it(x, y, nbr): 14 | out_x, out_y = None, None 15 | for i in range(nbr): 16 | gen = add_noise(x) 17 | if out_x is None: 18 | out_x = gen 19 | out_y = y 20 | else: 21 | out_x = np.vstack((out_x, gen)) 22 | out_y = np.hstack((out_y, y)) 23 | return out_x, out_y 24 | 25 | 26 | def repeat_it_cifar(x, y, nbr, x_cifar): 27 | out_x, out_y = None, None 28 | for i in range(nbr): 29 | gen = add_cifar_10(x, x_cifar) 30 | if out_x is None: 31 | out_x = gen 32 | out_y = y 33 | else: 34 | out_x = np.vstack((out_x, gen)) 35 | out_y = np.hstack((out_y, y)) 36 | return out_x, out_y 37 | 38 | # MNIST + noise 39 | #path_data = "./data/mnist.pkl" 40 | #f = open(path_data, 'r') 41 | #train, valid, test = pkl.load(f) 42 | #trainx, trainy = train[0], train[1] 43 | #validx, validy = valid[0], valid[1] 44 | #testx, testy = test[0], test[1] 45 | # 46 | ## random noise 47 
| #times_tr, times_vl, times_ts = 2, 2, 5 48 | # 49 | #trainx_noise, trainy_new = repeat_it(trainx, trainy, times_tr) 50 | #validx_noise, validy_new = repeat_it(validx, validy, times_vl) 51 | #testx_noise, testy_new = repeat_it(testx, testy, times_ts) 52 | # 53 | #stuff = [(trainx_noise, trainy_new), (validx_noise, validy_new), 54 | # (testx_noise, testy_new)] 55 | #with open("./data/mnist_noise.pkl", "w") as f: 56 | # pkl.dump(stuff, f) 57 | #path = "./data/mnist_noise/" 58 | #if not os.path.exists(path): 59 | # os.makedirs(path) 60 | #for k in range(trainx_noise.shape[0]): 61 | # fig = plt.figure() 62 | # plt.imshow(trainx_noise[k].reshape(28, 28), cmap='gray') 63 | # fig.savefig(path + str(k) + ".png") 64 | # # blurred 65 | # if k == 10: 66 | # break 67 | 68 | 69 | # MNIST + cifar 10. 70 | path_data = "./data/mnist.pkl" 71 | f = open(path_data, 'r') 72 | train, valid, test = pkl.load(f) 73 | trainx, trainy = train[0], train[1] 74 | validx, validy = valid[0], valid[1] 75 | testx, testy = test[0], test[1] 76 | (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = cifar10.load_data() 77 | 78 | # random noise 79 | times_tr, times_vl, times_ts = 2, 2, 5 80 | 81 | trainx_noise, trainy_new = repeat_it_cifar(trainx, trainy, times_tr, x_train_cifar[:40000]) 82 | validx_noise, validy_new = repeat_it_cifar(validx, validy, times_vl, x_train_cifar[40000:]) 83 | testx_noise, testy_new = repeat_it_cifar(testx, testy, times_ts, x_test_cifar) 84 | 85 | stuff = [(trainx_noise, trainy_new), (validx_noise, validy_new), 86 | (testx_noise, testy_new)] 87 | with open("./data/mnist_img.pkl", "w") as f: 88 | pkl.dump(stuff, f) 89 | path = "./data/mnist_img/" 90 | if not os.path.exists(path): 91 | os.makedirs(path) 92 | for k in range(trainx_noise.shape[0]): 93 | fig = plt.figure() 94 | plt.imshow(trainx_noise[k].reshape(28, 28), cmap='gray') 95 | fig.savefig(path + str(k) + ".png") 96 | # blurred 97 | if k == 10: 98 | break -------------------------------------------------------------------------------- /non_linearities.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def relu(x): 5 | return T.switch(x > 0, x, 0) 6 | 7 | 8 | class NonLinearity: 9 | RELU = "rectifier" 10 | TANH = "tanh" 11 | SIGMOID = "sigmoid" 12 | SOFTMAX = "softmax" 13 | 14 | 15 | def softmax(x): 16 | return T.exp(x)/(T.exp(x).sum(1, keepdims=True)) 17 | 18 | 19 | def get_non_linearity_fn(nonlinearity): 20 | if nonlinearity == NonLinearity.SIGMOID: 21 | return T.nnet.sigmoid 22 | elif nonlinearity == NonLinearity.RELU: 23 | return relu 24 | elif nonlinearity == NonLinearity.TANH: 25 | return T.tanh 26 | elif nonlinearity == NonLinearity.SOFTMAX: 27 | return softmax # T.nnet.softmax 28 | elif nonlinearity is None: 29 | return None 30 | 31 | 32 | def get_non_linearity_str(nonlinearity): 33 | if nonlinearity == T.nnet.sigmoid: 34 | return NonLinearity.SIGMOID 35 | elif nonlinearity == relu: 36 | return NonLinearity.RELU 37 | elif nonlinearity == T.tanh: 38 | return NonLinearity.TANH 39 | elif nonlinearity == T.nnet.softmax: 40 | return None # we do not use any non-linearity. 41 | elif nonlinearity == softmax: 42 | return None # we do not use any non-linearity. 
43 | elif nonlinearity is None: 44 | return None 45 | else: 46 | raise ValueError("Unknown non-linearity") 47 | 48 | 49 | class CostType: 50 | MeanSquared = "MeanSquaredCost" 51 | CrossEntropy = "CrossEntropy" 52 | NegativeLogLikelihood = "NegativelogLikelihood" 53 | -------------------------------------------------------------------------------- /normalization.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/ 2 | # normalization.py#L120-L320 3 | import theano 4 | import numpy as np 5 | from theano import tensor as T 6 | 7 | 8 | class BatchNormLayer(object): 9 | """ Implementation of batch normalization from the paper: 10 | Ioffe, Sergey and Szegedy, Christian (2015): 11 | Batch Normalization: Accelerating Deep Network Training by Reducing 12 | Internal Covariate Shift. http://arxiv.org/abs/1502.03167. 13 | """ 14 | def __init__(self, input_shape, axes='auto', epsilon=1e-4, alpha=0.1, 15 | beta=0., gamma=0, mean=0, inv_std=1): 16 | self.input_shape = input_shape 17 | if axes == 'auto': 18 | # default normalizationover lla but the not the second axis. 19 | axes = (0,) + tuple(range(2, len(self.input_shape))) 20 | elif isinstance(axes, int): 21 | axes = (axes,) 22 | self.axes = axes 23 | self.epsilon = epsilon 24 | self.alpha = alpha 25 | 26 | # create params 27 | shape = [size for axis, size in enumerate(self.input_shape) 28 | if axis not in self.axes] 29 | if any(size is None for size in shape): 30 | raise ValueError("BatchNormLayer needs specified input shape for " 31 | "all axes not normalized over.") 32 | if beta is None: 33 | self.beta = 0. 34 | else: 35 | value = np.ones(shape, dtype=theano.config.floatX) * beta 36 | self.beta = theano.shared(value=value.astype(theano.config.floatX), 37 | name="beta", borrow=True) 38 | 39 | if gamma is None: 40 | self.gamma = 0. 41 | else: 42 | value = np.ones(shape, dtype=theano.config.floatX) * gamma 43 | self.gamma = theano.shared( 44 | value=value.astype(theano.config.floatX), 45 | name="gamma", borrow=True) 46 | 47 | value = np.ones(shape, dtype=theano.config.floatX) * mean 48 | self.mean = theano.shared(value=value.astype(theano.config.floatX), 49 | name="mean", borrow=True) 50 | 51 | value = np.ones(shape, dtype=theano.config.floatX) * inv_std 52 | self.inv_std = theano.shared(value=value.astype(theano.config.floatX), 53 | name="inv_std", borrow=True) 54 | self.params = [self.beta, self.gamma] 55 | self.stats = [self.mean, self.inv_std] 56 | 57 | def get_output_for(self, input, deterministic=False, 58 | batch_norm_use_averages=None, 59 | batch_norm_update_averages=None): 60 | input_mean = input.mean(self.axes) 61 | input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon)) 62 | 63 | # decide whether to use the sotred averages or mini-batch statistics 64 | if batch_norm_use_averages is None: 65 | batch_norm_use_averages = deterministic 66 | use_averages = batch_norm_use_averages 67 | 68 | if use_averages: 69 | mean = self.mean 70 | inv_std = self.inv_std 71 | else: 72 | mean = input_mean 73 | inv_std = input_inv_std 74 | 75 | # decide whether to update the stored averages 76 | if batch_norm_update_averages is None: 77 | batch_norm_update_averages = not deterministic 78 | update_averages = batch_norm_update_averages 79 | 80 | if update_averages: 81 | # Trick: To update the stored statistics, we create memory-aliased 82 | # clones of the stored statistics. 
83 | running_mean = theano.clone(self.mean, share_inputs=False) 84 | running_inv_std = theano.clone(self.inv_std, share_inputs=False) 85 | # set a default update for them 86 | running_mean.default_update = ((1 - self.alpha) * running_mean + 87 | self.alpha * input_mean) 88 | running_inv_std.default_update = ((1 - self.alpha) * 89 | running_inv_std + 90 | self.alpha * input_inv_std) 91 | # and make sure they end up in the graph without participating in 92 | # the computation (this way their default_update will be collected 93 | # and applied, but the computation will be optimized away): 94 | mean += 0 * running_mean 95 | inv_std += 0 * running_inv_std 96 | # prepare dimshuffle pattern inserting broadcastable axes as needed 97 | param_axes = iter(range(input.ndim - len(self.axes))) 98 | pattern = ['x' if input_axis in self.axes 99 | else next(param_axes) 100 | for input_axis in range(input.ndim)] 101 | 102 | # apply dimshuffle pattern to all parameters 103 | beta = 0 if self.beta is None else self.beta.dimshuffle(pattern) 104 | gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern) 105 | mean = mean.dimshuffle(pattern) 106 | inv_std = inv_std.dimshuffle(pattern) 107 | 108 | # normalize 109 | normalized = (input - mean) * (gamma * inv_std) + beta 110 | return normalized 111 | -------------------------------------------------------------------------------- /outputjobs/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the output of the [Slurm](https://slurm.schedmd.com/) jobs. -------------------------------------------------------------------------------- /p100.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_p100 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory (MB) 35 | #SBATCH --mem-per-cpu 3000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | # THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python trainLenet.py lenet_0_1000_3_0_0_1_0_0_True_False_False_False_False.yaml 51 | 52 | -------------------------------------------------------------------------------- /plot_paper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import cPickle as pkl 4 | from keras.datasets import cifar10 5 | from tools import add_noise 6 | from tools import add_cifar_10 7 | import copy 8 | 9 | 10 | path_data = "./data/out.pkl" 11 | f = open(path_data, 
'r') 12 | train, valid, test = pkl.load(f) 13 | trainx, trainy = train[0], train[1] 14 | print trainy 15 | (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = cifar10.load_data() 16 | # with open("./data/cifar10_data.pkl", 'w') as fx: 17 | # pkl.dump(((x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar)), fx) 18 | #with open("./data/cifar10_data.pkl", 'r') as fx: 19 | # (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = pkl.load(fx) 20 | 21 | trainx_noise = add_noise(copy.deepcopy(trainx)) 22 | ind = [0, 1, 6, 3, 7] 23 | trainx_img = add_cifar_10(copy.deepcopy(trainx), x_train_cifar[ind], sh=False) 24 | for k in range(trainx_img.shape[0]): 25 | fig = plt.figure() 26 | plt.imshow(trainx_img[k].reshape(28, 28), cmap='gray') 27 | fig.savefig("./data/"+ str(k) + ".png") 28 | x = np.vstack((trainx, trainx_noise, trainx_img)) 29 | # Plot 30 | fig, axes = plt.subplots(3, 5, figsize=(12, 6), 31 | subplot_kw={'xticks': [], 'yticks': []}) 32 | 33 | 34 | i = 0 35 | for ax in axes.flat: 36 | print x_train_cifar.shape 37 | # img = x_train_cifar[i, :, :, 0].reshape(32, 32) 38 | img = x[i, :].reshape(28, 28) 39 | ax.imshow(img, cmap='gray', interpolation="bilinear") 40 | ax.set_aspect("auto") 41 | ax.set_xticklabels([]) 42 | ax.set_yticklabels([]) 43 | i += 1 44 | 45 | fig.subplots_adjust(hspace=0.01, wspace=0.01) 46 | fig.savefig("./data/samples.eps", format="eps", dpi=300) 47 | -------------------------------------------------------------------------------- /submit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sbatch ./jobs/10_1000_0.sl 3 | sbatch ./jobs/10_1000_1.sl 4 | sbatch ./jobs/10_1000_2.sl 5 | sbatch ./jobs/10_1000_3.sl 6 | sbatch ./jobs/10_1000_4.sl 7 | sbatch ./jobs/10_1000_5.sl 8 | sbatch ./jobs/10_1000_6.sl 9 | -------------------------------------------------------------------------------- /train3_bin.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import numpy as np 3 | import theano.tensor as T 4 | import os 5 | import sys 6 | import datetime as DT 7 | import shutil 8 | import inspect 9 | import theano 10 | import warnings 11 | import yaml 12 | 13 | from tools import ModelMLP 14 | from tools import NonLinearity 15 | from tools import split_data_to_minibatchs_eval 16 | from tools import sharedX_value 17 | from tools import theano_fns 18 | from learning_rule import AdaDelta 19 | from learning_rule import RMSProp 20 | from learning_rule import Momentum 21 | from tools import evaluate_model 22 | from tools import collect_stats_epoch 23 | from tools import plot_stats 24 | from tools import train_one_epoch 25 | from tools import train_one_epoch_alter 26 | from tools import to_categorical 27 | from tools import plot_classes 28 | from tools import chunks 29 | from tools import plot_penalty_vl 30 | from tools import plot_debug_grad 31 | from tools import plot_debug_ratio_grad 32 | 33 | 34 | # Parse the yaml config. 35 | config_path = "./config_yaml/" 36 | with open(config_path + sys.argv[1], 'r') as fy: 37 | config_exp = yaml.load(fy) 38 | 39 | x_classes = 2 40 | 41 | cs = [1, 7] 42 | debug_code = config_exp["debug_code"] 43 | 44 | if debug_code: 45 | warnings.warn("YOU ARE IN DEBUG MODE! 
YOUR CODE WILL TAKE MORE TIME!!!!!") 46 | 47 | 48 | def standerize(d, mu=None, sigma=None): 49 | if mu is None: 50 | mu = np.mean(d, axis=0) 51 | sigma = np.std(d, axis=0) 52 | if sigma.nonzero()[0].shape[0] == 0: 53 | raise Exception("std found to be zero!!!!") 54 | norm_d = (d - mu) / sigma 55 | 56 | return norm_d, mu, sigma 57 | 58 | 59 | def get_class_c(x, y, c, nbr): 60 | ind = np.argwhere(y == c) 61 | x_out = x[ind] 62 | y_out = y[ind] 63 | x_out = x_out.reshape(x_out.shape[0], x.shape[1]) 64 | y_out = y_out.reshape(y_out.shape[0],) 65 | 66 | if nbr is not None: 67 | x_out = x_out[:nbr, :] 68 | y_out = y_out[:nbr] 69 | return x_out, y_out 70 | 71 | 72 | def get_data(cs, x, y, nbr, shuffle=False): 73 | datax, datay = None, None 74 | for c in cs: 75 | xx, yy = get_class_c(x, y, c, nbr) 76 | 77 | if datax is None: 78 | datax = xx 79 | datay = yy 80 | else: 81 | datax = np.vstack((datax, xx)) 82 | datay = np.hstack((datay, yy)) 83 | # suffle 84 | if shuffle: 85 | megaxy = np.hstack((datax, datay.reshape(datay.size, 1))) 86 | for i in range(100): 87 | np.random.shuffle(megaxy) 88 | datax = megaxy[:, :-1] 89 | datay = megaxy[:, -1] 90 | else: 91 | return datax, datay 92 | return datax, datay 93 | 94 | 95 | def rename_classes(y): 96 | un = np.unique(y) 97 | un = np.array(sorted(un)) 98 | y_out = y * 0 99 | for u, re in zip(un, range(un.size)): 100 | ind = np.argwhere(y == u) 101 | y_out[ind] = re 102 | return y_out 103 | 104 | 105 | def get_inter_output(model, l_tst, testx_sh): 106 | i_x_vl = T.lvector("ixtst") 107 | 108 | eval_fn_tst = theano.function( 109 | [i_x_vl], 110 | [l.output for l in model.layers], 111 | givens={model.x: testx_sh[i_x_vl]}) 112 | output_v = [ 113 | eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 114 | nbr_layers = len(output_v[0]) 115 | 116 | l_val = [] 117 | for l in range(nbr_layers): 118 | tmp = None 119 | for k in output_v: 120 | if tmp is None: 121 | tmp = k[l] 122 | else: 123 | tmp = np.vstack((tmp, k[l])) 124 | l_val.append(tmp) 125 | 126 | return l_val 127 | 128 | 129 | def generate_2d_checkboard(x_born, y_born, s, ss): 130 | """x_born: [-1, 1], y_born:[-1, 1], s=10, ss=20 131 | """ 132 | linex = np.linspace(x_born[0], x_born[1], s, endpoint=False) 133 | liney = np.linspace(y_born[0], y_born[1], s, endpoint=False) 134 | x, y = [], [] 135 | start_y = True 136 | for ix in range(linex.size - 1): 137 | lx, lxnext = linex[ix], linex[ix+1] 138 | for iy in range(liney.size - 1): 139 | ly, lynext = liney[iy], liney[iy+1] 140 | linexx = np.linspace(lx, lxnext, ss, endpoint=False) 141 | lineyy = np.linspace(ly, lynext, ss, endpoint=False) 142 | xv, yv = np.meshgrid(linexx, lineyy) 143 | for i in range(xv.shape[0]): 144 | for j in range(yv.shape[0]): 145 | x.append([xv[i, j], yv[i, j]]) 146 | y.append(start_y) 147 | start_y = not start_y 148 | 149 | y = np.array(y) * 1. 150 | x = np.array(x) 151 | mega = np.hstack((x, y.reshape(y.size, 1))) 152 | for i in range(500): 153 | np.random.shuffle(mega) 154 | print i 155 | x = mega[:, :-1] 156 | y = mega[:, -1] 157 | print x.shape, y.shape 158 | fig = plot_classes(y, x, "", 0., "generated 2D: checkboard.") 159 | fig.savefig("data/2d/cb2d_generated.png", bbox_inches='tight') 160 | return x, y 161 | 162 | 163 | def create_tr_vl_ts_cb(path): 164 | if not os.path.exists(path): 165 | os.makedirs(path) 166 | x, y = generate_2d_checkboard([-1, 1], [-1, 1], 10, 20) 167 | nbr = x.shape[0] 168 | l1 = int(nbr*2/3.) 169 | l2 = int(nbr * ((2/3.) 
+ 1/6.)) 170 | trainx, trainy = x[:l1, :], y[:l1] 171 | validx, validy = x[l1:l2, :], y[l1:l2] 172 | testx, testy = x[l2:, :], y[l2:] 173 | trfig = plot_classes(trainy, trainx, "", 0., "g.tr 2D: checkboard.") 174 | vlfig = plot_classes(validy, validx, "", 0., "g.vl 2D: checkboard.") 175 | tsfig = plot_classes(testy, testx, "", 0., "g.tst 2D: checkboard.") 176 | trfig.savefig(path + "/traingfig.png", bbox_inches='tight') 177 | vlfig.savefig(path + "/validfig.png", bbox_inches='tight') 178 | tsfig.savefig(path + "/testfig.png", bbox_inches='tight') 179 | # dump 180 | with open(path+"/cb.pkl", "w") as f: 181 | stuff = {"trainx": trainx, "trainy": trainy, 182 | "validx": validx, "validy": validy, 183 | "testx": testx, "testy": testy} 184 | pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 185 | 186 | # create_tr_vl_ts_cb("data/2d") 187 | 188 | 189 | def generate_2d_data_bin(nbr, mn1, cov1, mn2, cov2): 190 | """Generate 2D points using multivariate normal distribution. 191 | nbr: number of samples per class.""" 192 | x1 = np.random.multivariate_normal(mn1, cov1, nbr) 193 | x2 = np.random.multivariate_normal(mn2, cov2, nbr) 194 | y1 = np.zeros((nbr, 1), dtype=np.float32) 195 | y2 = np.ones((nbr, 1), dtype=np.float32) 196 | x = np.vstack((x1, x2)) 197 | y = np.vstack((y1, y2)) 198 | print x.shape, y.shape 199 | mega = np.hstack((x, y.reshape(y.size, 1))) 200 | for i in range(100): 201 | np.random.shuffle(mega) 202 | x = mega[:, :-1] 203 | y = mega[:, -1] 204 | fig = plot_classes(y, x, "", 0., "generated 2D: multivariate normal.") 205 | return x, y, fig 206 | 207 | 208 | def generate_all_2d_data(path): 209 | if not os.path.exists(path): 210 | os.makedirs(path) 211 | mn1, cov1 = [1, 0], [[1, -0.5], [-0.5, 1]] 212 | mn2, cov2 = [4, 0], [[1, 0], [0, 1]] 213 | trainx, trainy, trainfig = generate_2d_data_bin(25000, mn1, 214 | cov1, mn2, cov2) 215 | minx = np.min(trainx, axis=0) 216 | maxx = np.max(trainx, axis=0) 217 | 218 | trainx = (trainx - minx)/(maxx - minx) 219 | trainfig = plot_classes(trainy, trainx, "", 0., 220 | "generated 2D: multivariate normal.") 221 | validx, validy, validfig = generate_2d_data_bin(5000, mn1, 222 | cov1, mn2, cov2) 223 | validx = (validx - minx)/(maxx - minx) 224 | validfig = plot_classes(trainy, trainx, "", 0., 225 | "generated 2D: multivariate normal.") 226 | testx, testy, testfig = generate_2d_data_bin(5000, mn1, 227 | cov1, mn2, cov2) 228 | trainfig.savefig(path + "/traingfig.png", bbox_inches='tight') 229 | validfig.savefig(path + "/validfig.png", bbox_inches='tight') 230 | testfig.savefig(path + "/testfig.png", bbox_inches='tight') 231 | 232 | 233 | def generate_nested_circles(n): 234 | limits = [0, 1./3, 2./3, 1, 2] 235 | np.random.seed(0) 236 | X = np.random.rand(n, 2)*2-1 237 | Xd = np.sqrt((X**2).sum(axis=1)) 238 | Y = np.zeros((n, ), dtype='bool') 239 | classe = True 240 | for b1, b2 in zip(limits[:-1], limits[1:]): 241 | (idx, ) = np.nonzero(np.logical_and(b1 < Xd, Xd <= b2)) 242 | Y[idx] = classe 243 | classe = not classe 244 | Y = Y.astype(np.float32) 245 | mega = np.hstack((X, Y.reshape(Y.size, 1))) 246 | for i in range(500): 247 | np.random.shuffle(mega) 248 | print i 249 | x = mega[:, :-1] 250 | y = mega[:, -1] 251 | print x.shape, y.shape 252 | fig = plot_classes(y, x, "", 0., "generated 2D: nested circles.") 253 | fig.savefig("data/nestedcircle/nc_generated.png", bbox_inches='tight') 254 | return x, y 255 | 256 | 257 | def create_tr_vl_ts_nc(path, n): 258 | """nested circles""" 259 | if not os.path.exists(path): 260 | os.makedirs(path) 261 | x, y = 
generate_nested_circles(n) 262 | nbr = x.shape[0] 263 | l1 = int(nbr*2/3.) 264 | l2 = int(nbr * ((2/3.) + 1/6.)) 265 | trainx, trainy = x[:l1, :], y[:l1] 266 | validx, validy = x[l1:l2, :], y[l1:l2] 267 | testx, testy = x[l2:, :], y[l2:] 268 | trfig = plot_classes( 269 | trainy, trainx, "", 0., "g.tr 2D: nested circles." + str(l1)) 270 | vlfig = plot_classes( 271 | validy, validx, "", 0., "g.vl 2D: nested circles." + str(l2-l1)) 272 | tsfig = plot_classes( 273 | testy, testx, "", 0., "g.tst 2D: nested circles." + str(y.size - l2)) 274 | trfig.savefig(path + "/traingfig.png", bbox_inches='tight') 275 | vlfig.savefig(path + "/validfig.png", bbox_inches='tight') 276 | tsfig.savefig(path + "/testfig.png", bbox_inches='tight') 277 | # dump 278 | with open(path+"/nc.pkl", "w") as f: 279 | stuff = {"trainx": trainx, "trainy": trainy, 280 | "validx": validx, "validy": validy, 281 | "testx": testx, "testy": testy} 282 | pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 283 | 284 | 285 | # def knn1(model, l_tst, testx_sh, l_tr, trainx_sh): 286 | 287 | # create_tr_vl_ts_nc("data/nestedcircle", 50000) 288 | 289 | # DATA MNIST 290 | # ============================================================================= 291 | path_data = "data/mnist.pkl" 292 | f = open(path_data, 'r') 293 | train, valid, test = pkl.load(f) 294 | trainx, trainy = train[0], train[1] 295 | validx, validy = valid[0], valid[1] 296 | testx, testy = test[0], test[1] 297 | 298 | # How much to take for training? 299 | nbr_sup = config_exp["nbr_sup"] 300 | run = config_exp["run"] 301 | print "RUN:", run 302 | print "SUP: ", nbr_sup 303 | # get the data of each class 304 | # train 305 | nbx = nbr_sup / len(cs) 306 | trainx, trainy = get_data(cs, trainx, trainy, nbx, True) 307 | validx, validy = get_data(cs, validx, validy, None, False) 308 | testx, testy = get_data(cs, testx, testy, None, False) 309 | 310 | # convert the name of the classes from the ream name to: 0, 1, 2, ... 311 | testy_int = testy 312 | trainy = rename_classes(trainy) 313 | validy = rename_classes(validy) 314 | testy = rename_classes(testy) 315 | with open("data/mnist_bin17.pkl", "w") as f17: 316 | stuff = ((trainx, trainy), (validx, validy), (testx, testy)) 317 | print trainx.shape, validx.shape, testx.shape 318 | pkl.dump(stuff, f17) 319 | sys.exit() 320 | # ============================================================================= 321 | 322 | # DATA MNIST -- end 323 | 324 | # DATA CHECKBOARD 325 | #============================================================================== 326 | # p_data = "nestedcircle" 327 | # path_data = "data/" + p_data + "/nc.pkl" 328 | # f = open(path_data, 'r') 329 | # stuff = pkl.load(f) 330 | # trainx, trainy = stuff["trainx"], stuff["trainy"] 331 | # validx, validy = stuff["validx"], stuff["validy"] 332 | # testx, testy = stuff["testx"], stuff["testy"] 333 | # 334 | # 335 | # # How much to take for training? 336 | # nbr_sup = int(sys.argv[6]) 337 | # run = int(sys.argv[7]) 338 | # print "RUN:", run 339 | # print "SUP: ", nbr_sup 340 | # trainx, trainy = trainx[:nbr_sup, :], trainy[:nbr_sup] 341 | # trfig = plot_classes( 342 | # trainy, trainx, "", 0., "tr 2D: nested circles." 
+ str(nbr_sup)) 343 | # trfig.savefig("data/" + p_data + "/trfig_" + str(nbr_sup) + ".png", 344 | # bbox_inches='tight') 345 | # 346 | # testy_int = testy 347 | #============================================================================== 348 | 349 | # DATA CHECKBOARD --end 350 | 351 | # Prepare the pre-shuffling 352 | if not os.path.exists("data/" + str(nbr_sup)): 353 | os.makedirs("data/" + str(nbr_sup)) 354 | trainx_tmp = trainx 355 | trainy_tmp = trainy 356 | 357 | big_mtx = np.hstack((trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 358 | print "Going to shuffle the train data. It takes some time ..." 359 | period = 200 360 | i = 0 361 | #for k in xrange(5000): 362 | # np.random.shuffle(big_mtx) 363 | # if k % period == 0: 364 | # trainx_tmp2 = big_mtx[:, 0:trainx_tmp.shape[1]] 365 | # trainy_tmp2 = big_mtx[:, -1] 366 | # stuff = {"x": trainx_tmp2, "y": trainy_tmp2} 367 | # print k 368 | # with open("data/"+str(nbr_sup) + "/" + str(i) + ".pkl", 'w') as f: 369 | # pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 370 | # i += 1 371 | 372 | #with open("data/"+str(nbr_sup) + "/0.pkl") as f: 373 | # stuff = pkl.load(f) 374 | # trainx, trainy = stuff["x"], stuff["y"] 375 | # share over gpu: we can store the whole mnist over the gpu. 376 | # Train 377 | trainx_sh = theano.shared(trainx.astype(theano.config.floatX), 378 | name="trainx", borrow=True) 379 | trainlabels_sh = theano.shared(trainy.astype(theano.config.floatX), 380 | name="trainlabels", borrow=True) 381 | trainy_sh = theano.shared(to_categorical(trainy, x_classes).astype( 382 | theano.config.floatX), name="trainy", borrow=True) 383 | 384 | # valid 385 | validx_sh = theano.shared(validx.astype(theano.config.floatX), 386 | name="validx", borrow=True) 387 | validlabels_sh = theano.shared(validy.astype(theano.config.floatX), 388 | name="validlabels", borrow=True) 389 | # 390 | input = T.fmatrix("x") 391 | input1 = T.fmatrix("x1") 392 | input2 = T.fmatrix("x2") 393 | rng = np.random.RandomState(23455) 394 | # Architecture 395 | nhid_l0 = 300 396 | nhid_l1 = 200 397 | nhid_l2 = 100 398 | 399 | nbr_classes = x_classes 400 | h_ind = config_exp["h_ind"] 401 | h_ind = [int(tt) for tt in h_ind] 402 | 403 | assert len(h_ind) == 4 404 | h0, h1, h2, h3, h4, h5, h6, h7, h8 = None, None, None, None, None, None, None,\ 405 | None, None 406 | l_v = [] 407 | for xx in h_ind: 408 | print xx 409 | if int(xx) == 1: 410 | l_v.append(True) 411 | elif int(xx) == 0: 412 | l_v.append(False) 413 | else: 414 | raise ValueError("Error in applying hint: 0/1") 415 | 416 | hint_type = "l2mean" # "l1mean" 417 | print l_v 418 | corrupt_input_l = config_exp["corrupt_input_l"] 419 | if corrupt_input_l != 0.: 420 | warnings.warn( 421 | "YOU ASKED TO USE DENOISING PROCESS OVER THE INPUTS OF THE FIRST LAYER" 422 | ) 423 | if not config_exp["hint"]: 424 | raise ValueError( 425 | "You asked for densoing process but you are not using the penalty") 426 | start_corrupting = config_exp["start_corrupting"] 427 | warnings.warn( 428 | "CORRUPTION WILL START AFTER:" + str(start_corrupting) + " epochs!!!!!!") 429 | use_sparsity = config_exp["use_sparsity"] 430 | use_sparsity_in_pred = config_exp["use_sparsity_in_pred"] 431 | print "Use sparsity: ", use_sparsity 432 | print "Use sparsity in pred:", use_sparsity_in_pred 433 | layer0 = { 434 | "rng": rng, 435 | "n_in": trainx.shape[1], 436 | "n_out": nhid_l0, 437 | "W": None, 438 | "b": None, 439 | "activation": NonLinearity.SIGMOID, 440 | "hint": hint_type, 441 | "use_hint": l_v[0], 442 | "intended_to_be_corrupted": True, 443 
| "corrupt_input_l": corrupt_input_l, 444 | "use_sparsity": use_sparsity, 445 | "use_sparsity_in_pred": use_sparsity_in_pred 446 | } 447 | 448 | layer1 = { 449 | "rng": rng, 450 | "n_in": nhid_l0, 451 | "n_out": nhid_l1, 452 | "W": None, 453 | "b": None, 454 | "activation": NonLinearity.SIGMOID, 455 | "hint": hint_type, 456 | "use_hint": l_v[1], 457 | "use_sparsity": use_sparsity, 458 | "use_sparsity_in_pred": use_sparsity_in_pred 459 | } 460 | 461 | layer2 = { 462 | "rng": rng, 463 | "n_in": nhid_l1, 464 | "n_out": nhid_l2, 465 | "W": None, 466 | "b": None, 467 | "activation": NonLinearity.SIGMOID, 468 | "hint": hint_type, 469 | "use_hint": l_v[2], 470 | "use_sparsity": use_sparsity, 471 | "use_sparsity_in_pred": use_sparsity_in_pred 472 | } 473 | 474 | #layer3 = { 475 | # "rng": rng, 476 | # "n_in": nhid_l2, 477 | # "n_out": nhid_l3, 478 | # "W": None, 479 | # "b": None, 480 | # "activation": NonLinearity.SIGMOID, 481 | # "hint": l_v[3] 482 | # } 483 | # 484 | #layer4 = { 485 | # "rng": rng, 486 | # "n_in": nhid_l3, 487 | # "n_out": nhid_l4, 488 | # "W": None, 489 | # "b": None, 490 | # "activation": NonLinearity.SIGMOID, 491 | # "hint": l_v[4] 492 | # } 493 | # 494 | #layer5 = { 495 | # "rng": rng, 496 | # "n_in": nhid_l4, 497 | # "n_out": nhid_l5, 498 | # "W": None, 499 | # "b": None, 500 | # "activation": NonLinearity.SIGMOID, 501 | # "hint": l_v[5] 502 | # } 503 | # 504 | #layer6 = { 505 | # "rng": rng, 506 | # "n_in": nhid_l5, 507 | # "n_out": nhid_l6, 508 | # "W": None, 509 | # "b": None, 510 | # "activation": NonLinearity.SIGMOID, 511 | # "hint": l_v[6] 512 | # } 513 | # 514 | #layer7 = { 515 | # "rng": rng, 516 | # "n_in": nhid_l6, 517 | # "n_out": nhid_l7, 518 | # "W": None, 519 | # "b": None, 520 | # "activation": NonLinearity.SIGMOID, 521 | # "hint": l_v[7] 522 | # } 523 | 524 | output_layer = { 525 | "rng": rng, 526 | "n_in": nhid_l2, 527 | "n_out": nbr_classes, 528 | "W": None, 529 | "b": None, 530 | "activation": NonLinearity.SOFTMAX, 531 | "hint": hint_type, 532 | "use_hint": l_v[3], 533 | "use_sparsity": False, 534 | "use_sparsity_in_pred": False 535 | } 536 | layers = [layer0, layer1, layer2, output_layer] 537 | l1, l2 = 0., 0. 
538 | reg_bias = True 539 | margin = sharedX_value(1., name="margin") 540 | similair = theano.shared(np.array([0, 1], dtype=theano.config.floatX), 541 | name="sim") 542 | model = ModelMLP(layers, input, input1, input2, 543 | trainx_sh, trainlabels_sh, trainy_sh, 544 | validx_sh, validlabels_sh, margin, similair, 545 | l1_reg=l1, l2_reg=l2, 546 | reg_bias=reg_bias) 547 | 548 | size_model = str(trainx.shape[1]) +\ 549 | '_'.join([str(l["n_in"]) for l in layers]) + "_" + str(nbr_classes) 550 | path_model_init_params = "init_params/" + size_model + ".pkl" 551 | if not os.path.isfile(path_model_init_params): 552 | model.save_params(path_model_init_params, catched=False) 553 | else: 554 | model.set_params_vals(path_model_init_params) 555 | 556 | train_batch_size = 100 557 | valid_batch_size = 1000 558 | 559 | max_epochs = config_exp["max_epochs"] 560 | lr_vl = 1e-7 561 | lr = sharedX_value(lr_vl, name="lr") 562 | h_w = sharedX_value(1., name="hw") 563 | s_w = sharedX_value(1., name="sw") 564 | lambda_sparsity = sharedX_value(0., name="l_sparsity") 565 | 566 | # Compile functions: train/valid 567 | updater = AdaDelta(decay=0.95) 568 | 569 | # updater = Momentum(0.9, nesterov_momentum=False, imagenet=False, 570 | # imagenetDecay=5e-4, max_colm_norm=False) 571 | 572 | hint = config_exp["hint"] 573 | # "hint", "noHint" 574 | if hint: 575 | tag = "hint" 576 | else: 577 | tag = "noHint" 578 | 579 | norm_gsup = config_exp["norm_gsup"] 580 | norm_gh = config_exp["norm_gh"] 581 | fns = theano_fns(model, learning_rate=lr, 582 | h_w=h_w, s_w=s_w, lambda_sparsity=lambda_sparsity, 583 | updater=updater, tag=tag, 584 | max_colm_norm=False, max_norm=15.0, 585 | norm_gsup=norm_gsup, norm_gh=norm_gh) 586 | 587 | eval_fn, eval_fn_tr = fns["eval_fn"], fns["eval_fn_tr"] 588 | # Things to track during training: epoch and minibatch 589 | train_stats = {"tr_error_ep": [], "vl_error_ep": [], "tr_cost_ep": [], 590 | "tr_error_mn": [], "vl_error_mn": [], "tr_cost_mn": [], 591 | "current_nb_mb": 0, "best_epoch": 0, "best_mn": 0} 592 | 593 | names = [] 594 | for l, i in zip(layers, range(len(layers))): 595 | if l["hint"] is not None: 596 | names.append(i) 597 | debug = {"grad_sup": [], "grad_hint": [], "penalty": [], "names": names} 598 | # Eval before start training 599 | l_vl = chunks(range(validx.shape[0]), valid_batch_size) 600 | l_tr = chunks(range(trainx.shape[0]), valid_batch_size) 601 | vl_err_start = np.mean( 602 | [eval_fn(np.array(l_vl[kk])) for kk in range(len(l_vl))]) 603 | tr_err_start = np.mean( 604 | [eval_fn_tr(np.array(l_tr[kk])) for kk in range(len(l_tr))]) 605 | print vl_err_start, tr_err_start 606 | 607 | # Exp stamp 608 | time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M') 609 | tag_text = "_".join([str(l["hint"]) for l in layers]) 610 | h_exp = "_".join([str(e) for e in h_ind]) 611 | fold_exp = "exps/" + tag + "_" + h_exp + "_" + size_model + "_" + time_exp 612 | if not os.path.exists(fold_exp): 613 | os.makedirs(fold_exp) 614 | 615 | shutil.copy(inspect.stack()[0][1], fold_exp) 616 | shutil.copy(config_path+sys.argv[1], fold_exp) 617 | 618 | # Start training 619 | stop, i = False, 0 620 | div = any([l["hint"] is "contrastive" for l in layers]) 621 | shuffle_period = 1 # epochs 622 | do_shuffle = True 623 | extreme_random = config_exp["extreme_random"] 624 | if extreme_random: 625 | print "Extreme randomness." 626 | else: 627 | print "Same shuffle." 
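# kk indexes the pre-shuffled copies of the training set stored under
# data/<nbr_sup>/<kk>.pkl; it is only used to pick a file when extreme_random
# is False, and it wraps around after 240 files (see the shuffle block inside
# the training loop below).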
628 | kk = 1 629 | 630 | # TEST BEFORE START TRAINING 631 | testx_sh = theano.shared(testx.astype(theano.config.floatX), 632 | name="testx", borrow=True) 633 | testlabels_sh = theano.shared(testy.astype(theano.config.floatX), 634 | name="testlabels", borrow=True) 635 | 636 | i_x_vl = T.lvector("ixtst") 637 | y_vl = T.vector("y") 638 | error = T.mean(T.neq(T.argmax(model.output, axis=1), y_vl)) 639 | 640 | output_fn_test = [error, model.output] 641 | 642 | eval_fn_tst = theano.function( 643 | [i_x_vl], output_fn_test, 644 | givens={model.x: testx_sh[i_x_vl], 645 | y_vl: testlabels_sh[i_x_vl]}) 646 | l_tst = chunks(range(testx.shape[0]), valid_batch_size) 647 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 648 | print test_error_l[0][0] 649 | 650 | test_error = np.mean([l[0] for l in test_error_l]) 651 | print "Test error:", test_error 652 | prediction = None 653 | for l in test_error_l: 654 | if prediction is None: 655 | prediction = l[1] 656 | else: 657 | prediction = np.vstack((prediction, l[1])) 658 | 659 | 660 | with open(fold_exp+"/pred_before.pkl", "w") as fp: 661 | pkl.dump({"y": testy, "pred": prediction}, fp) 662 | 663 | 664 | # fig_scatter = plot_classes(y=testy_int, cord=prediction, names=cs, 665 | # test_error=test_error, message="BEFORE train") 666 | # fig_scatter.savefig(fold_exp+"/pred_before.png", bbox_inches='tight') 667 | 668 | 669 | while i < max_epochs: 670 | if i >= start_corrupting: 671 | warnings.warn( 672 | "SETTING THE CORRUPTION LEVEL TO:" + str(corrupt_input_l)) 673 | model.layers[0].corrupt_input_l.set_value( 674 | np.cast[theano.config.floatX](corrupt_input_l)) 675 | else: 676 | warnings.warn("SETTING THE CORRUPTION LEVEL TO: 0") 677 | model.layers[0].corrupt_input_l.set_value( 678 | np.cast[theano.config.floatX](0.)) 679 | stop = (i == max_epochs - 1) 680 | tx = DT.datetime.now() 681 | stats = train_one_epoch( 682 | model, fns, i, fold_exp, train_stats, vl_err_start, tag, 683 | train_batch_size, l_vl, l_tr, div, stop=stop, 684 | debug=debug, debug_code=debug_code) 685 | txx = DT.datetime.now() 686 | print "CORRUPTION LEVEL VALUE: " +\ 687 | str(model.layers[0].corrupt_input_l.get_value()) 688 | print "One epoch", DT.datetime.now() - tx 689 | train_stats = collect_stats_epoch(stats, train_stats) 690 | if (i % 100 == 0 or stop) and debug_code: 691 | plot_debug_grad(debug, tag_text, fold_exp, "sup") 692 | plot_penalty_vl(debug, tag_text, fold_exp) 693 | if tag == "hint": 694 | plot_debug_grad(debug, tag_text, fold_exp, "hint") 695 | plot_debug_ratio_grad(debug, fold_exp, "h/s") 696 | plot_debug_ratio_grad(debug, fold_exp, "s/h") 697 | 698 | if stop: 699 | plot_stats(train_stats, "ep", fold_exp, tag) 700 | with open(fold_exp + "/train_stats.pkl", 'w') as f_ts: 701 | pkl.dump(train_stats, f_ts) 702 | with open(fold_exp + "/train_debug.pkl", 'w') as f_ts: 703 | pkl.dump(debug, f_ts) 704 | i += 1 705 | # shuffle the data 706 | 707 | print "Going to shuffle the train data." 
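    # The reshuffled arrays are pushed back with set_value(), so the
    # GPU-resident shared variables (trainx_sh / trainlabels_sh / trainy_sh)
    # are refreshed in place without recompiling the Theano functions.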
708 | 709 | if do_shuffle and i % shuffle_period == 0 and not stop: 710 | if extreme_random: 711 | trainx_tmp = model.trainx_sh.get_value() 712 | trainy_tmp = model.trainlabels_sh.get_value() 713 | big_mtx = np.hstack( 714 | (trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 715 | for k in xrange(5): 716 | np.random.shuffle(big_mtx) 717 | trainx_tmp = big_mtx[:, 0:trainx_tmp.shape[1]] 718 | trainy_tmp = big_mtx[:, -1] 719 | else: 720 | with open("data/"+str(nbr_sup) + "/" + str(kk) + ".pkl") as f: 721 | stuff = pkl.load(f) 722 | trainx_tmp, trainy_tmp = stuff["x"], stuff["y"] 723 | model.trainlabels_sh.set_value(trainy_tmp.astype(theano.config.floatX)) 724 | model.trainy_sh.set_value( 725 | to_categorical( 726 | trainy_tmp, nbr_classes).astype(theano.config.floatX)) 727 | model.trainx_sh.set_value(trainx_tmp.astype(theano.config.floatX)) 728 | kk += 1 729 | if kk > 240: 730 | kk = 0 731 | print "Finished loading shuffled data. Updated the train set on GPU." 732 | del stats 733 | print "This part took:", DT.datetime.now() - txx 734 | # new_v = min([1., h_w.get_value() + 0.01]) 735 | # h_w.set_value(np.cast[theano.config.floatX](new_v)) 736 | # Update the importance of the hint 737 | # if i >= 1: 738 | # # new_v = min([1., h_w.get_value() + 0.1]) 739 | # h_w.set_value(np.cast[theano.config.floatX](1.)) 740 | 741 | 742 | # Perform the test 743 | # Set the model's param to the best catched ones 744 | model.set_model_to_catched_params() 745 | # share test data 746 | 747 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 748 | 749 | test_error = np.mean([l[0] for l in test_error_l]) 750 | print "Test error:", test_error 751 | 752 | prediction = None 753 | for l in test_error_l: 754 | if prediction is None: 755 | prediction = l[1] 756 | else: 757 | prediction = np.vstack((prediction, l[1])) 758 | 759 | 760 | with open(fold_exp+"/pred_after.pkl", "w") as fp: 761 | pkl.dump({"y": testy, "pred": prediction}, fp) 762 | 763 | ############################################################################## 764 | # GET INTERMEDIATE VALUE AND PLOT THEM. POSSIBLE ONLY WHEN THE INTERMEDIATE 765 | # VALUES ARE 2D. 766 | # inter_vl = get_inter_output(model, l_tst, testx_sh) 767 | # plot the intermediate values 768 | # ll = 0 769 | # for vi in inter_vl: 770 | # fig = plot_classes( 771 | # testy_int, vi, "", test_error, 772 | # "pred. 2D: mnist 1/7. layer" + str(ll)) 773 | # fig.savefig( 774 | # fold_exp + "/predinterlayer" + str(ll) + ".png", bbox_inches='tight') 775 | # ll += 1 776 | 777 | ############################################################################### 778 | 779 | # fig_scatter = plot_classes(y=testy_int, cord=prediction, names=cs, 780 | # test_error=test_error, message="AFTER train") 781 | # fig_scatter.savefig(fold_exp+"/pred_after.png", bbox_inches='tight') 782 | # save min valid 783 | vl_pathfile = "exps/" + "run_" + str(run) + "_sup_" + str(nbr_sup) + "_" +\ 784 | h_exp + "_c_l_" + str(corrupt_input_l) + "_start_at_" +\ 785 | str(start_corrupting) + "_debug_" + str(debug_code) +\ 786 | "_use_sparse_" + str(use_sparsity) + "_use_spar_pred_" +\ 787 | str(use_sparsity_in_pred) + "_" + "norm_" + str(norm_gsup) + "_" +\ 788 | str(norm_gh) + "_" + time_exp + ".txt" 789 | with open(vl_pathfile, 'w') as f: 790 | f.write("Exp. folder: " + fold_exp + "\n") 791 | f.write( 792 | "valid error:" + str( 793 | np.min(train_stats["vl_error_mn"]) * 100.) + " % \n") 794 | f.write("Test error:" + str(test_error * 100.) 
+ " % \n") 795 | shutil.copy(vl_pathfile, fold_exp) 796 | -------------------------------------------------------------------------------- /train3_new_dup.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import numpy as np 3 | import theano.tensor as T 4 | import os 5 | import sys 6 | import datetime as DT 7 | import shutil 8 | import inspect 9 | import theano 10 | import warnings 11 | import yaml 12 | 13 | from tools import ModelMLP 14 | from tools import NonLinearity 15 | from tools import split_data_to_minibatchs_eval 16 | from tools import sharedX_value 17 | from tools import theano_fns 18 | from tools import theano_fns_double_up 19 | from learning_rule import AdaDelta 20 | from learning_rule import RMSProp 21 | from learning_rule import Momentum 22 | from tools import evaluate_model 23 | from tools import collect_stats_epoch 24 | from tools import plot_stats 25 | from tools import train_one_epoch 26 | from tools import train_one_epoch_alter 27 | from tools import to_categorical 28 | from tools import plot_classes 29 | from tools import chunks 30 | from tools import plot_penalty_vl 31 | from tools import plot_debug_grad 32 | from tools import plot_debug_ratio_grad 33 | from sklearn import manifold 34 | from tools import plot_representations 35 | 36 | 37 | # Parse the yaml config. 38 | config_path = "./config_yaml/" 39 | with open(config_path + sys.argv[1], 'r') as fy: 40 | config_exp = yaml.load(fy) 41 | 42 | x_classes = 10 43 | 44 | debug_code = config_exp["debug_code"] 45 | 46 | if debug_code: 47 | warnings.warn("YOU ARE IN DEBUG MODE! YOUR CODE WILL TAKE MORE TIME!!!!!") 48 | 49 | 50 | def standerize(d, mu=None, sigma=None): 51 | if mu is None: 52 | mu = np.mean(d, axis=0) 53 | sigma = np.std(d, axis=0) 54 | if sigma.nonzero()[0].shape[0] == 0: 55 | raise Exception("std found to be zero!!!!") 56 | norm_d = (d - mu) / sigma 57 | 58 | return norm_d, mu, sigma 59 | 60 | 61 | def get_inter_output(model, l_tst, testx_sh): 62 | i_x_vl = T.lvector("ixtst") 63 | 64 | eval_fn_tst = theano.function( 65 | [i_x_vl], 66 | [l.output for l in model.layers], 67 | givens={model.x: testx_sh[i_x_vl]}) 68 | output_v = [ 69 | eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 70 | nbr_layers = len(output_v[0]) 71 | 72 | l_val = [] 73 | for l in range(nbr_layers): 74 | tmp = None 75 | for k in output_v: 76 | if tmp is None: 77 | tmp = k[l] 78 | else: 79 | tmp = np.vstack((tmp, k[l])) 80 | l_val.append(tmp) 81 | 82 | return l_val 83 | 84 | 85 | # create_tr_vl_ts_cb("data/2d") 86 | 87 | # def knn1(model, l_tst, testx_sh, l_tr, trainx_sh): 88 | 89 | # create_tr_vl_ts_nc("data/nestedcircle", 50000) 90 | 91 | # DATA MNIST 92 | # ============================================================================= 93 | path_data = "data/mnist.pkl" 94 | f = open(path_data, 'r') 95 | train, valid, test = pkl.load(f) 96 | trainx, trainy = train[0], train[1] 97 | validx, validy = valid[0], valid[1] 98 | testx, testy = test[0], test[1] 99 | 100 | # How much to take for training? 
101 | nbr_sup = config_exp["nbr_sup"] 102 | run = config_exp["run"] 103 | print "RUN:", run 104 | print "SUP: ", nbr_sup 105 | trainx, trainy = trainx[:nbr_sup], trainy[:nbr_sup] 106 | # Prepare the pre-shuffling 107 | if not os.path.exists("data/" + str(nbr_sup)): 108 | os.makedirs("data/" + str(nbr_sup)) 109 | trainx_tmp = trainx 110 | trainy_tmp = trainy 111 | 112 | print trainy.shape 113 | big_mtx = np.hstack((trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 114 | print "Going to shuffle the train data. It takes some time ..." 115 | period = 200 116 | i = 0 117 | #for k in xrange(5000): 118 | # np.random.shuffle(big_mtx) 119 | # if k % period == 0: 120 | # trainx_tmp2 = big_mtx[:, 0:trainx_tmp.shape[1]] 121 | # trainy_tmp2 = big_mtx[:, -1] 122 | # stuff = {"x": trainx_tmp2, "y": trainy_tmp2} 123 | # print k 124 | # with open("data/"+str(nbr_sup) + "/" + str(i) + ".pkl", 'w') as f: 125 | # pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 126 | # i += 1 127 | 128 | #with open("data/"+str(nbr_sup) + "/0.pkl") as f: 129 | # stuff = pkl.load(f) 130 | # trainx, trainy = stuff["x"], stuff["y"] 131 | # share over gpu: we can store the whole mnist over the gpu. 132 | # Train 133 | trainx_sh = theano.shared(trainx.astype(theano.config.floatX), 134 | name="trainx", borrow=True) 135 | trainlabels_sh = theano.shared(trainy.astype(theano.config.floatX), 136 | name="trainlabels", borrow=True) 137 | trainy_sh = theano.shared(to_categorical(trainy, x_classes).astype( 138 | theano.config.floatX), name="trainy", borrow=True) 139 | 140 | # valid 141 | validx_sh = theano.shared(validx.astype(theano.config.floatX), 142 | name="validx", borrow=True) 143 | validlabels_sh = theano.shared(validy.astype(theano.config.floatX), 144 | name="validlabels", borrow=True) 145 | # 146 | input = T.fmatrix("x") 147 | input1 = T.fmatrix("x1") 148 | input2 = T.fmatrix("x2") 149 | rng = np.random.RandomState(23455) 150 | # Architecture 151 | nhid_l0 = 1200 152 | nhid_l1 = 1200 153 | nhid_l2 = 200 154 | 155 | nbr_classes = x_classes 156 | use_batch_normalization = config_exp["use_batch_normalization"] 157 | h_ind = config_exp["h_ind"] 158 | h_ind = [int(tt) for tt in h_ind] 159 | 160 | assert len(h_ind) == 4 161 | h0, h1, h2, h3, h4, h5, h6, h7, h8 = None, None, None, None, None, None, None,\ 162 | None, None 163 | l_v = [] 164 | for xx in h_ind: 165 | print xx 166 | if int(xx) == 1: 167 | l_v.append(True) 168 | elif int(xx) == 0: 169 | l_v.append(False) 170 | else: 171 | raise ValueError("Error in applying hint: 0/1") 172 | 173 | hint_type = "l2sum" # "l1mean" 174 | print l_v 175 | corrupt_input_l = config_exp["corrupt_input_l"] 176 | if corrupt_input_l != 0.: 177 | warnings.warn( 178 | "YOU ASKED TO USE DENOISING PROCESS OVER THE INPUTS OF THE FIRST LAYER" 179 | ) 180 | if not config_exp["hint"]: 181 | raise ValueError( 182 | "You asked for densoing process but you are not using the penalty") 183 | start_corrupting = config_exp["start_corrupting"] 184 | warnings.warn( 185 | "CORRUPTION WILL START AFTER:" + str(start_corrupting) + " epochs!!!!!!") 186 | use_sparsity = config_exp["use_sparsity"] 187 | use_sparsity_in_pred = config_exp["use_sparsity_in_pred"] 188 | print "Use sparsity: ", use_sparsity 189 | print "Use sparsity in pred:", use_sparsity_in_pred 190 | use_unsupervised = config_exp["use_unsupervised"] 191 | 192 | layer0 = { 193 | "rng": rng, 194 | "n_in": trainx.shape[1], 195 | "n_out": nhid_l0, 196 | "W": None, 197 | "b": None, 198 | "activation": NonLinearity.SIGMOID, 199 | "hint": hint_type, 200 | 
"use_hint": l_v[0], 201 | "intended_to_be_corrupted": True, 202 | "corrupt_input_l": corrupt_input_l, 203 | "use_sparsity": use_sparsity, 204 | "use_sparsity_in_pred": use_sparsity_in_pred, 205 | "use_unsupervised": use_unsupervised, 206 | "use_batch_normalization": use_batch_normalization[0] 207 | } 208 | 209 | layer1 = { 210 | "rng": rng, 211 | "n_in": nhid_l0, 212 | "n_out": nhid_l1, 213 | "W": None, 214 | "b": None, 215 | "activation": NonLinearity.SIGMOID, 216 | "hint": hint_type, 217 | "use_hint": l_v[1], 218 | "use_sparsity": use_sparsity, 219 | "use_sparsity_in_pred": use_sparsity_in_pred, 220 | "use_unsupervised": use_unsupervised, 221 | "use_batch_normalization": use_batch_normalization[1] 222 | } 223 | 224 | layer2 = { 225 | "rng": rng, 226 | "n_in": nhid_l1, 227 | "n_out": nhid_l2, 228 | "W": None, 229 | "b": None, 230 | "activation": NonLinearity.SIGMOID, 231 | "hint": hint_type, 232 | "use_hint": l_v[2], 233 | "use_sparsity": use_sparsity, 234 | "use_sparsity_in_pred": use_sparsity_in_pred, 235 | "use_unsupervised": use_unsupervised, 236 | "use_batch_normalization": use_batch_normalization[2] 237 | } 238 | 239 | #layer3 = { 240 | # "rng": rng, 241 | # "n_in": nhid_l2, 242 | # "n_out": nhid_l3, 243 | # "W": None, 244 | # "b": None, 245 | # "activation": NonLinearity.SIGMOID, 246 | # "hint": l_v[3] 247 | # } 248 | # 249 | #layer4 = { 250 | # "rng": rng, 251 | # "n_in": nhid_l3, 252 | # "n_out": nhid_l4, 253 | # "W": None, 254 | # "b": None, 255 | # "activation": NonLinearity.SIGMOID, 256 | # "hint": l_v[4] 257 | # } 258 | # 259 | #layer5 = { 260 | # "rng": rng, 261 | # "n_in": nhid_l4, 262 | # "n_out": nhid_l5, 263 | # "W": None, 264 | # "b": None, 265 | # "activation": NonLinearity.SIGMOID, 266 | # "hint": l_v[5] 267 | # } 268 | # 269 | #layer6 = { 270 | # "rng": rng, 271 | # "n_in": nhid_l5, 272 | # "n_out": nhid_l6, 273 | # "W": None, 274 | # "b": None, 275 | # "activation": NonLinearity.SIGMOID, 276 | # "hint": l_v[6] 277 | # } 278 | # 279 | #layer7 = { 280 | # "rng": rng, 281 | # "n_in": nhid_l6, 282 | # "n_out": nhid_l7, 283 | # "W": None, 284 | # "b": None, 285 | # "activation": NonLinearity.SIGMOID, 286 | # "hint": l_v[7] 287 | # } 288 | 289 | output_layer = { 290 | "rng": rng, 291 | "n_in": nhid_l2, 292 | "n_out": nbr_classes, 293 | "W": None, 294 | "b": None, 295 | "activation": NonLinearity.SOFTMAX, 296 | "hint": hint_type, 297 | "use_hint": l_v[3], 298 | "use_sparsity": False, 299 | "use_sparsity_in_pred": False, 300 | "use_unsupervised": use_unsupervised, 301 | "use_batch_normalization": use_batch_normalization[3] 302 | } 303 | layers = [layer0, layer1, layer2, output_layer] 304 | l1, l2 = 0., 0. 
305 | reg_bias = True 306 | margin = sharedX_value(1., name="margin") 307 | similair = theano.shared(np.array([0, 1], dtype=theano.config.floatX), 308 | name="sim") 309 | model = ModelMLP(layers, input, input1, input2, 310 | trainx_sh, trainlabels_sh, trainy_sh, 311 | validx_sh, validlabels_sh, margin, similair, 312 | l1_reg=l1, l2_reg=l2, 313 | reg_bias=reg_bias) 314 | 315 | size_model = str(trainx.shape[1]) +\ 316 | '_'.join([str(l["n_in"]) for l in layers]) + "_" + str(nbr_classes) 317 | path_model_init_params = "init_params/" + size_model + '_' +\ 318 | str(config_exp["repet"]) + ".pkl" 319 | if not os.path.isfile(path_model_init_params): 320 | model.save_params(path_model_init_params, catched=False) 321 | else: 322 | model.set_params_vals(path_model_init_params) 323 | 324 | train_batch_size = 100 325 | valid_batch_size = 1000 326 | 327 | max_epochs = config_exp["max_epochs"] 328 | lr_vl = 1e-7 329 | lr = sharedX_value(lr_vl, name="lr") 330 | h_w = sharedX_value(config_exp["h_w"], name="hw") 331 | s_w = sharedX_value(1., name="sw") 332 | unsup_w = sharedX_value(1., name="unsw") 333 | lambda_sparsity = sharedX_value(0., name="l_sparsity") 334 | 335 | # Compile functions: train/valid 336 | updater_sup = AdaDelta(decay=0.95) 337 | updater_hint = AdaDelta(decay=0.95) 338 | updater_unsup = AdaDelta(decay=0.95) 339 | updater = {"sup": updater_sup, 'hint': updater_hint, "unsup": updater_unsup} 340 | 341 | # updater = Momentum(0.9, nesterov_momentum=False, imagenet=False, 342 | # imagenetDecay=5e-4, max_colm_norm=False) 343 | 344 | hint = config_exp["hint"] 345 | # "hint", "noHint" 346 | if hint: 347 | tag = "hint" 348 | else: 349 | tag = "noHint" 350 | 351 | norm_gsup = config_exp["norm_gsup"] 352 | norm_gh = config_exp["norm_gh"] 353 | fns = theano_fns_double_up( 354 | model, learning_rate=lr, 355 | h_w=h_w, s_w=s_w, unsup_w=unsup_w, lambda_sparsity=lambda_sparsity, 356 | updater=updater, tag=tag, 357 | max_colm_norm=False, max_norm=15.0, 358 | norm_gsup=norm_gsup, norm_gh=norm_gh) 359 | 360 | eval_fn, eval_fn_tr = fns["eval_fn"], fns["eval_fn_tr"] 361 | # Things to track during training: epoch and minibatch 362 | train_stats = {"tr_error_ep": [], "vl_error_ep": [], "tr_cost_ep": [], 363 | "tr_error_mn": [], "vl_error_mn": [], "tr_cost_mn": [], 364 | "current_nb_mb": 0, "best_epoch": 0, "best_mn": 0} 365 | 366 | names = [] 367 | for l, i in zip(layers, range(len(layers))): 368 | if l["hint"] is not None: 369 | names.append(i) 370 | debug = {"grad_sup": [], "grad_hint": [], "penalty": [], "names": names} 371 | # Eval before start training 372 | l_vl = chunks(range(validx.shape[0]), valid_batch_size) 373 | l_tr = chunks(range(trainx.shape[0]), valid_batch_size) 374 | vl_err_start = np.mean( 375 | [eval_fn(np.array(l_vl[kk])) for kk in range(len(l_vl))]) 376 | tr_err_start = np.mean( 377 | [eval_fn_tr(np.array(l_tr[kk])) for kk in range(len(l_tr))]) 378 | print vl_err_start, tr_err_start 379 | 380 | # Exp stamp 381 | time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M_%s') 382 | tag_text = "_".join([str(l["hint"]) for l in layers]) 383 | h_exp = "_".join([str(e) for e in h_ind]) 384 | fold_exp = "exps/" + tag + "_" + str(nbr_sup) + "_" + h_exp + "_" +\ 385 | size_model + "_" + time_exp 386 | if not os.path.exists(fold_exp): 387 | os.makedirs(fold_exp) 388 | 389 | shutil.copy(inspect.stack()[0][1], fold_exp) 390 | shutil.copy(config_path+sys.argv[1], fold_exp) 391 | 392 | # Start training 393 | stop, i = False, 0 394 | div = any([l["hint"] is "contrastive" for l in layers]) 395 | 
shuffle_period = 1 # epochs 396 | do_shuffle = True 397 | extreme_random = config_exp["extreme_random"] 398 | if extreme_random: 399 | print "Extreme randomness." 400 | else: 401 | print "Same shuffle." 402 | kk = 1 403 | 404 | # TEST BEFORE START TRAINING 405 | testx_sh = theano.shared(testx.astype(theano.config.floatX), 406 | name="testx", borrow=True) 407 | testlabels_sh = theano.shared(testy.astype(theano.config.floatX), 408 | name="testlabels", borrow=True) 409 | 410 | i_x_vl = T.lvector("ixtst") 411 | y_vl = T.vector("y") 412 | error = T.mean(T.neq(T.argmax(model.output, axis=1), y_vl)) 413 | 414 | output_fn_test = [error, model.output, model.layers[-2].output] 415 | 416 | eval_fn_tst = theano.function( 417 | [i_x_vl], output_fn_test, 418 | givens={model.x: testx_sh[i_x_vl], 419 | y_vl: testlabels_sh[i_x_vl]}) 420 | l_tst = chunks(range(testx.shape[0]), valid_batch_size) 421 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 422 | print test_error_l[0][0] 423 | 424 | test_error = np.mean([l[0] for l in test_error_l]) 425 | print "Test error:", test_error 426 | prediction = None 427 | for l in test_error_l: 428 | if prediction is None: 429 | prediction = l[1] 430 | else: 431 | prediction = np.vstack((prediction, l[1])) 432 | 433 | 434 | with open(fold_exp+"/pred_before.pkl", "w") as fp: 435 | pkl.dump({"y": testy, "pred": prediction}, fp) 436 | 437 | best_vl_error = np.finfo(np.float).max 438 | start_hint_epoch = config_exp["start_hint"] 439 | 440 | while i < max_epochs: 441 | if i >= start_corrupting: 442 | warnings.warn( 443 | "SETTING THE CORRUPTION LEVEL TO:" + str(corrupt_input_l)) 444 | model.layers[0].corrupt_input_l.set_value( 445 | np.cast[theano.config.floatX](corrupt_input_l)) 446 | else: 447 | warnings.warn("SETTING THE CORRUPTION LEVEL TO: 0") 448 | model.layers[0].corrupt_input_l.set_value( 449 | np.cast[theano.config.floatX](0.)) 450 | stop = (i == max_epochs - 1) 451 | tx = DT.datetime.now() 452 | stats = train_one_epoch_alter( 453 | model, fns, i, fold_exp, train_stats, vl_err_start, tag, 454 | train_batch_size, l_vl, l_tr, div, stop=stop, 455 | debug=debug, debug_code=debug_code, h_w=h_w) 456 | txx = DT.datetime.now() 457 | print "CORRUPTION LEVEL VALUE: " +\ 458 | str(model.layers[0].corrupt_input_l.get_value()) 459 | print "One epoch", DT.datetime.now() - tx 460 | train_stats = collect_stats_epoch(stats, train_stats) 461 | if (i % 100 == 0 or stop) and debug_code: 462 | plot_debug_grad(debug, tag_text, fold_exp, "sup") 463 | plot_penalty_vl(debug, tag_text, fold_exp) 464 | if tag == "hint": 465 | plot_debug_grad(debug, tag_text, fold_exp, "hint") 466 | plot_debug_ratio_grad(debug, fold_exp, "h/s") 467 | plot_debug_ratio_grad(debug, fold_exp, "s/h") 468 | 469 | if stop: 470 | plot_stats(train_stats, "ep", fold_exp, tag) 471 | with open(fold_exp + "/train_stats.pkl", 'w') as f_ts: 472 | pkl.dump(train_stats, f_ts) 473 | with open(fold_exp + "/train_debug.pkl", 'w') as f_ts: 474 | pkl.dump(debug, f_ts) 475 | i += 1 476 | # shuffle the data 477 | 478 | print "Going to shuffle the train data." 
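    # Besides the per-epoch reshuffle below, the hint weight h_w (initialised
    # from the yaml `h_w` entry) is forced to 1.0 once the epoch index exceeds
    # `start_hint`, so the contribution of the class-wise penalty can be
    # scheduled (e.g. kept at 0 until epoch `start_hint`).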
479 | 480 | if do_shuffle and i % shuffle_period == 0 and not stop: 481 | if extreme_random: 482 | trainx_tmp = model.trainx_sh.get_value() 483 | trainy_tmp = model.trainlabels_sh.get_value() 484 | big_mtx = np.hstack( 485 | (trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 486 | for k in xrange(5): 487 | np.random.shuffle(big_mtx) 488 | trainx_tmp = big_mtx[:, 0:trainx_tmp.shape[1]] 489 | trainy_tmp = big_mtx[:, -1] 490 | else: 491 | with open("data/"+str(nbr_sup) + "/" + str(kk) + ".pkl") as f: 492 | stuff = pkl.load(f) 493 | trainx_tmp, trainy_tmp = stuff["x"], stuff["y"] 494 | model.trainlabels_sh.set_value(trainy_tmp.astype(theano.config.floatX)) 495 | model.trainy_sh.set_value( 496 | to_categorical( 497 | trainy_tmp, nbr_classes).astype(theano.config.floatX)) 498 | model.trainx_sh.set_value(trainx_tmp.astype(theano.config.floatX)) 499 | kk += 1 500 | if kk > 240: 501 | kk = 0 502 | print "Finished loading shuffled data. Updated the train set on GPU." 503 | del stats 504 | print "This part took:", DT.datetime.now() - txx 505 | 506 | print "MIN VALID ", np.min(train_stats["vl_error_mn"]), " *********" 507 | # # If there was no improvement... 508 | if (i > start_hint_epoch) and hint: 509 | # new_v = min([1., h_w.get_value() + 0.01]) 510 | new_v = 1. 511 | h_w.set_value(np.cast[theano.config.floatX](new_v)) 512 | # print "NO IMPROV. PUSHING THE NET..............................." 513 | # best_vl_error = np.min(train_stats["vl_error_mn"]) 514 | # Update the importance of the hint 515 | # if i >= 1: 516 | # # new_v = min([1., h_w.get_value() + 0.1]) 517 | # h_w.set_value(np.cast[theano.config.floatX](1.)) 518 | 519 | 520 | # Perform the test 521 | # Set the model's param to the best catched ones 522 | model.set_model_to_catched_params() 523 | # share test data 524 | 525 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 526 | train_error_l = [eval_fn_tst(np.array(l_tr[kkk])) for kkk in range(len(l_tr))] 527 | 528 | test_error = np.mean([l[0] for l in test_error_l]) 529 | print "Test error:", test_error 530 | # Train 531 | 532 | # Test 533 | # last hidden layer representations. 
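# The block below is meant to collect the penultimate-layer outputs
# (model.layers[-2].output, the third element returned by eval_fn_tst) batch
# by batch and pickle them with labels and images for the t-SNE plots. Note
# that the stacking loops iterate over `k` but index `l[2]`, the variable left
# over from the earlier prediction loop.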
534 | with open(fold_exp+"/last_hidden_rep_test.pkl", "w") as fhr: 535 | stuff_hrep_tst = None 536 | for k in test_error_l: 537 | if stuff_hrep_tst is None: 538 | stuff_hrep_tst = l[2] 539 | else: 540 | stuff_hrep_tst = np.vstack((stuff_hrep_tst, l[2])) 541 | 542 | stuff_hrep_tr = None 543 | for k in train_error_l: 544 | if stuff_hrep_tr is None: 545 | stuff_hrep_tr = l[2] 546 | else: 547 | stuff_hrep_tr = np.vstack((stuff_hrep_tr, l[2])) 548 | pkl.dump( 549 | {"x_hint_repr_tst": stuff_hrep_tst, "y_tst": testy, "ximg_tst": testx, 550 | "x_hint_repr_tr": stuff_hrep_tr, "y_tr": trainy, "ximg_tr": trainx}, 551 | fhr) 552 | # plot t-SNE of the opriginal images 553 | tx0 = DT.datetime.now() 554 | tsne_original = manifold.TSNE(n_components=2, init='pca', random_state=0) 555 | X_tsne_original = tsne_original.fit_transform(testx) 556 | fig_tsne_org = plot_representations( 557 | X_tsne_original, testy, "t-SNE embedding of mnist original images.") 558 | fig_tsne_org.savefig(fold_exp+"/original_rep_test.eps", format='eps', 559 | dpi=1200, bbox_inches='tight') 560 | print "t-SNE of original images took:", DT.datetime.now() - tx0 561 | # plot t-SNE of the prediction 562 | tx0 = DT.datetime.now() 563 | tsne_lasthidden_rep = manifold.TSNE(n_components=2, init='pca', 564 | random_state=0) 565 | X_tsne_lhrep = tsne_original.fit_transform(stuff_hrep_tst) 566 | fig_tsne_lhrep = plot_representations( 567 | X_tsne_lhrep, testy, 568 | "t-SNE embedding of the last hidden representation of the MLP" + 569 | "applied over mnist.") 570 | fig_tsne_lhrep.savefig(fold_exp+"/lasth_rep_mlp_test.eps", format='eps', 571 | dpi=1200, bbox_inches='tight') 572 | print "t-SNE of hidden representation took:", DT.datetime.now() - tx0 573 | prediction = None 574 | for l in test_error_l: 575 | if prediction is None: 576 | prediction = l[1] 577 | else: 578 | prediction = np.vstack((prediction, l[1])) 579 | 580 | 581 | with open(fold_exp+"/pred_after.pkl", "w") as fp: 582 | pkl.dump({"y": testy, "pred": prediction}, fp) 583 | 584 | ############################################################################## 585 | # GET INTERMEDIATE VALUE AND PLOT THEM. POSSIBLE ONLY WHEN THE INTERMEDIATE 586 | # VALUES ARE 2D. 587 | # inter_vl = get_inter_output(model, l_tst, testx_sh) 588 | # plot the intermediate values 589 | # ll = 0 590 | # for vi in inter_vl: 591 | # fig = plot_classes( 592 | # testy_int, vi, "", test_error, 593 | # "pred. 2D: mnist 1/7. layer" + str(ll)) 594 | # fig.savefig( 595 | # fold_exp + "/predinterlayer" + str(ll) + ".png", bbox_inches='tight') 596 | # ll += 1 597 | 598 | ############################################################################### 599 | 600 | # fig_scatter = plot_classes(y=testy_int, cord=prediction, names=cs, 601 | # test_error=test_error, message="AFTER train") 602 | # fig_scatter.savefig(fold_exp+"/pred_after.png", bbox_inches='tight') 603 | # save min valid 604 | vl_pathfile = "exps/" + "run_" + str(run) + "_sup_" + str(nbr_sup) + "_" +\ 605 | h_exp + "_c_l_" + str(corrupt_input_l) + "_start_at_" +\ 606 | str(start_corrupting) + "_debug_" + str(debug_code) +\ 607 | "_use_sparse_" + str(use_sparsity) + "_use_spar_pred_" +\ 608 | str(use_sparsity_in_pred) + "_" + "norm_" + str(norm_gsup) + "_" +\ 609 | str(norm_gh) + "_" + time_exp + ".txt" 610 | with open(vl_pathfile, 'w') as f: 611 | f.write("Exp. folder: " + fold_exp + "\n") 612 | f.write( 613 | "valid error:" + str( 614 | np.min(train_stats["vl_error_mn"]) * 100.) + " % \n") 615 | f.write("Test error:" + str(test_error * 100.) 
+ " % \n") 616 | shutil.copy(vl_pathfile, fold_exp) 617 | -------------------------------------------------------------------------------- /trainLenet.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import numpy as np 3 | import theano.tensor as T 4 | import os 5 | import sys 6 | import datetime as DT 7 | import shutil 8 | import inspect 9 | import theano 10 | import warnings 11 | 12 | 13 | from tools import LeNet 14 | from tools import NonLinearity 15 | from tools import split_data_to_minibatchs_eval 16 | from tools import sharedX_value 17 | from tools import theano_fns 18 | from tools import theano_fns_double_up 19 | from learning_rule import AdaDelta 20 | from learning_rule import RMSProp 21 | from learning_rule import Momentum 22 | from tools import evaluate_model 23 | from tools import collect_stats_epoch 24 | from tools import plot_stats 25 | from tools import train_one_epoch 26 | from tools import train_one_epoch_alter 27 | from tools import to_categorical 28 | from tools import chunks 29 | from tools import plot_penalty_vl 30 | from tools import plot_debug_grad 31 | from tools import plot_debug_ratio_grad 32 | import yaml 33 | from sklearn import manifold 34 | from tools import plot_representations 35 | 36 | 37 | # Parse the yaml config. 38 | config_path = "./config_yaml/" 39 | with open(config_path + sys.argv[1], 'r') as fy: 40 | config_exp = yaml.load(fy) 41 | 42 | x_classes = 10 43 | debug_code = config_exp["debug_code"] 44 | if debug_code: 45 | warnings.warn("YOU ARE IN DEBUG MODE! YOUR CODE WILL TAKE MORE TIME!!!!!") 46 | 47 | 48 | def standerize(d, mu=None, sigma=None): 49 | if mu is None: 50 | mu = np.mean(d, axis=0) 51 | sigma = np.std(d, axis=0) 52 | if sigma.nonzero()[0].shape[0] == 0: 53 | raise Exception("std found to be zero!!!!") 54 | norm_d = (d - mu) / sigma 55 | 56 | return norm_d, mu, sigma 57 | 58 | path_data = "data/mnist.pkl" 59 | f = open(path_data, 'r') 60 | train, valid, test = pkl.load(f) 61 | trainx, trainy = train[0], train[1] 62 | validx, validy = valid[0], valid[1] 63 | testx, testy = test[0], test[1] 64 | # Rehape 3D 65 | validx = validx.reshape((validx.shape[0], 1, 28, 28)) 66 | testx = testx.reshape((testx.shape[0], 1, 28, 28)) 67 | 68 | # How much to take for training? 69 | nbr_sup = config_exp["nbr_sup"] 70 | run = config_exp["run"] 71 | print "RUN:", run 72 | print "SUP: ", nbr_sup 73 | trainx, trainy = trainx[:nbr_sup], trainy[:nbr_sup] 74 | # Prepare the pre-shuffling 75 | if not os.path.exists("data/" + str(nbr_sup)): 76 | os.makedirs("data/" + str(nbr_sup)) 77 | trainx_tmp = trainx 78 | trainy_tmp = trainy 79 | 80 | # big_mtx = np.hstack((trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 81 | print "Going to shuffle the train data. It takes some time ..." 82 | period = 200 83 | i = 0 84 | #for k in xrange(5000): 85 | # np.random.shuffle(big_mtx) 86 | # if k % period == 0: 87 | # trainx_tmp2 = big_mtx[:, 0:trainx_tmp.shape[1]] 88 | # trainy_tmp2 = big_mtx[:, -1] 89 | # stuff = {"x": trainx_tmp2, "y": trainy_tmp2} 90 | # print k 91 | # with open("data/"+str(nbr_sup) + "/" + str(i) + ".pkl", 'w') as f: 92 | # pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 93 | # i += 1 94 | 95 | #with open("data/"+str(nbr_sup) + "/0.pkl") as f: 96 | # stuff = pkl.load(f) 97 | # trainx, trainy = stuff["x"], stuff["y"] 98 | # share over gpu: we can store the whole mnist over the gpu. 
99 | # Train 100 | trainx = trainx.reshape((trainx.shape[0], 1, 28, 28)) 101 | trainx_sh = theano.shared(trainx.astype(theano.config.floatX), 102 | name="trainx", borrow=True) 103 | trainlabels_sh = theano.shared(trainy.astype(theano.config.floatX), 104 | name="trainlabels", borrow=True) 105 | trainy_sh = theano.shared(to_categorical(trainy, 10).astype( 106 | theano.config.floatX), name="trainy", borrow=True) 107 | # trainy_sh = T.cast(trainy_sh, 'int32') 108 | 109 | # valid 110 | validx_sh = theano.shared(validx.astype(theano.config.floatX), 111 | name="validx", borrow=True) 112 | validlabels_sh = theano.shared(validy.astype(theano.config.floatX), 113 | name="validlabels", borrow=True) 114 | # 115 | input = T.tensor4("x") 116 | input1 = T.tensor4("x1") 117 | input2 = T.tensor4("x2") 118 | rng = np.random.RandomState(23455) 119 | 120 | nbr_classes = x_classes 121 | use_batch_normalization = config_exp["use_batch_normalization"] 122 | h_ind = config_exp["h_ind"] 123 | h_ind = [int(tt) for tt in h_ind] 124 | 125 | assert len(h_ind) == 4 126 | 127 | l_v = [] 128 | for xx in h_ind: 129 | print xx 130 | if int(xx) == 1: 131 | l_v.append(True) 132 | elif int(xx) == 0: 133 | l_v.append(False) 134 | else: 135 | raise ValueError("Error in applying hint: 0/1") 136 | 137 | hint_type = "l2sum" 138 | print l_v 139 | corrupt_input_l = config_exp["corrupt_input_l"] 140 | if corrupt_input_l != 0.: 141 | warnings.warn( 142 | "YOU ASKED TO USE DENOISING PROCESS OVER THE INPUTS OF THE FIRST LAYER" 143 | ) 144 | if not config_exp["hint"]: 145 | raise ValueError( 146 | "You asked for densoing process but you are not using the penalty") 147 | start_corrupting = config_exp["start_corrupting"] 148 | warnings.warn( 149 | "CORRUPTION WILL START AFTER:" + str(start_corrupting) + " epochs!!!!!!") 150 | use_sparsity = config_exp["use_sparsity"] 151 | use_sparsity_in_pred = config_exp["use_sparsity_in_pred"] 152 | print "Use sparsity: ", use_sparsity 153 | print "Use sparsity in pred:", use_sparsity_in_pred 154 | use_unsupervised = config_exp["use_unsupervised"] 155 | layer0 = { 156 | "rng": rng, 157 | "n_in": 1, 158 | "n_out": 20, 159 | "W": None, 160 | "b": None, 161 | "activation": NonLinearity.TANH, 162 | "hint": hint_type, 163 | "use_hint": l_v[0], 164 | "intended_to_be_corrupted": True, 165 | "corrupt_input_l": corrupt_input_l, 166 | "use_sparsity": use_sparsity, 167 | "use_sparsity_in_pred": use_sparsity_in_pred, 168 | "use_unsupervised": use_unsupervised, 169 | "use_batch_normalization": use_batch_normalization[0] 170 | } 171 | 172 | layer1 = { 173 | "rng": rng, 174 | "n_in": 20, 175 | "n_out": 50, 176 | "W": None, 177 | "b": None, 178 | "activation": NonLinearity.TANH, 179 | "hint": hint_type, 180 | "use_hint": l_v[1], 181 | "use_sparsity": use_sparsity, 182 | "use_sparsity_in_pred": use_sparsity_in_pred, 183 | "use_unsupervised": use_unsupervised, 184 | "use_batch_normalization": use_batch_normalization[1] 185 | } 186 | 187 | layer2 = { 188 | "rng": rng, 189 | "n_in": 50*4*4, 190 | "n_out": 500, 191 | "W": None, 192 | "b": None, 193 | "activation": NonLinearity.TANH, 194 | "hint": hint_type, 195 | "use_hint": l_v[2], 196 | "use_sparsity": use_sparsity, 197 | "use_sparsity_in_pred": use_sparsity_in_pred, 198 | "use_unsupervised": use_unsupervised, 199 | "use_batch_normalization": use_batch_normalization[2] 200 | } 201 | 202 | 203 | output_layer = { 204 | "rng": rng, 205 | "n_in": 500, 206 | "n_out": nbr_classes, 207 | "W": None, 208 | "b": None, 209 | "activation": NonLinearity.SOFTMAX, 210 | "hint": 
hint_type, 211 | "use_hint": l_v[3], 212 | "use_sparsity": False, 213 | "use_sparsity_in_pred": False, 214 | "use_unsupervised": use_unsupervised, 215 | "use_batch_normalization": use_batch_normalization[3] 216 | } 217 | layers = [layer0, layer1, layer2, output_layer] 218 | l1, l2 = 0., 0. 219 | margin = sharedX_value(1., name="margin") 220 | similair = theano.shared(np.array([0, 1], dtype=theano.config.floatX), 221 | name="sim") 222 | train_batch_size = 100 223 | valid_batch_size = train_batch_size 224 | model = LeNet(layers, input, input1, input2, 225 | trainx_sh, trainlabels_sh, trainy_sh, 226 | validx_sh, validlabels_sh, margin, similair, 227 | l1_reg=l1, l2_reg=l2, 228 | reg_bias=False, 229 | batch_size=None) 230 | 231 | size_model = str(trainx.shape[1]) +\ 232 | '_'.join([str(l["n_in"]) for l in layers]) + "_" + str(nbr_classes) 233 | path_model_init_params = "init_params/" + size_model + '_' +\ 234 | str(config_exp["repet"]) + ".pkl" 235 | if not os.path.isfile(path_model_init_params): 236 | model.save_params(path_model_init_params, catched=False) 237 | else: 238 | model.set_params_vals(path_model_init_params) 239 | 240 | 241 | max_epochs = config_exp["max_epochs"] 242 | lr_vl = 1e-7 243 | lr = sharedX_value(lr_vl, name="lr") 244 | h_w = sharedX_value(config_exp["h_w"], name="hw") 245 | s_w = sharedX_value(1., name="sw") 246 | unsup_w = sharedX_value(1., name="unsw") 247 | lambda_sparsity = sharedX_value(1e-3, name="l_sparsity") 248 | 249 | # Compile functions: train/valid 250 | updater_sup = AdaDelta(decay=0.95) 251 | updater_hint = AdaDelta(decay=0.95) 252 | updater_unsup = AdaDelta(decay=0.95) 253 | updater = {"sup": updater_sup, 'hint': updater_hint, "unsup": updater_unsup} 254 | 255 | # updater = Momentum(0.9, nesterov_momentum=False, imagenet=False, 256 | # imagenetDecay=5e-4, max_colm_norm=False) 257 | 258 | hint = config_exp["hint"] 259 | # "hint", "noHint" 260 | if hint: 261 | tag = "hint" 262 | else: 263 | tag = "noHint" 264 | 265 | norm_gsup = config_exp["norm_gsup"] 266 | norm_gh = config_exp["norm_gh"] 267 | fns = theano_fns_double_up( 268 | model, learning_rate=lr, 269 | h_w=h_w, s_w=s_w, unsup_w=unsup_w, lambda_sparsity=lambda_sparsity, 270 | updater=updater, tag=tag, 271 | max_colm_norm=False, max_norm=15.0, 272 | norm_gsup=norm_gsup, norm_gh=norm_gh) 273 | 274 | eval_fn, eval_fn_tr = fns["eval_fn"], fns["eval_fn_tr"] 275 | # Things to track during training: epoch and minibatch 276 | train_stats = {"tr_error_ep": [], "vl_error_ep": [], "tr_cost_ep": [], 277 | "tr_error_mn": [], "vl_error_mn": [], "tr_cost_mn": [], 278 | "current_nb_mb": 0, "best_epoch": 0, "best_mn": 0} 279 | 280 | names = [] 281 | for l, i in zip(layers, range(len(layers))): 282 | if l["hint"] is not None: 283 | names.append(i) 284 | debug = {"grad_sup": [], "grad_hint": [], "penalty": [], "names": names} 285 | # Eval before start training 286 | l_vl = chunks(range(validx.shape[0]), valid_batch_size) 287 | l_tr = chunks(range(trainx.shape[0]), valid_batch_size) 288 | vl_err_start = np.mean( 289 | [eval_fn(np.array(l_vl[kk])) for kk in range(len(l_vl))]) 290 | tr_err_start = np.mean( 291 | [eval_fn_tr(np.array(l_tr[kk])) for kk in range(len(l_tr))]) 292 | print vl_err_start, tr_err_start 293 | 294 | # Exp stamp 295 | time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M_%s') 296 | tag_text = "_".join([str(l["hint"]) for l in layers]) 297 | h_exp = "_".join([str(e) for e in h_ind]) 298 | fold_exp = "exps/lenet_" + tag + "_" + str(nbr_sup) + "_" + h_exp + "_" +\ 299 | size_model + "_" + time_exp 300 | 
if not os.path.exists(fold_exp):
301 |     os.makedirs(fold_exp)
302 | 
303 | shutil.copy(inspect.stack()[0][1], fold_exp)
304 | shutil.copy(config_path+sys.argv[1], fold_exp)
305 | 
306 | # Start training
307 | stop, i = False, 0
308 | div = any([l["hint"] == "contrastive" for l in layers])
309 | shuffle_period = 1  # epochs
310 | do_shuffle = True
311 | extreme_random = config_exp["extreme_random"]
312 | if extreme_random:
313 |     print "Extreme randomness."
314 | else:
315 |     print "Same shuffle."
316 | kk = 1
317 | start_hint_epoch = config_exp["start_hint"]
318 | 
319 | while i < max_epochs:
320 |     if i >= start_corrupting:
321 |         warnings.warn(
322 |             "SETTING THE CORRUPTION LEVEL TO:" + str(corrupt_input_l))
323 |         model.layers[0].corrupt_input_l.set_value(
324 |             np.cast[theano.config.floatX](corrupt_input_l))
325 |     else:
326 |         warnings.warn("SETTING THE CORRUPTION LEVEL TO: 0")
327 |         model.layers[0].corrupt_input_l.set_value(
328 |             np.cast[theano.config.floatX](0.))
329 |     stop = (i == max_epochs - 1)
330 |     tx = DT.datetime.now()
331 |     stats = train_one_epoch_alter(
332 |         model, fns, i, fold_exp, train_stats, vl_err_start, tag,
333 |         train_batch_size, l_vl, l_tr, div, stop=stop,
334 |         debug=debug, debug_code=debug_code)
335 |     txx = DT.datetime.now()
336 |     print "CORRUPTION LEVEL VALUE: " +\
337 |         str(model.layers[0].corrupt_input_l.get_value())
338 |     print "One epoch", DT.datetime.now() - tx
339 |     train_stats = collect_stats_epoch(stats, train_stats)
340 |     if (i % 100 == 0 or stop) and debug_code:
341 |         plot_debug_grad(debug, tag_text, fold_exp, "sup")
342 |         plot_penalty_vl(debug, tag_text, fold_exp)
343 |         if tag == "hint":
344 |             plot_debug_grad(debug, tag_text, fold_exp, "hint")
345 |             plot_debug_ratio_grad(debug, fold_exp, "h/s")
346 |             plot_debug_ratio_grad(debug, fold_exp, "s/h")
347 | 
348 |     if stop:
349 |         plot_stats(train_stats, "ep", fold_exp, tag)
350 |         with open(fold_exp + "/train_stats.pkl", 'w') as f_ts:
351 |             pkl.dump(train_stats, f_ts)
352 |         with open(fold_exp + "/train_debug.pkl", 'w') as f_ts:
353 |             pkl.dump(debug, f_ts)
354 |     i += 1
355 |     # shuffle the data
356 | 
357 |     print "Going to shuffle the train data."
358 | 
359 |     if do_shuffle and i % shuffle_period == 0 and not stop:
360 |         if extreme_random:
361 |             trainx_tmp = model.trainx_sh.get_value()
362 |             trainx_tmp = trainx_tmp.reshape((trainx_tmp.shape[0], 28*28))
363 |             trainy_tmp = model.trainlabels_sh.get_value()
364 |             big_mtx = np.hstack(
365 |                 (trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1)))
366 |             for k in xrange(5):
367 |                 np.random.shuffle(big_mtx)
368 |             trainx_tmp = big_mtx[:, 0:trainx_tmp.shape[1]]
369 |             trainy_tmp = big_mtx[:, -1]
370 |         else:
371 |             with open("data/"+str(nbr_sup) + "/" + str(kk) + ".pkl") as f:
372 |                 stuff = pkl.load(f)
373 |                 trainx_tmp, trainy_tmp = stuff["x"], stuff["y"]
374 |         trainx_tmp = trainx_tmp.reshape((trainx_tmp.shape[0], 1, 28, 28))
375 |         model.trainlabels_sh.set_value(trainy_tmp.astype(theano.config.floatX))
376 |         model.trainy_sh.set_value(
377 |             to_categorical(
378 |                 trainy_tmp, nbr_classes).astype(theano.config.floatX))
379 |         # model.trainy_sh = T.cast(model.trainy_sh, 'int32')
380 |         model.trainx_sh.set_value(trainx_tmp.astype(theano.config.floatX))
381 |         kk += 1
382 |         if kk > 240:
383 |             kk = 0
384 |         print "Finished loading shuffled data. Updated the train set on GPU."
385 |     del stats
386 |     print "This part took:", DT.datetime.now() - txx
387 |     if (i > start_hint_epoch) and hint:
388 |         # new_v = min([1., h_w.get_value() + 0.1])
389 |         new_v = 1.
390 |         h_w.set_value(np.cast[theano.config.floatX](new_v))
391 |     # Update the importance of the hint
392 |     # if i >= 1:
393 |     #    # new_v = min([1., h_w.get_value() + 0.1])
394 |     #    h_w.set_value(np.cast[theano.config.floatX](1.))
395 | 
396 | 
397 | # Perform the test
398 | # Set the model's params to the best ones caught during training.
399 | model.set_model_to_catched_params()
400 | # share test data
401 | testx_sh = theano.shared(testx.astype(theano.config.floatX),
402 |                          name="testx", borrow=True)
403 | testlabels_sh = theano.shared(testy.astype(theano.config.floatX),
404 |                               name="testlabels", borrow=True)
405 | 
406 | i_x_vl = T.lvector("ixtst")
407 | y_vl = T.vector("y")
408 | error = T.mean(T.neq(T.argmax(model.output, axis=1), y_vl))
409 | 
410 | output_fn_test = [error, model.output, model.layers[-2].output]
411 | 
412 | eval_fn_tst = theano.function(
413 |     [i_x_vl], output_fn_test,
414 |     givens={model.x: testx_sh[i_x_vl],
415 |             y_vl: testlabels_sh[i_x_vl]})
416 | l_tst = chunks(range(testx.shape[0]), valid_batch_size)
417 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))]
418 | train_error_l = [eval_fn_tst(np.array(l_tr[kkk])) for kkk in range(len(l_tr))]
419 | 
420 | test_error = np.mean([l[0] for l in test_error_l])
421 | print "Test error:", test_error
422 | 
423 | # Test
424 | # last hidden layer representations.
425 | with open(fold_exp+"/last_hidden_rep_test.pkl", "w") as fhr:
426 |     stuff_hrep_tst = None
427 |     for k in test_error_l:  # each entry: (error, prediction, last-hidden rep.) of one minibatch.
428 |         if stuff_hrep_tst is None:
429 |             stuff_hrep_tst = k[2]
430 |         else:
431 |             stuff_hrep_tst = np.vstack((stuff_hrep_tst, k[2]))
432 | 
433 |     stuff_hrep_tr = None
434 |     for k in train_error_l:
435 |         if stuff_hrep_tr is None:
436 |             stuff_hrep_tr = k[2]
437 |         else:
438 |             stuff_hrep_tr = np.vstack((stuff_hrep_tr, k[2]))
439 |     pkl.dump(
440 |         {"x_hint_repr_tst": stuff_hrep_tst, "y_tst": testy,
441 |          "ximg_tst": testx.reshape((testx.shape[0], 28*28)),
442 |          "x_hint_repr_tr": stuff_hrep_tr, "y_tr": trainy,
443 |          "ximg_tr": trainx.reshape((trainx.shape[0], 28*28))},
444 |         fhr)
445 | # plot t-SNE of the original images
446 | tx0 = DT.datetime.now()
447 | tsne_original = manifold.TSNE(n_components=2, init='pca', random_state=0)
448 | X_tsne_original = tsne_original.fit_transform(
449 |     testx.reshape((testx.shape[0], 28*28)))
450 | fig_tsne_org = plot_representations(
451 |     X_tsne_original, testy, "t-SNE embedding of mnist original images.")
452 | fig_tsne_org.savefig(fold_exp+"/original_rep_test.eps", format='eps',
453 |                      dpi=1200, bbox_inches='tight')
454 | print "t-SNE of original images took:", DT.datetime.now() - tx0
455 | # plot t-SNE of the last hidden representation
456 | tx0 = DT.datetime.now()
457 | tsne_lasthidden_rep = manifold.TSNE(n_components=2, init='pca',
458 |                                     random_state=0)
459 | X_tsne_lhrep = tsne_lasthidden_rep.fit_transform(stuff_hrep_tst)
460 | fig_tsne_lhrep = plot_representations(
461 |     X_tsne_lhrep, testy,
462 |     "t-SNE embedding of the last hidden representation of the LeNet " +
463 |     "applied over mnist.")
464 | fig_tsne_lhrep.savefig(fold_exp+"/lasth_rep_mlp_test.eps", format='eps',
465 |                        dpi=1200, bbox_inches='tight')
466 | print "t-SNE of hidden representation took:", DT.datetime.now() - tx0
467 | # save min valid
468 | vl_pathfile = "exps/" + "LeNet_run_" + str(run) + "_sup_" + str(nbr_sup) +\
469 |     "_" + h_exp + "_c_l_" + str(corrupt_input_l) + "_start_at_" +\
470 |     str(start_corrupting) + "_debug_" + str(debug_code) +\
471 |     "_use_sparse_" + str(use_sparsity) + "_use_spar_pred_" +\
472 |     str(use_sparsity_in_pred) + "_" + time_exp + ".txt"
473 | with open(vl_pathfile, 'w') as f:
474 |     f.write("Exp. folder: " + fold_exp + "\n")
475 |     f.write(
476 |         "valid error:" + str(
477 |             np.min(train_stats["vl_error_mn"]) * 100.) + " % \n")
478 |     f.write("Test error:" + str(test_error * 100.) + " % \n")
479 | shutil.copy(vl_pathfile, fold_exp)
480 | 
--------------------------------------------------------------------------------
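Both training scripts above dump their main artifacts as pickle files inside the experiment folder they create under `exps/`, notably `train_stats.pkl` (training/validation statistics) and `last_hidden_rep_test.pkl` (last-hidden-layer representations). The snippet below is a minimal sketch, not part of the repository, showing how those files can be reloaded for inspection; the folder name is a hypothetical placeholder, and the keys are the ones written by the scripts above.

```python
# Minimal sketch (not part of the repository): reload artifacts dumped by the
# training scripts. Replace the placeholder with the "exps/..." folder created
# by your own run.
import cPickle as pkl
import numpy as np

fold_exp = "exps/REPLACE_WITH_YOUR_EXPERIMENT_FOLDER"  # hypothetical placeholder

# Per-minibatch/epoch statistics collected during training.
with open(fold_exp + "/train_stats.pkl") as f:
    train_stats = pkl.load(f)
print "best valid error:", np.min(train_stats["vl_error_mn"]) * 100., "%"

# Last-hidden-layer representations of the test and train sets.
with open(fold_exp + "/last_hidden_rep_test.pkl") as f:
    rep = pkl.load(f)
print "test representation shape:", rep["x_hint_repr_tst"].shape
print "train representation shape:", rep["x_hint_repr_tr"].shape
```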