├── README.md ├── __init__.py ├── ae.py ├── basic_layer.py ├── config_yaml ├── train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml └── trainLenet_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml ├── data ├── __init__.py ├── cifar.py ├── mnist.pkl.zip ├── mnist_bin17.pkl.zip └── out.pkl ├── dataset.py ├── exps └── .readme.md ├── filterit.py ├── generate_exps.py ├── generate_exps_lenet.py ├── generate_exps_search.py ├── init_params └── .readme.md ├── job0.sh ├── joblenet.sh ├── jobs ├── .readme.md ├── 110_1000_4_trainLenet_0_1000_3_0_0_0_0_4_False_False_False_False_False_110.sl └── 20_1000_0_train3_new_dup_0_1000_3_0_0_1_0_0_True_False_False_False_False_20.sl ├── k80.sl ├── layer.py ├── layers.py ├── learning_rate.py ├── learning_rule.py ├── mnist_manip.py ├── non_linearities.py ├── normalization.py ├── outputjobs └── .readme.md ├── p100.sl ├── plot_paper.py ├── submit.sh ├── tools.py ├── train3_bin.py ├── train3_new_dup.py └── trainLenet.py /README.md: -------------------------------------------------------------------------------- 1 | ### Neural Networks Regularization Through Class-wise Invariant Representation Learning. 2 | 3 | This repository contains the code of the paper `Neural Networks Regularization Through Class-wise Invariant Representation Learning. S.Belharbi, C.Chatelain, R.Hérault, S.Adam. 2017.`[ArXiv](https://arxiv.org/abs/1709.01867). 4 | 5 | *Please cite this paper if you use the code in this repository as part of a published research project.* 6 | 7 | Requirements: 8 | - Python (2.7). 9 | - Theano (0.9). 10 | - Numpy (1.13). 11 | - Keras (2.0). 12 | - Matplotlib (1.2) 13 | - Yaml (3.10). 14 | 15 | To run this code, you need to uncompress the MNIST dataset: 16 | ```sh 17 | $ unzip data/mnist.pkl.zip -d data/ 18 | $ unzip data/mnist_bin17.pkl.zip -d data/ 19 | ``` 20 | 21 | To generate *mnist-noise* and *mnist-img*, please see the file `mnist_manip.py`. 22 | 23 | The folder `config_yaml` contains [yaml](http://www.yaml.org/start.html) files to configure an experiment. For instance, this is the content of the yaml file to run an experiment using an mlp with 3 hidden layers: 24 | ```yaml 25 | corrupt_input_l: 0.0 26 | debug_code: false 27 | extreme_random: true 28 | h_ind: [false, false, true, false] 29 | h_w: 0.0 30 | hint: true 31 | max_epochs: 400 32 | model: train3_new_dup 33 | nbr_sup: 1000 34 | norm_gh: false 35 | norm_gsup: false 36 | repet: 0 37 | run: 0 38 | start_corrupting: 0 39 | start_hint: 110 40 | use_batch_normalization: [false, false, false, false] 41 | use_sparsity: false 42 | use_sparsity_in_pred: false 43 | use_unsupervised: false 44 | ``` 45 | To run this experiment on a GPU: 46 | ```sh 47 | $ THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train3_new_dup.py train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml 48 | ``` 49 | 50 | To use [Slurm](https://slurm.schedmd.com/), see the folder `jobs`. 
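As a quick sanity check of a configuration and of the uncompressed data, the following minimal sketch (not one of the training scripts) loads a yaml file from `config_yaml` and the MNIST pickle. It assumes the archive extracts to `data/mnist.pkl` and that the pickle follows the usual `(train, valid, test)` layout of `(images, labels)` pairs; the keys actually consumed are defined in `train3_new_dup.py` and `trainLenet.py`:

```python
# Minimal sketch (Python 2.7, as required above); not one of the training
# scripts. Assumes data/mnist.pkl exists after unzipping and holds the
# standard (train, valid, test) split of (images, labels) pairs.
import yaml
import cPickle as pkl

cfg_path = ("config_yaml/train3_new_dup_0_1000_3_0_0_0_0_0_"
            "False_False_False_False_False_110.yaml")
with open(cfg_path) as f:
    config = yaml.load(f)

print "model:", config["model"]
print "supervised samples:", config["nbr_sup"]
print "hint applied to layers (h_ind):", config["h_ind"]
print "epoch at which the hint starts:", config["start_hint"]

with open("data/mnist.pkl", "rb") as f:
    (x_tr, y_tr), (x_vl, y_vl), (x_ts, y_ts) = pkl.load(f)
print "train/valid/test sizes:", x_tr.shape[0], x_vl.shape[0], x_ts.shape[0]
```

The actual training entry points remain `train3_new_dup.py`, `train3_bin.py` and `trainLenet.py`, launched with the `THEANO_FLAGS` command shown above.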
-------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/__init__.py -------------------------------------------------------------------------------- /ae.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | import theano 4 | import theano.tensor as T 5 | from theano.tensor.shared_randomstreams import RandomStreams 6 | 7 | from basic_layer import Layer 8 | from non_linearities import NonLinearity, CostType, relu 9 | import numpy as np 10 | import cPickle as pkl 11 | 12 | from collections import OrderedDict 13 | 14 | theano.config.warn.subtensor_merge_bug = False 15 | 16 | 17 | class AEHiddenLayer(Layer): 18 | def __init__(self, 19 | input, 20 | n_in, 21 | n_out, 22 | n_in_dec=None, 23 | n_out_dec=None, 24 | W=None, 25 | b=None, 26 | num_pieces=1, 27 | bhid=None, 28 | activation=T.nnet.sigmoid, 29 | sparse_initialize=False, 30 | tied_weights=True, 31 | rng=None): 32 | """ 33 | Typical hidden layer for an auto-encoder: The units are fully connected 34 | and have sigmoidal activation function. Weight matrix (W) is of shape 35 | (n_in, n_out) and the bias vector (b) is of shape(n_out,). 36 | 37 | Hidden units activation is given by: sigmoid(dot(input, w)+ b) 38 | 39 | :type rng: numpy.random.RandomState 40 | :param rng: a random number generator used to initiaze the weights. 41 | 42 | :type input: theano.tensor.dmatrix 43 | :param input: a symbolic tensor of shape (n_examples, n_in) 44 | 45 | :type n_in: int 46 | :param n_in: dimension of the input 47 | 48 | :type n_out: int 49 | :param n_out: number of hidden units 50 | 51 | :type activation: theano.Op or function 52 | :param activation: Non linearity to be applied in the hidden layer. 53 | """ 54 | if rng is None: 55 | rng = np.random.RandomState() 56 | 57 | super(AEHiddenLayer, self).__init__( 58 | input=input, 59 | input1=None, 60 | input2=None, 61 | input_vl=None, 62 | n_in=n_in, 63 | n_out=n_out, 64 | num_pieces=num_pieces, 65 | activation=activation, 66 | sparse_initialize=sparse_initialize, 67 | rng=rng) 68 | 69 | self.reset_layer() 70 | 71 | if W is not None: 72 | self.W = W 73 | 74 | if b is not None: 75 | self.b = b 76 | 77 | if bhid is not None: 78 | self.b_prime = bhid 79 | else: 80 | if n_in_dec is not None: 81 | b_values = np.zeros((n_out_dec), dtype=theano.config.floatX) 82 | else: 83 | b_values = np.zeros( 84 | (self.n_in/num_pieces), dtype=theano.config.floatX) 85 | 86 | self.b_prime = theano.shared(value=b_values, name="b_prime") 87 | 88 | if tied_weights: 89 | self.W_prime = self.W.T 90 | else: 91 | if n_in_dec is not None and n_out_dec is not None: 92 | W_values = np.asarray( 93 | self.rng.normal(loc=0., 94 | scale=0.005, 95 | size=(n_out_dec, n_in_dec)), 96 | dtype=theano.config.floatX) 97 | else: 98 | if self.activation == theano.tensor.tanh: 99 | born = np.sqrt(6. / (self.n_in + self.n_out)) 100 | else: 101 | born = 4 * np.sqrt(6. 
/ (self.n_in + self.n_out)) 102 | W_values = np.asarray( 103 | self.rng.uniform( 104 | low=-born, 105 | high=born, 106 | size=(self.n_out, self.n_in)), 107 | dtype=theano.config.floatX) 108 | 109 | self.W_prime = theano.shared(value=W_values, name='W_prime', 110 | borrow=True) 111 | self.params += [self.W_prime] 112 | 113 | self.params += [self.b_prime] 114 | self.setup_outputs(input) 115 | 116 | def setup_outputs(self, input): 117 | lin_output = T.dot(input, self.W) + self.b 118 | self.output = ( 119 | lin_output if self.activation is None 120 | else self.activation(lin_output)) 121 | 122 | def get_outputs(self, input): 123 | self.setup_outputs(input) 124 | return self.output 125 | 126 | 127 | class Autoencoder(object): 128 | """ 129 | Typical implementation of an autoencoder. 130 | """ 131 | def __init__( 132 | self, 133 | input, 134 | nvis, 135 | nhid=None, 136 | nvis_dec=None, 137 | nhid_dec=None, 138 | rnd=None, 139 | bhid=None, 140 | cost_type=CostType.MeanSquared, 141 | momentum=1, 142 | num_pieces=1, 143 | L2_reg=-1, 144 | L1_reg=-1, 145 | sparse_initialize=False, 146 | nonlinearity=NonLinearity.TANH, 147 | W=None, 148 | b=None, 149 | bvis=None, 150 | tied_weights=True, 151 | reverse=False): 152 | 153 | assert reverse is False 154 | self.input = input 155 | self.nvis = nvis 156 | self.nhid = nhid 157 | self.bhid = bhid 158 | self.bvis = bvis 159 | self.momentum = momentum 160 | self.nonlinearity = nonlinearity 161 | self.tied_weights = tied_weights 162 | self.gparams = None 163 | self.reverse = reverse 164 | self.activation = self.get_non_linearity_fn() 165 | self.catched_params = {} 166 | 167 | if cost_type == CostType.MeanSquared: 168 | self.cost_type = CostType.MeanSquared 169 | elif cost_type == CostType.CrossEntropy: 170 | self.cost_type = CostType.CrossEntropy 171 | 172 | if rnd is None: 173 | self.rnd = np.random.RandomState(1231) 174 | else: 175 | self.rnd = rnd 176 | 177 | self.srng = RandomStreams(seed=1231) 178 | 179 | self.hidden = AEHiddenLayer(input=input, 180 | n_in=nvis, 181 | n_out=nhid, 182 | num_pieces=num_pieces, 183 | n_in_dec=nvis_dec, 184 | W=W, 185 | b=b, 186 | n_out_dec=nhid_dec, 187 | activation=self.activation, 188 | tied_weights=tied_weights, 189 | sparse_initialize=sparse_initialize, 190 | rng=rnd) 191 | 192 | self.params = self.hidden.params 193 | 194 | self.sparse_initialize = sparse_initialize 195 | 196 | self.L1_reg = L1_reg 197 | self.L2_reg = L2_reg 198 | 199 | self.L1 = 0 200 | self.L2 = 0 201 | 202 | if input is not None: 203 | self.x = input 204 | else: 205 | self.x = T.matrix('x_input', dtype=theano.config.floatX) 206 | 207 | def set_regularization_l1(self, L1_reg): 208 | if L1_reg != -1: 209 | self.L1 += abs(self.hidden.W).sum() 210 | if not self.tied_weights: 211 | self.L1 += abs(self.hidden.W_prime).sum() 212 | 213 | def set_regularization_l2(self, L2_reg): 214 | if L2_reg != -1: 215 | self.L2 += (self.hidden.W_prime**2).sum() 216 | if not self.tied_weights: 217 | self.L2 += (self.hidden.W**2).sum() 218 | 219 | def catch_params(self): 220 | for param in self.params: 221 | self.catched_params[param.name] = param.get_value() 222 | 223 | def nonlinearity_fn(self, d_in=None, recons=False): 224 | if self.nonlinearity == NonLinearity.SIGMOID: 225 | return T.nnet.sigmoid(d_in) 226 | elif self.nonlinearity == NonLinearity.RELU and not recons: 227 | return T.maximum(d_in, 0) 228 | elif self.nonlinearity == NonLinearity.RELU and recons: 229 | return T.nnet.softplus(d_in) 230 | elif self.nonlinearity == NonLinearity.TANH: 231 | return T.tanh(d_in) 232 
| elif self.nonlinearity is None: 233 | return d_in 234 | 235 | def get_non_linearity_fn(self): 236 | if self.nonlinearity == NonLinearity.SIGMOID: 237 | return T.nnet.sigmoid 238 | elif self.nonlinearity == NonLinearity.RELU: 239 | return relu 240 | elif self.nonlinearity == NonLinearity.TANH: 241 | return T.tanh 242 | elif self.nonlinearity is None: 243 | return None 244 | 245 | def encode(self, x_in=None, center=True): 246 | if x_in is None: 247 | x_in = self.x 248 | 249 | act = self.nonlinearity_fn(T.dot(x_in, self.hidden.W) + self.hidden.b) 250 | if center: 251 | act = act - act.mean(0) 252 | return act 253 | 254 | def encode_linear(self, x_in=None): 255 | if x_in is None: 256 | x_in = self.x_in 257 | 258 | lin_out = T.dot(x_in, self.hidden.W) + self.hidden.b 259 | return self.nonlinearity_fn(lin_out), lin_out 260 | 261 | def decode(self, h): 262 | return self.nonlinearity_fn( 263 | T.dot(h, self.hidden.W_prime) + self.hidden.b_prime) 264 | 265 | def get_rec_cost(self, x_rec, eyes=False): 266 | """ 267 | Returns the reconstruction cost. 268 | """ 269 | if self.cost_type == CostType.MeanSquared: 270 | return T.mean(((self.x - x_rec)**2).sum(axis=1)) 271 | elif self.cost_type == CostType.CrossEntropy: 272 | return T.mean( 273 | (T.nnet.binary_crossentropy(x_rec, self.x)).mean(axis=1)) 274 | 275 | def get_rec_cost_face(self, x_rec): 276 | """ 277 | Returns the reconstruction cost. 278 | """ 279 | d_eyes = ( 280 | (self.x[:, 37] - self.x[:, 46])**2 + 281 | (self.x[:, 37] - self.x[:, 46])**2).T 282 | if self.cost_type == CostType.MeanSquared: 283 | return T.mean(((self.x - x_rec)**2).sum(axis=1) / d_eyes) 284 | elif self.cost_type == CostType.CrossEntropy: 285 | return T.mean( 286 | (T.nnet.binary_crossentropy( 287 | x_rec, self.x)).mean(axis=1) / d_eyes) 288 | 289 | def kl_divergence(self, p, p_hat): 290 | return p * T.log(p) - T.log(p_hat) + (1-p) * T.log(1-p) -\ 291 | (1-p_hat) * T.log(1-p_hat) 292 | 293 | def sparsity_penality(self, h, sparsity_level=0.05, sparse_reg=1e-3, 294 | batch_size=-1): 295 | if batch_size == -1 or batch_size == 0: 296 | raise Exception("Invalid batch size") 297 | 298 | sparsity_level = T.extra_ops.repeat(sparsity_level, self.nhid) 299 | sparsity_penality = 0 300 | avg_act = h.mean(axis=0) 301 | kl_div = self.kl_divergence(sparsity_level, avg_act) 302 | sparsity_penality = sparse_reg * kl_div.sum() 303 | return sparsity_penality 304 | 305 | def act_grads(self, inputs): 306 | h, acts = self.encode_linear(inputs) 307 | h_grad = T.grad(h.sum(), acts) 308 | return (h, h_grad) 309 | 310 | def jacobian_h_x(self, inputs): 311 | h, act_grad = self.act_grads(inputs) 312 | jacobian = self.hidden.W * act_grad.dimshuffle(0, 'x', 1) 313 | return (h, T.reshape(jacobian, newshape=(self.nhid, self.nvis))) 314 | 315 | def compute_jacobian_h_x(self, inputs): 316 | inputs = theano.shared(inputs.flatten()) 317 | h = self.encode(inputs) 318 | # see later 319 | # h = h.faltten() 320 | # inputs = inputs.flatten() 321 | # inputs = T.reshape(inputs, newshape=(self.nvis)) 322 | J = theano.gradient.jacobian(h, inputs) 323 | return h, J 324 | 325 | def sample_one_step(self, x, sigma): 326 | # h, J_t = self.jacobian_h_x(x) 327 | h, J_t = self.compute_jacobian_h_x(x) 328 | eps = self.srng.normal(avg=0, size=(self.nhid, 1), std=sigma) 329 | jacob_w_eps = T.dot(J_t.T, eps) 330 | delta_h = T.dot(J_t, jacob_w_eps) 331 | perturbed_h = h + delta_h.T 332 | x = self.decode(perturbed_h) 333 | return x 334 | 335 | def sample_scan(self, x, sigma, n_steps, samples): 336 | # Enable on-the-fly graph 
computations 337 | # theano.config.compute_test_value = "raise" 338 | in_val = T.fmatrix("input_values") 339 | # in_val.tag.test_value = np.asarray( 340 | # np.random.rand(1, 784), dtype=theano.config.floatX) 341 | s_sigma = T.fscalr("sigma_values") 342 | # s_sigma = np.asarray( 343 | # np.random.rand(1), dtype=theano.config.floatX) 344 | mode = "FAST_RUN" 345 | values, updates = theano.scan(fn=self.sample_one_step, 346 | outputs_info=in_val, 347 | non_sequences=s_sigma, 348 | n_steps=n_steps, 349 | mode=mode) 350 | ae_sampler = theano.function(inputs=[in_val, s_sigma], 351 | outputs=values[-1], 352 | updates=updates) 353 | samples = ae_sampler(x, sigma) 354 | return samples 355 | 356 | def sample_old(self, x, sigma, n_steps): 357 | # Enable on-the-fly graph computations 358 | # theano.config.compute_test_value = "raise" 359 | # in_val = T.fmatrix('input_values") 360 | # in_val.tag.test_value = np.asarray( 361 | # np.random.rand(1, 784), dtype=theano.config.floatX) 362 | # s_sigma = T.fscalar("sigma_value") 363 | # s_sigma = np.asarray( 364 | # np.random.rand(1), dtype=theano.config.floatX) 365 | # mode = "FAST_RUN" 366 | samples = [] 367 | sample = x 368 | samples.append(x) 369 | for i in xrange(n_steps): 370 | print "Sample %d ..." % i 371 | sampler = self.sample_one_step(sample, sigma) 372 | sample = sampler.eval() 373 | samples.append(sample) 374 | return samples 375 | 376 | def get_sgd_updates(self, learning_rate, lr_scaler=1.0, batch_size=1, 377 | sparsity_level=-1, sparse_reg=-1, x_in=None): 378 | h = self.encode(x_in) 379 | x_rec = self.decode(h) 380 | cost = self.get_rec_cost(x_rec) 381 | 382 | if self.L1_reg != -1 and self.L1_reg is not None: 383 | cost += self.L1_reg * self.L1 384 | 385 | if self.L2_reg != -1 and self.L2_reg is not None: 386 | cost += self.L2_reg * self.L2 387 | 388 | if sparsity_level != -1 and sparse_reg != -1: 389 | sparsity_penal = self.sparsity_penality( 390 | h, sparsity_level, sparse_reg, batch_size) 391 | cost += sparsity_penal 392 | 393 | self.gparams = T.grad(cost, self.params) 394 | updates = OrderedDict({}) 395 | for param, gparam in zip(self.params, self.gparams): 396 | updates[param] = self.momentum * param - lr_scaler * \ 397 | learning_rate * gparam 398 | return (cost, updates, h, x_rec) 399 | 400 | def get_train_cost(self, batch_size=1, sparsity_level=-1, sparse_reg=-1, 401 | x_in=None, face=False): 402 | h = self.encode(x_in) 403 | x_rec = self.decode(h) 404 | cost = self.get_rec_cost(x_rec) 405 | 406 | if self.L1_reg != -1 and self.L1_reg is not None: 407 | cost += self.L1_reg * self.L1 408 | 409 | if self.L2_reg != -1 and self.L2_reg is not None: 410 | cost += self.L2_reg * self.L2 411 | 412 | if sparsity_level != -1 and sparse_reg != -1: 413 | sparsity_penal = self.sparsity_penality( 414 | h, sparsity_level, sparse_reg, batch_size) 415 | cost += sparsity_penal 416 | 417 | return (cost, h, x_rec) 418 | 419 | def get_train_cost_clean(self): 420 | h = self.encode(self.x) 421 | x_rec = self.decode(h) 422 | cost = self.get_rec_cost(x_rec) 423 | 424 | cost += self.L1 425 | cost += self.L2 426 | 427 | return cost 428 | 429 | def save_params(self, weights_file, catched=False): 430 | """Save the model's parameters.""" 431 | f_dump = open(weights_file, "w") 432 | params_vls = {} 433 | if catched: 434 | if self.catched_params != {}: 435 | params_vls = self.catched_params 436 | else: 437 | raise ValueError( 438 | "You asked to save catched params," + 439 | "but you didn't catch any!!!!!!!") 440 | else: 441 | for param in self.params: 442 | 
params_vls[param.name] = param.get_value() 443 | pkl.dump(params_vls, f_dump, protocol=pkl.HIGHEST_PROTOCOL) 444 | f_dump.close() 445 | 446 | def set_params_vals(self, weights_file): 447 | """Set the values of the parameters.""" 448 | with open(weights_file, 'r') as f: 449 | params_vls = pkl.load(f) 450 | for param in self.params: 451 | param.set_value(params_vls[param.name]) 452 | 453 | def fit(self, 454 | data=None, 455 | learning_rate=0.1, 456 | batch_size=100, 457 | n_epochs=20, 458 | lr_scalar=0.998, 459 | weights_file="out/ae_weights_mnist.npy"): 460 | """ 461 | Fit the data to the autoencoder (training). 462 | """ 463 | if data is None: 464 | raise Exception("Data can't be empty.") 465 | 466 | index = T.lscalar("index") 467 | data_shared = theano.shared( 468 | np.asarray(data, dtype=theano.config.floatX)) 469 | n_batches = data.shape[0] / batch_size 470 | (cost, updates) = self.get_sgd_updates( 471 | learning_rate, lr_scalar, batch_size) 472 | train_ae = theano.function( 473 | [index], cost, updates=updates, 474 | givens={ 475 | self.x: data_shared[index*batch_size: (index+1)*batch_size]}) 476 | 477 | print "Start training the ae." 478 | ae_costs = [] 479 | 480 | for epoch in xrange(n_epochs): 481 | print "Training at epoch %d" % epoch 482 | cost_one_epoch = [] 483 | for batch_index in xrange(n_batches): 484 | cost_one_epoch.append(train_ae(batch_index)) 485 | print "Training at epoch %d, %f" % (epoch, np.mean(cost_one_epoch)) 486 | ae_costs.append(np.mean(cost_one_epoch)) 487 | 488 | print "Saving files ..." 489 | self.save_params(weights_file) 490 | return ae_costs 491 | -------------------------------------------------------------------------------- /basic_layer.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | from __future__ import division 4 | import numpy as np 5 | import theano 6 | from theano import tensor as T 7 | import warnings 8 | 9 | 10 | from theano.tensor.signal import pool 11 | from theano.tensor.nnet import conv2d 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | from non_linearities import NonLinearity, CostType, relu, get_non_linearity_str 14 | 15 | 16 | def sharedX_value(value, name=None, borrow=None, dtype=None): 17 | """Share a single value after transforming it to floatX type. 18 | 19 | value: a value 20 | name: variable name (str) 21 | borrow: boolean 22 | dtype: the type of the value when shared. default: theano.config.floatX 23 | """ 24 | if dtype is None: 25 | dtype = theano.config.floatX 26 | return theano.shared( 27 | theano._asarray(value, dtype=dtype), name=name, borrow=borrow) 28 | 29 | 30 | class Layer(object): 31 | """ 32 | A general base layer class for neural network. 33 | for training, the layer takes a pair of samples (input1, input2). 34 | input1 and input2 belong to the same class. 35 | input: sample for the supervised part. 36 | input1: first samples 37 | input2: second sample 38 | intended_to_be_corrupted: boolean. If True, we create a corruptor 39 | for the input. This indicates that may be at some point in the 40 | future the inputs of this layer may be corrupted. 41 | corrupt_input_l: Float. If !=0., only the input1 and input2 will be 42 | corrupted. 43 | NOTE: 44 | Basically, only the input of the first layer is corrupted. There is 45 | no interest/reason in corrupting the intermediate inputs. 
46 | """ 47 | def __init__(self, 48 | input, 49 | input1, 50 | input2, 51 | input_vl, 52 | n_in, 53 | n_out, 54 | activation=T.nnet.sigmoid, 55 | sparse_initialize=False, 56 | num_pieces=1, 57 | non_zero_units=25, 58 | rng=None, 59 | hint="l1mean", 60 | use_hint=False, 61 | intended_to_be_corrupted=False, 62 | corrupt_input_l=0., 63 | use_sparsity=False, 64 | use_sparsity_in_pred=False, 65 | use_batch_normalization=False): 66 | 67 | assert hint is not None 68 | self.num_pieces = num_pieces 69 | self.corrupt_input_l = sharedX_value(corrupt_input_l, name="cor_l") 70 | self.intended_to_be_corrupted = intended_to_be_corrupted 71 | self.rng = np.random.RandomState(123) 72 | self.theano_rng = RandomStreams(self.rng.randint(2 ** 30)) 73 | self.input = input 74 | self.input1 = input1 # x1 75 | self.input2 = input2 # x2 76 | self.input_vl = input_vl # bn input used for validation. 77 | self.n_in = n_in 78 | self.n_out = n_out 79 | self.rng = rng 80 | self.sparse_initialize = sparse_initialize 81 | self.non_zero_units = non_zero_units 82 | self.W = None 83 | self.b = None 84 | self.sparser = None 85 | self.activation = activation 86 | self.hint = hint 87 | self.use_hint = use_hint 88 | self.use_sparsity = use_sparsity 89 | self.use_sparsity_in_pred = use_sparsity_in_pred 90 | self.use_batch_normalization = use_batch_normalization 91 | self.bn = None 92 | 93 | def reset_layer(self): 94 | """ 95 | initailize the layer's parameters to random. 96 | """ 97 | if self.W is None: 98 | if self.sparse_initialize: 99 | W_values = self.sparse_initialize_weights() 100 | else: 101 | if self.activation == theano.tensor.tanh: 102 | born = np.sqrt(6. / (self.n_in + self.n_out)) 103 | else: 104 | born = 4 * np.sqrt(6. / (self.n_in + self.n_out)) 105 | W_values = np.asarray(self.rng.uniform( 106 | low=-born, 107 | high=born, 108 | size=(self.n_in, self.n_out)), 109 | dtype=theano.config.floatX) 110 | 111 | self.W = theano.shared(value=W_values, name='W', borrow=True) 112 | 113 | if self.b is None: 114 | b_values = np.zeros(int(self.n_out/self.num_pieces), 115 | dtype=theano.config.floatX) 116 | self.b = theano.shared(value=b_values, name='b', borrow=True) 117 | 118 | if self.sparser is None: 119 | s_values = np.ones( 120 | int(self.n_out/self.num_pieces), dtype=theano.config.floatX) 121 | self.sparser = theano.shared(value=s_values, name='sparser', 122 | borrow=True) 123 | # The layer parameters 124 | self.params = [self.W, self.b] 125 | 126 | def get_corrupted_input(self, input): 127 | """This function keeps 1-self.corruption_input_l entries of the inputs 128 | the same and zero-out randomly selected subset of size 129 | self.coruption_input_l. 130 | 131 | """ 132 | return self.theano_rng.binomial(size=input.shape, n=1, 133 | p=1 - self.corrupt_input_l, 134 | dtype=theano.config.floatX) * input 135 | 136 | def sparse_initialization_weights(self): 137 | """ 138 | Implement the sparse initialization technique as described in 139 | J. Marten, 'Deep learning via Hessian-free optimization', ICML, 2010. 
140 | http://icml2010.haifa.il.ibm.com/papers/458.pdf 141 | """ 142 | W = [] 143 | mu, sigma = 0, 1/self.non_zero_units 144 | 145 | for i in xrange(self.n_in): 146 | row = np.zeros(self.n_out) 147 | non_zeros = self.rng.normal(mu, sigma, self.non_zero_units) 148 | # non_zeros /= non_zeros.sum() 149 | non_zero_idxs = self.rng.permutation( 150 | self.n_out)[0:self.non_zero_units] 151 | for j in xrange(self.non_zero_units): 152 | row[non_zero_idxs[j]] = non_zeros[j] 153 | W.append(row) 154 | W = np.asarray(W, dtype=theano.config.floatX) 155 | return W 156 | -------------------------------------------------------------------------------- /config_yaml/train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml: -------------------------------------------------------------------------------- 1 | corrupt_input_l: 0.0 2 | debug_code: false 3 | extreme_random: true 4 | h_ind: [false, false, true, false] 5 | h_w: 0.0 6 | hint: true 7 | max_epochs: 400 8 | model: train3_new_dup 9 | nbr_sup: 1000 10 | norm_gh: false 11 | norm_gsup: false 12 | repet: 0 13 | run: 0 14 | start_corrupting: 0 15 | start_hint: 110 16 | use_batch_normalization: [false, false, false, false] 17 | use_sparsity: false 18 | use_sparsity_in_pred: false 19 | use_unsupervised: false 20 | -------------------------------------------------------------------------------- /config_yaml/trainLenet_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml: -------------------------------------------------------------------------------- 1 | corrupt_input_l: 0.0 2 | debug_code: false 3 | extreme_random: true 4 | h_ind: [false, false, true, false] 5 | h_w: 0.0 6 | hint: true 7 | max_epochs: 400 8 | model: trainLenet 9 | nbr_sup: 1000 10 | norm_gh: false 11 | norm_gsup: false 12 | repet: 0 13 | run: 0 14 | start_corrupting: 0 15 | start_hint: 110 16 | use_batch_normalization: [false, false, false, false] 17 | use_sparsity: false 18 | use_sparsity_in_pred: false 19 | use_unsupervised: false 20 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/data/__init__.py -------------------------------------------------------------------------------- /data/cifar.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.datasets import cifar10 3 | import numpy as np 4 | import os 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import cPickle as pkl 8 | 9 | 10 | (x_train, y_train), (x_test, y_test) = cifar10.load_data() 11 | y_test = y_test.reshape(y_test.size) 12 | y_train = y_train.reshape(y_train.size) 13 | cl = np.unique(y_train) 14 | vl_ind = [] 15 | for i in cl: 16 | ind = np.argwhere(y_train == i) 17 | ind = ind.reshape(ind.size) 18 | for k in range(1000): 19 | np.random.shuffle(ind) 20 | vl_ind.extend(ind[:int(len(ind)/10)]) 21 | # debug 22 | # path = "./CIFAR10/" + str(i) + '/' 23 | # if not os.path.exists(path): 24 | # os.makedirs(path) 25 | # for k in ind: 26 | # fig = plt.figure() 27 | # plt.imshow(x_train[k]) 28 | # fig.savefig(path + str(k) + ".png") 29 | 30 | x_vl = x_train[vl_ind] 31 | y_vl = y_train[vl_ind] 32 | ind_tr = [] 33 | for i in range(x_train.shape[0]): 34 | if i not in vl_ind: 35 | ind_tr.append(i) 36 | for i in range(10000): 37 | np.random.shuffle(ind_tr) 38 | 39 | new_x_train = x_train[ind_tr] 40 | 
new_y_train = y_train[ind_tr] 41 | 42 | stuff = [(x_train, y_train), (x_vl, y_vl), (x_test, y_test)] 43 | for e in stuff: 44 | print e[0].shape, e[1].shape 45 | 46 | with open("cifar10.pkl", "w") as f: 47 | pkl.dump(stuff, f) 48 | -------------------------------------------------------------------------------- /data/mnist.pkl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/data/mnist.pkl.zip -------------------------------------------------------------------------------- /data/mnist_bin17.pkl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sbelharbi/learning-class-invariant-features/96338d071edb1e2e030373deaebd366c5a84b7c3/data/mnist_bin17.pkl.zip -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | from __future__ import division 4 | 5 | import pickle as pkl 6 | import math 7 | import numpy as np 8 | 9 | 10 | class Dataset(object): 11 | def __init__(self, is_binary=False): 12 | self.is_binary = is_binary 13 | 14 | # Examples 15 | self.Xtrain = None 16 | self.Xtest = None 17 | 18 | # Labels 19 | self.Ytrain = None 20 | self.Ytest = None 21 | 22 | self.Xtrain_pres = None 23 | self.Xtest_pres = None 24 | 25 | self.sparsity = 0.0 26 | self.n_examples = 0 27 | 28 | def _get_data(self, data_path): 29 | if data_path.endswith("pkl") or data_path.endswith("pickle"): 30 | data = pkl.load(open(data_path, "rb")) 31 | else: 32 | data = np.load(data_path) 33 | return data 34 | 35 | def binarize_labels(self, labels=None): 36 | # Largest label is for the images without different objects. 37 | last_lbl = np.max(labels) 38 | binarized_lbls = [] 39 | if self.is_binary: 40 | for label in labels: 41 | if label == last_lbl: 42 | binarized_lbls.append(0) 43 | else: 44 | binarized_lbls.append(1) 45 | return binarized_lbls 46 | 47 | def setup_dataset(self, data_path=None, train_split_scale=0.0): 48 | data = self._get_data(data_path) 49 | self.n_examples = data[0].shape[0] 50 | ntrain = math.floor(self.n_examples * train_split_scale) 51 | 52 | self.Xtrain = data[0][:ntrain] 53 | self.Xtrain_pres = data[2][:ntrain] 54 | self.Xtest = data[0][ntrain:] 55 | self.Xtest_pres = data[2][ntrain:] 56 | 57 | if train_split_scale != 0.0: 58 | self.Ytrain = np.array( 59 | self.binarize_labels(data[1][:ntrain].flatten()) 60 | if self.is_binary else data[1][:ntrain].flatten()) 61 | 62 | if train_split_scale != 1.0: 63 | self.Ytest = np.array( 64 | self.binarize_labels(data[1][ntrain:].flatten()) 65 | if self.is_binary else data[1][ntrain:].flatten()) 66 | 67 | def comp_sparsity(self): 68 | num_sparse_els = 0 69 | for el in self.Xtrain.flatten(): 70 | if el == 0: 71 | num_sparse_els += 1 72 | for el in self.Xtest.flatten(): 73 | if el == 0: 74 | num_sparse_els += 1 75 | self.sparsity = (num_sparse_els / self.n_examples) 76 | return self.sparsity 77 | -------------------------------------------------------------------------------- /exps/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the output of the experiments. 
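Once runs have finished, the results written here can be aggregated with `filterit.py` (next file), which takes the experiments folder and the number of supervised samples as arguments, e.g. `python filterit.py exps/ 1000`; it averages the validation and test errors over the matching runs after discarding the best and the worst one.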
-------------------------------------------------------------------------------- /filterit.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | from operator import eq 4 | import numpy as np 5 | import sys 6 | 7 | 8 | def get_vl_tst(f): 9 | with open(f, 'r') as ff: 10 | cont = ff.readlines() 11 | cont = [x.strip() for x in cont] 12 | # vl: 13 | vl = float(cont[1].split(":")[1].split(" ")[0]) 14 | tst = float(cont[2].split(":")[1].split(" ")[0]) 15 | return [vl, tst] 16 | 17 | 18 | def get_all_yamls_perfs(folder, h_ind=[1, 1, 1, 1], hint=[True, False], 19 | norm_gh=True, 20 | norm_gsup=True, nbr_sup=1000): 21 | """Collect all the yaml files and the performance files.""" 22 | h_ind_n = [str(i) for i in h_ind] 23 | h_ind = [bool(k) for k in h_ind] 24 | path_to_exps = folder 25 | list_exps = next(os.walk(path_to_exps))[1] 26 | list_exps = [e for e in list_exps if e.startswith("hint") or 27 | e.startswith("no")] 28 | list_exps = [path_to_exps + e for e in list_exps] 29 | # Start filtering 30 | filtered_list = [] 31 | list_start_hint = [] 32 | for d in list_exps: 33 | # Get the yaml file 34 | for file in os.listdir(d): 35 | if file.endswith(".yaml"): 36 | yaml_file = os.path.join(d, file) 37 | # print yaml_file 38 | # Satrt filtering ... 39 | # Read the yaml file 40 | with open(yaml_file, 'r') as y: 41 | yaml_cont = yaml.load(y) 42 | if yaml_cont["hint"] not in hint: 43 | continue 44 | if yaml_cont["norm_gh"] != norm_gh: 45 | continue 46 | if yaml_cont["norm_gsup"] != norm_gsup: 47 | continue 48 | if yaml_cont["nbr_sup"] != nbr_sup: 49 | continue 50 | if not all(map(eq, yaml_cont["h_ind"], h_ind)): 51 | continue 52 | # Get the per file. 53 | for file in os.listdir(d): 54 | if file.endswith(".txt"): 55 | perf_file = os.path.join(d, file) 56 | filtered_list.append(perf_file) 57 | list_start_hint.append(yaml_cont["start_hint"]) 58 | # No that you are done collecting the appropriate files. 59 | # COmpute the mean+-std 60 | vl, tst = [], [] 61 | for file in filtered_list: 62 | [v, t] = get_vl_tst(file) 63 | vl.append(v) 64 | tst.append(t) 65 | # remove the largest and smallest value (test error) 66 | comb = zip(vl, tst, list_start_hint) 67 | sorted_comb = sorted(comb, key=lambda tup: tup[1]) 68 | print "(vl, tst, start_hint)", len(comb) 69 | for el in sorted_comb: 70 | print el 71 | # remove the best and the worst. 72 | sorted_comb.pop(0) 73 | sorted_comb.pop(-1) 74 | vl, tst, list_start_hint = zip(*sorted_comb) 75 | # back to original lists. 
76 | m_vl = np.mean(vl) 77 | std_vl = np.std(vl) 78 | m_tst = np.mean(tst) 79 | std_tst = np.std(tst) 80 | print str(len(filtered_list)), "_".join(h_ind_n), " norm_gh:",\ 81 | str(norm_gh),\ 82 | " norm_gsup:", str(norm_gsup),\ 83 | " vl:", str(m_vl), "+-", str(std_vl), " tst:", str(m_tst), "+-",\ 84 | str(std_tst), "\n" 85 | 86 | inds = [[0, 0, 1, 0]] 87 | norm_gh = False 88 | norm_gsup = False 89 | hint = [True, False] 90 | nbr_sup = int(sys.argv[2]) 91 | path_exps = str(sys.argv[1]) 92 | 93 | for e in inds: 94 | get_all_yamls_perfs(path_exps, h_ind=e, hint=hint, norm_gh=norm_gh, 95 | norm_gsup=norm_gsup, nbr_sup=nbr_sup) 96 | -------------------------------------------------------------------------------- /generate_exps.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def get_name_exp_from_yaml(d): 6 | name = "" 7 | name = str(d["run"]) + "_" 8 | name += str(d["nbr_sup"]) + "_" 9 | name += str(len(d["h_ind"]) - 1) + "_" 10 | name += "_".join([str(int(k)) for k in d["h_ind"]]) + "_" 11 | name += str(d["repet"]) + "_" 12 | name += str(d["hint"]) + "_" 13 | name += str(d["norm_gsup"]) + "_" 14 | name += str(d["norm_gh"]) + "_" 15 | name += str(d["debug_code"]) + "_" 16 | name += str(d["use_unsupervised"]) 17 | 18 | return name 19 | 20 | 21 | def save_file(exp, rep, max_rep): 22 | # grad normalization 23 | # conf_norm = [(1, 0), (0, 1), (1, 1)] 24 | conf_norm = [(0, 0)] 25 | for c in conf_norm: 26 | exp["norm_gh"] = bool(c[0]) 27 | exp["norm_gsup"] = bool(c[1]) 28 | if rep == max_rep - 1: 29 | exp["debug_code"] = True 30 | print "Just forces debuge to TRUE *********" 31 | exp["debug_code"] = False 32 | name = get_name_exp_from_yaml(exp) 33 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 34 | yaml.dump(exp, fyaml) 35 | f.write("python " + runner + " " + name + ".yaml \n") 36 | # Default 37 | nbr_layers = 3 38 | use_unsupervised = False 39 | exp = {"debug_code": False, 40 | "nbr_sup": 1000, 41 | "run": 45, 42 | "h_ind": [False for i in range(nbr_layers+1)], 43 | "use_batch_normalization": [False for i in range(nbr_layers+1)], 44 | "corrupt_input_l": 0., 45 | "start_corrupting": 0, 46 | "use_sparsity": False, 47 | "use_sparsity_in_pred": False, 48 | "max_epochs": 400, 49 | "hint": False, 50 | "extreme_random": True, 51 | "norm_gsup": False, 52 | "norm_gh": False, 53 | "repet": 0, 54 | "use_unsupervised": use_unsupervised, 55 | "h_w": 1., 56 | "start_hint": 5 57 | } 58 | nbr_sup_ = [1000, 3000, 5000, 50000] 59 | h_w_vls = [.0, .0, .0, .0] 60 | start_hint_vl = [2, 2, 1, 1] 61 | run = 0 62 | fold_exps = "config_yaml" 63 | bash_name = "job0.sh" 64 | f = open(bash_name, "w") 65 | f.write("#!/usr/bin/env bash \n") 66 | runner = "train3_new_dup.py" 67 | max_rep = 7 68 | for nbr, h_w, start_hint in zip(nbr_sup_, h_w_vls, start_hint_vl): 69 | for rep in range(max_rep): 70 | print rep 71 | exp["nbr_sup"] = nbr 72 | # we need one run for an MLP without hint. 
73 | if rep == 0: 74 | exp["debug_code"] = False 75 | else: 76 | exp["debug_code"] = False 77 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 78 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 79 | exp["use_batch_normalization"][-2] = False 80 | exp["hint"] = False 81 | exp["run"] = run 82 | exp["repet"] = rep 83 | exp["norm_gh"] = False 84 | exp["norm_gsup"] = False 85 | exp["max_epochs"] = 2000 86 | exp["start_hint"] = 0 87 | exp["h_w"] = h_w 88 | name = get_name_exp_from_yaml(exp) 89 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 90 | yaml.dump(exp, fyaml) 91 | f.write("python " + runner + " " + name + ".yaml \n") 92 | 93 | exp["max_epochs"] = 400 94 | exp["debug_code"] = False 95 | # ******* Train inly the layer before the output. 96 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 97 | exp["h_ind"][-2] = True 98 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 99 | exp["use_batch_normalization"][-2] = True 100 | exp["hint"] = True 101 | exp["run"] = run 102 | exp["repet"] = rep 103 | exp["start_hint"] = start_hint 104 | save_file(exp, rep, max_rep) 105 | continue 106 | # ***** 107 | # Exclusive layers 108 | for i in range(nbr_layers+1): 109 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 110 | exp["h_ind"][i] = True 111 | exp["hint"] = True 112 | exp["run"] = run 113 | exp["repet"] = rep 114 | save_file(exp, rep, max_rep) 115 | 116 | # From input to output 117 | # exp["h_ind"] = [False for k in range(nbr_layers+1)] 118 | # exp["h_ind"][0] = True 119 | # for kk in range(1, nbr_layers+1): 120 | # exp["h_ind"][kk] = True 121 | # save_file(exp, rep, max_rep) 122 | # From output to input 123 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 124 | exp["h_ind"][-1] = True 125 | for kk in range(-2, -(nbr_layers+2), -1): 126 | exp["h_ind"][kk] = True 127 | save_file(exp, rep, max_rep) 128 | f.close() 129 | os.system("chmod +x " + bash_name) 130 | -------------------------------------------------------------------------------- /generate_exps_lenet.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def get_name_exp_from_yaml(d): 6 | name = "lenet_" 7 | name += str(d["run"]) + "_" 8 | name += str(d["nbr_sup"]) + "_" 9 | name += str(len(d["h_ind"]) - 1) + "_" 10 | name += "_".join([str(int(k)) for k in d["h_ind"]]) + "_" 11 | name += str(d["repet"]) + "_" 12 | name += str(d["hint"]) + "_" 13 | name += str(d["norm_gsup"]) + "_" 14 | name += str(d["norm_gh"]) + "_" 15 | name += str(d["debug_code"]) + "_" 16 | name += str(d["use_unsupervised"]) 17 | 18 | return name 19 | 20 | 21 | def save_file(exp, rep, max_rep): 22 | # grad normalization 23 | # conf_norm = [(1, 0), (0, 1), (1, 1)] 24 | conf_norm = [(0, 0)] 25 | for c in conf_norm: 26 | exp["norm_gh"] = bool(c[0]) 27 | exp["norm_gsup"] = bool(c[1]) 28 | if rep == max_rep - 1: 29 | exp["debug_code"] = True 30 | print "Just forces debuge to TRUE *********" 31 | exp["debug_code"] = False 32 | name = get_name_exp_from_yaml(exp) 33 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 34 | yaml.dump(exp, fyaml) 35 | f.write("python " + runner + " " + name + ".yaml \n") 36 | # Default 37 | nbr_layers = 3 38 | use_unsupervised = False 39 | exp = {"debug_code": False, 40 | "nbr_sup": 1000, 41 | "run": 45, 42 | "h_ind": [False for i in range(nbr_layers+1)], 43 | "use_batch_normalization": [False for i in range(nbr_layers+1)], 44 | "corrupt_input_l": 0., 45 | "start_corrupting": 0, 46 | "use_sparsity": False, 
47 | "use_sparsity_in_pred": False, 48 | "max_epochs": 400, 49 | "hint": False, 50 | "extreme_random": True, 51 | "norm_gsup": False, 52 | "norm_gh": False, 53 | "repet": 0, 54 | "use_unsupervised": use_unsupervised, 55 | "h_w": 1., 56 | "start_hint": 5 57 | } 58 | nbr_sup_ = [1000, 3000, 5000, 50000] 59 | h_w_vls = [.0, .0, .0, .0] 60 | start_hint_vl = [1, 1, 1, 1] 61 | run = 0 62 | fold_exps = "config_yaml" 63 | bash_name = "joblenet.sh" 64 | f = open(bash_name, "w") 65 | f.write("#!/usr/bin/env bash \n") 66 | runner = "trainLenet.py" 67 | max_rep = 7 68 | for nbr, h_w, start_hint in zip(nbr_sup_, h_w_vls, start_hint_vl): 69 | for rep in range(max_rep): 70 | print rep 71 | exp["nbr_sup"] = nbr 72 | # we need one run for an MLP without hint. 73 | if rep == 0: 74 | exp["debug_code"] = False 75 | else: 76 | exp["debug_code"] = False 77 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 78 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 79 | exp["use_batch_normalization"][-2] = False 80 | exp["hint"] = False 81 | exp["run"] = run 82 | exp["repet"] = rep 83 | exp["norm_gh"] = False 84 | exp["norm_gsup"] = False 85 | exp["max_epochs"] = 2000 86 | exp["start_hint"] = 0 87 | print h_w 88 | exp["h_w"] = h_w 89 | name = get_name_exp_from_yaml(exp) 90 | # with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 91 | # yaml.dump(exp, fyaml) 92 | # f.write("python " + runner + " " + name + ".yaml \n") 93 | 94 | # The layer just before the softmax. 95 | exp["max_epochs"] = 400 96 | exp["debug_code"] = False 97 | # ******* Train inly the layer before the output. 98 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 99 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 100 | exp["use_batch_normalization"][-2] = True 101 | exp["h_ind"][-2] = True 102 | exp["hint"] = True 103 | exp["run"] = run 104 | exp["repet"] = rep 105 | exp["start_hint"] = start_hint 106 | save_file(exp, rep, max_rep) 107 | continue 108 | 109 | # The output of the last cnn layer. 110 | exp["max_epochs"] = 400 111 | exp["debug_code"] = False 112 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 113 | exp["h_ind"][-3] = True 114 | exp["hint"] = True 115 | exp["run"] = run 116 | exp["repet"] = rep 117 | save_file(exp, rep, max_rep) 118 | 119 | # The last two layers beofre the softmax. 
120 | exp["max_epochs"] = 400 121 | exp["debug_code"] = False 122 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 123 | exp["h_ind"][-2] = True 124 | exp["h_ind"][-3] = True 125 | exp["hint"] = True 126 | exp["run"] = run 127 | exp["repet"] = rep 128 | save_file(exp, rep, max_rep) 129 | continue 130 | # ***** 131 | # Exclusive layers 132 | for i in range(nbr_layers+1): 133 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 134 | exp["h_ind"][i] = True 135 | exp["hint"] = True 136 | exp["run"] = run 137 | exp["repet"] = rep 138 | save_file(exp, rep, max_rep) 139 | 140 | # From input to output 141 | # exp["h_ind"] = [False for k in range(nbr_layers+1)] 142 | # exp["h_ind"][0] = True 143 | # for kk in range(1, nbr_layers+1): 144 | # exp["h_ind"][kk] = True 145 | # save_file(exp, rep, max_rep) 146 | # From output to input 147 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 148 | exp["h_ind"][-1] = True 149 | for kk in range(-2, -(nbr_layers+2), -1): 150 | exp["h_ind"][kk] = True 151 | save_file(exp, rep, max_rep) 152 | f.close() 153 | os.system("chmod +x " + bash_name) 154 | -------------------------------------------------------------------------------- /generate_exps_search.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | 4 | 5 | def get_name_exp_from_yaml(d): 6 | name = "" 7 | name = str(d["run"]) + "_" 8 | name += str(d["nbr_sup"]) + "_" 9 | name += str(len(d["h_ind"]) - 1) + "_" 10 | name += "_".join([str(int(k)) for k in d["h_ind"]]) + "_" 11 | name += str(d["repet"]) + "_" 12 | name += str(d["hint"]) + "_" 13 | name += str(d["norm_gsup"]) + "_" 14 | name += str(d["norm_gh"]) + "_" 15 | name += str(d["debug_code"]) + "_" 16 | name += str(d["use_unsupervised"]) + "_" 17 | name += str(d["start_hint"]) 18 | 19 | return name 20 | 21 | 22 | def save_file(exp, rep, max_rep): 23 | # grad normalization 24 | # conf_norm = [(1, 0), (0, 1), (1, 1)] 25 | conf_norm = [(0, 0)] 26 | for c in conf_norm: 27 | exp["norm_gh"] = bool(c[0]) 28 | exp["norm_gsup"] = bool(c[1]) 29 | print "Just forces debuge to TRUE *********" 30 | exp["debug_code"] = False 31 | name = get_name_exp_from_yaml(exp) 32 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 33 | yaml.dump(exp, fyaml) 34 | f.write("python " + runner + " " + name + ".yaml \n") 35 | # Default 36 | nbr_layers = 3 37 | use_unsupervised = False 38 | exp = {"debug_code": False, 39 | "nbr_sup": 1000, 40 | "run": 45, 41 | "h_ind": [False for i in range(nbr_layers+1)], 42 | "use_batch_normalization": [False for i in range(nbr_layers+1)], 43 | "corrupt_input_l": 0., 44 | "start_corrupting": 0, 45 | "use_sparsity": False, 46 | "use_sparsity_in_pred": False, 47 | "max_epochs": 400, 48 | "hint": False, 49 | "extreme_random": True, 50 | "norm_gsup": False, 51 | "norm_gh": False, 52 | "repet": 0, 53 | "use_unsupervised": use_unsupervised, 54 | "h_w": 1., 55 | "start_hint": 5 56 | } 57 | gpu = "p100.sl" 58 | nbr_sup_ = 1000 59 | h_w_vls = .0 60 | # start_hint_vl = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150] 61 | start_hint_vl = [10, 10, 10, 10, 10, 10, 10] 62 | run = 0 63 | fold_exps = "config_yaml" 64 | folder_jobs = "jobs" 65 | bash_name = "submit.sh" 66 | f = open(bash_name, "w") 67 | f.write("#!/usr/bin/env bash \n") 68 | runner = "train3_new_dup.py" 69 | max_rep = 7 70 | flags = "THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 " 71 | rep = 0 72 | fgpu = open(gpu, "r") 73 | gpu_cont = fgpu.read() 74 | 75 | for start_hint in start_hint_vl: 76 | 
exp["nbr_sup"] = nbr_sup_ 77 | exp["run"] = run 78 | exp["norm_gh"] = False 79 | exp["norm_gsup"] = False 80 | exp["start_hint"] = 0 81 | exp["h_w"] = h_w_vls 82 | 83 | exp["max_epochs"] = 400 84 | exp["debug_code"] = False 85 | # ******* Train inly the layer before the output. 86 | exp["h_ind"] = [False for k in range(nbr_layers+1)] 87 | exp["h_ind"][-2] = True 88 | exp["use_batch_normalization"] = [False for k in range(nbr_layers+1)] 89 | exp["use_batch_normalization"][-2] = False 90 | exp["hint"] = True 91 | exp["run"] = run 92 | exp["repet"] = rep 93 | exp["start_hint"] = start_hint 94 | name = get_name_exp_from_yaml(exp) 95 | with open(fold_exps+"/"+name+".yaml", "w") as fyaml: 96 | yaml.dump(exp, fyaml) 97 | name_job = str(start_hint) + "_" + str(nbr_sup_) + "_" + str(rep) + ".sl" 98 | with open(folder_jobs + "/" + name_job, "w") as fjob: 99 | fjob.write(gpu_cont + "\n") 100 | fjob.write(flags + " python " + runner + " " + name + ".yaml \n") 101 | # save_file(exp, rep, max_rep) 102 | f.write("sbatch ./" + folder_jobs + "/" + name_job + " \n") 103 | rep += 1 104 | 105 | f.close() 106 | fgpu.close() 107 | os.system("chmod +x " + bash_name) 108 | -------------------------------------------------------------------------------- /init_params/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the initial parameters of the models. -------------------------------------------------------------------------------- /job0.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | -------------------------------------------------------------------------------- /joblenet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python trainLenet.py lenet_0_1000_3_0_0_1_0_0_True_False_False_False_False.yaml 3 | python trainLenet.py lenet_0_1000_3_0_0_1_0_1_True_False_False_False_False.yaml 4 | python trainLenet.py lenet_0_1000_3_0_0_1_0_2_True_False_False_False_False.yaml 5 | python trainLenet.py lenet_0_1000_3_0_0_1_0_3_True_False_False_False_False.yaml 6 | python trainLenet.py lenet_0_1000_3_0_0_1_0_4_True_False_False_False_False.yaml 7 | python trainLenet.py lenet_0_1000_3_0_0_1_0_5_True_False_False_False_False.yaml 8 | python trainLenet.py lenet_0_1000_3_0_0_1_0_6_True_False_False_False_False.yaml 9 | python trainLenet.py lenet_0_3000_3_0_0_1_0_0_True_False_False_False_False.yaml 10 | python trainLenet.py lenet_0_3000_3_0_0_1_0_1_True_False_False_False_False.yaml 11 | python trainLenet.py lenet_0_3000_3_0_0_1_0_2_True_False_False_False_False.yaml 12 | python trainLenet.py lenet_0_3000_3_0_0_1_0_3_True_False_False_False_False.yaml 13 | python trainLenet.py lenet_0_3000_3_0_0_1_0_4_True_False_False_False_False.yaml 14 | python trainLenet.py lenet_0_3000_3_0_0_1_0_5_True_False_False_False_False.yaml 15 | python trainLenet.py lenet_0_3000_3_0_0_1_0_6_True_False_False_False_False.yaml 16 | python trainLenet.py lenet_0_5000_3_0_0_1_0_0_True_False_False_False_False.yaml 17 | python trainLenet.py lenet_0_5000_3_0_0_1_0_1_True_False_False_False_False.yaml 18 | python trainLenet.py lenet_0_5000_3_0_0_1_0_2_True_False_False_False_False.yaml 19 | python trainLenet.py lenet_0_5000_3_0_0_1_0_3_True_False_False_False_False.yaml 20 | python trainLenet.py lenet_0_5000_3_0_0_1_0_4_True_False_False_False_False.yaml 21 | python trainLenet.py lenet_0_5000_3_0_0_1_0_5_True_False_False_False_False.yaml 22 | python trainLenet.py 
lenet_0_5000_3_0_0_1_0_6_True_False_False_False_False.yaml 23 | python trainLenet.py lenet_0_50000_3_0_0_1_0_0_True_False_False_False_False.yaml 24 | python trainLenet.py lenet_0_50000_3_0_0_1_0_1_True_False_False_False_False.yaml 25 | python trainLenet.py lenet_0_50000_3_0_0_1_0_2_True_False_False_False_False.yaml 26 | python trainLenet.py lenet_0_50000_3_0_0_1_0_3_True_False_False_False_False.yaml 27 | python trainLenet.py lenet_0_50000_3_0_0_1_0_4_True_False_False_False_False.yaml 28 | python trainLenet.py lenet_0_50000_3_0_0_1_0_5_True_False_False_False_False.yaml 29 | python trainLenet.py lenet_0_50000_3_0_0_1_0_6_True_False_False_False_False.yaml 30 | -------------------------------------------------------------------------------- /jobs/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the [Slurm](https://slurm.schedmd.com/) jobs. -------------------------------------------------------------------------------- /jobs/110_1000_4_trainLenet_0_1000_3_0_0_0_0_4_False_False_False_False_False_110.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_p100 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory (MB) 35 | #SBATCH --mem-per-cpu 32000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | 51 | 52 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python trainLenet.py trainLenet_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml 53 | -------------------------------------------------------------------------------- /jobs/20_1000_0_train3_new_dup_0_1000_3_0_0_1_0_0_True_False_False_False_False_20.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_p100 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory 
(MB) 35 | #SBATCH --mem-per-cpu 32000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | 51 | 52 | THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python train3_new_dup.py train3_new_dup_0_1000_3_0_0_0_0_0_False_False_False_False_False_110.yaml 53 | -------------------------------------------------------------------------------- /k80.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_k80 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory (MB) 35 | #SBATCH --mem-per-cpu 3000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | # THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python trainLenet.py lenet_0_1000_3_0_0_1_0_0_True_False_False_False_False.yaml 51 | 52 | -------------------------------------------------------------------------------- /layer.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/caglar/autoencoders.git 2 | # http://www-etud.iro.umontreal.ca/~gulcehrc/ 3 | from __future__ import division 4 | import numpy as np 5 | import theano 6 | from theano import tensor as T 7 | import warnings 8 | 9 | 10 | from theano.tensor.signal import pool 11 | from theano.tensor.nnet import conv2d 12 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 13 | from basic_layer import Layer 14 | from ae import Autoencoder 15 | from non_linearities import NonLinearity, CostType, relu, get_non_linearity_str 16 | from normalization import BatchNormLayer 17 | 18 | 19 | def sharedX_value(value, name=None, borrow=None, dtype=None): 20 | """Share a single value after transforming it to floatX type. 21 | 22 | value: a value 23 | name: variable name (str) 24 | borrow: boolean 25 | dtype: the type of the value when shared. 
default: theano.config.floatX 26 | """ 27 | if dtype is None: 28 | dtype = theano.config.floatX 29 | return theano.shared( 30 | theano._asarray(value, dtype=dtype), name=name, borrow=borrow) 31 | 32 | 33 | class HiddenLayer(Layer): 34 | def __init__(self, input, input1, input2, input_vl, n_in, n_out, W=None, 35 | b=None, 36 | activation=T.tanh, rng=None, hint=None, use_hint=False, 37 | intended_to_be_corrupted=False, corrupt_input_l=0., 38 | use_sparsity=False, use_sparsity_in_pred=False, 39 | use_unsupervised=False, use_batch_normalization=False): 40 | """ 41 | Typical hidden layer of an MLP: units are fully connected and have 42 | tangente hyperbolic activation function. Weight matrix (W) is of shape 43 | (n_in, n_out) and the bias vector (b) is of shape (nout,). 44 | 45 | Hidden unit activation is given by: tanh(dot(input, w)+ b) 46 | 47 | :type rng: numpy.random.RandomState 48 | :param rng: a random number generator used to initiaze the weights. 49 | 50 | :type input: theano.tensor.dmatrix 51 | :param input: a symbolic tensor of shape (n_examples, n_in) 52 | 53 | :type n_in: int 54 | :param n_in: dimension of the input 55 | 56 | :type n_out: int 57 | :param n_out: number of hidden units 58 | 59 | :type activation: theano.Op or function 60 | :param activation: Non linearity to be applied in the hidden layer. 61 | """ 62 | if rng is None: 63 | rng = np.random.RandomState() 64 | 65 | super(HiddenLayer, self).__init__( 66 | input, input1, input2, input_vl, n_in, n_out, 67 | activation=activation, 68 | rng=rng, hint=hint, use_hint=use_hint, 69 | intended_to_be_corrupted=intended_to_be_corrupted, 70 | corrupt_input_l=corrupt_input_l, 71 | use_sparsity=use_sparsity, 72 | use_sparsity_in_pred=use_sparsity_in_pred, 73 | use_batch_normalization=use_batch_normalization) 74 | self.reset_layer() 75 | 76 | if W is not None: 77 | self.W = W 78 | 79 | if b is not None: 80 | self.b = b 81 | 82 | self.params = [self.W, self.b] 83 | if self.use_batch_normalization: 84 | # we normalize the output of the layer, not its input. 85 | # it does not matter the size of the minibatch (10). 86 | self.bn = BatchNormLayer([100, n_out]) 87 | 88 | self.setup_outputs(input) 89 | self.setup_outputs_vl(input_vl) 90 | self.setup_outputs1(input1) 91 | self.setup_outputs2(input2) 92 | # Create the associated auto-encoder: tied-wights AE. 93 | self.use_unsupervised = use_unsupervised 94 | self.ae = Autoencoder( 95 | input=input, nvis=n_in, nhid=n_out, cost_type=CostType.MeanSquared, 96 | nonlinearity=get_non_linearity_str(activation), W=self.W, b=self.b, 97 | tied_weights=True, reverse=False) 98 | 99 | def setup_outputs(self, input): 100 | # lin_output = T.dot(input, self.W) + self.b 101 | if self.intended_to_be_corrupted: 102 | warnings.warn("Input 1 Will be corrupted!!!!!!") 103 | lin_output = T.dot( 104 | self.get_corrupted_input(input), self.W) + self.b 105 | else: 106 | lin_output = T.dot(input, self.W) + self.b 107 | 108 | # Normalize the linear transformation, (if there is bn) 109 | if self.use_batch_normalization: 110 | assert self.bn is not None 111 | lin_output = self.bn.get_output_for( 112 | lin_output, deterministic=False, batch_norm_use_averages=False, 113 | batch_norm_update_averages=True) 114 | self.output = ( 115 | lin_output if self.activation is None 116 | else self.activation(lin_output)) 117 | if self.use_sparsity_in_pred: 118 | assert self.use_sparsity 119 | self.output = self.output * self.sparser 120 | 121 | def setup_outputs_vl(self, input): 122 | """Setup the outputs for the test. 
123 | Specifically for the batch normalization output. 124 | """ 125 | lin_output = T.dot(input, self.W) + self.b 126 | # Normalize the linear transformation. 127 | if self.use_batch_normalization: 128 | assert self.bn is not None 129 | lin_output = self.bn.get_output_for( 130 | lin_output, deterministic=False, batch_norm_use_averages=False, 131 | batch_norm_update_averages=True) 132 | self.output_vl = ( 133 | lin_output if self.activation is None 134 | else self.activation(lin_output)) 135 | if self.use_sparsity_in_pred: 136 | assert self.use_sparsity 137 | self.output = self.output * self.sparser 138 | 139 | def setup_outputs1(self, input): 140 | if self.intended_to_be_corrupted: 141 | warnings.warn("Input 1 Will be corrupted!!!!!!") 142 | lin_output = T.dot( 143 | self.get_corrupted_input(input), self.W) + self.b 144 | else: 145 | lin_output = T.dot(input, self.W) + self.b 146 | # Batch normalization 147 | if self.use_batch_normalization: 148 | assert self.bn is not None 149 | lin_output = self.bn.get_output_for( 150 | lin_output, deterministic=False, 151 | batch_norm_use_averages=False, 152 | batch_norm_update_averages=False) 153 | # We compute the distance over the linear transformation. 154 | # self.output1 = lin_output 155 | self.output1 = ( 156 | lin_output if self.activation is None 157 | else self.activation(lin_output)) 158 | if self.use_sparsity_in_pred: 159 | assert self.use_sparsity 160 | self.output1 = self.output1 * self.sparser 161 | 162 | def setup_outputs2(self, input): 163 | if self.intended_to_be_corrupted: 164 | warnings.warn("Input 2 Will be corrupted!!!!!!") 165 | lin_output = T.dot( 166 | self.get_corrupted_input(input), self.W) + self.b 167 | else: 168 | lin_output = T.dot(input, self.W) + self.b 169 | # Batch normalization 170 | if self.use_batch_normalization: 171 | assert self.bn is not None 172 | lin_output = self.bn.get_output_for( 173 | lin_output, deterministic=False, 174 | batch_norm_use_averages=False, 175 | batch_norm_update_averages=False) 176 | # We compute the distance over the linear transformation. 177 | # self.output2 = lin_output 178 | self.output2 = ( 179 | lin_output if self.activation is None 180 | else self.activation(lin_output)) 181 | if self.use_sparsity_in_pred: 182 | assert self.use_sparsity 183 | self.output2 = self.output2 * self.sparser 184 | 185 | def get_outputs(self, input): 186 | self.setup_outputs(input) 187 | return self.output 188 | 189 | def get_outputs1(self, input): 190 | self.setup_outputs1(input) 191 | return self.output1 192 | 193 | def get_outputs2(self, input): 194 | self.setup_outputs2(input) 195 | return self.output2 196 | 197 | def _squared_magn(self, x): 198 | """Returns the sum of the squared values of an array.""" 199 | return (x**2).sum(axis=1) 200 | 201 | def _magnitude(self, x): 202 | """Returns the magnitude of an array.""" 203 | return T.sqrt(T.maximum(self._squared_magn(x), 1e-7)) 204 | # np.finfo(theano.config.floatX).tiny)) 205 | 206 | def get_arc_cosine_penalty(self): 207 | """Calculate the arccosine distance in [0, 1]. 208 | 0: the two vectors are very similar. (have the same orientation) 209 | 1: the two vectors are very disimilar (have the opposite orientation). 210 | The cosine similarity does not take in consideration the magnitude 211 | of the vectors. It considers only thier orientation (angle). 212 | Therefore, two vectors are similar if they have the same angle. 
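        In symbols, for the two hidden representations h1 and h2 computed
        from the pair (x1, x2):
            cosine = <h1, h2> / (||h1|| * ||h2||)
            penalty = arccos(cosine) / pi, which lies in [0, 1].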
213 | See: https://en.wikipedia.org/wiki/Cosine_similarity 214 | """ 215 | # tiny value: 216 | # tiny = sharedX_value(np.finfo(dtype=theano.config.floatX).tiny, 217 | # name="tiny") 218 | # the gradient of sqrt at 0 is undefinded (nan). 219 | # use a tiny value instead of 0. 220 | # OLD SOLUTION 221 | # denom = T.sqrt( 222 | # T.sum(self.output1**2, axis=1) * T.sum(self.output2**2, axis=1)) 223 | # nomin = (self.output1 * self.output2).sum(1) 224 | # cosine = nomin/denom # the cosine betwen the two vectors 225 | # pi = sharedX_value(np.pi, name="pi") 226 | # minus1 = sharedX_value(-1., name="minus1") 227 | # plus1 = sharedX_value(1. - np.finfo(dtype=theano.config.floatX).eps, 228 | # name="plus1") 229 | # # Need to be clipped. accos() gives nan when sin is close to 1. 230 | # angle = T.arccos(T.clip( 231 | # cosine, minus1.get_value(), plus1.get_value()))/pi 232 | # OLD SOLUTION 233 | # plus1 = sharedX_value(1. - np.finfo(dtype=theano.config.floatX).eps, 234 | # name="plus1") 235 | pi = sharedX_value(np.pi, name="pi") 236 | cosine = T.clip(((self.output1 * self.output2).sum(axis=1) / ( 237 | self._magnitude(self.output1) * self._magnitude(self.output2))), 238 | -1, 1 - 1e-7) 239 | angle = T.clip(T.arccos(cosine) / pi, 0, 1) 240 | 241 | return angle 242 | 243 | def get_l2_penalty(self, ind=0): 244 | """calculate the Euclidean distance between the two outputs.""" 245 | dif = (self.output1 - self.output2) 246 | if self.use_sparsity: 247 | dif = dif * self.sparser 248 | if ind == 0: 249 | return (dif**2).sum(1) 250 | elif ind == 1: 251 | return (dif**2).mean(1) 252 | else: 253 | raise ValueError("ind error.") 254 | 255 | def get_l1_penalty(self, ind=0): 256 | """calculate the Manhattan distance between the two outputs.""" 257 | dif = (self.output1 - self.output2) 258 | if self.use_sparsity: 259 | dif = dif * self.sparser 260 | if ind == 0: 261 | return (abs(dif)).sum(1) 262 | elif ind == 1: 263 | return (abs(dif)).mean(1) 264 | else: 265 | raise ValueError("ind error.") 266 | 267 | def get_contrastive(self, sim, margin): 268 | distance = ((self.output1 - self.output2)**2).sum(1) 269 | converge = (1. - sim) * distance 270 | contraste = sim * T.maximum(0, margin - distance) 271 | 272 | return converge + contraste 273 | 274 | def get_divergence(self, sim, margin): 275 | distance = ((self.output1 - self.output2)**2).sum(1) ** (1/2.) 276 | contraste = sim * T.maximum(0, margin - distance) 277 | 278 | return contraste 279 | 280 | def insepct_get_l1_conv(self, sim, margin): 281 | return (1. - sim) * self.get_l1_penalty(ind=1) 282 | 283 | def inscpect_get_l1_div(self, sim, margin): 284 | distance = ((self.output1 - self.output2)**2).sum(1) 285 | contraste = sim * T.maximum(0, margin - distance) 286 | return contraste 287 | 288 | def inspect_get_l1_distance(self, sim, margin): 289 | distance = ((self.output1 - self.output2)**2).sum(1) 290 | d = sim * distance 291 | return d 292 | 293 | def get_penalty(self, sim, margin): 294 | if self.hint is "l1sum": 295 | return (1. - sim) * self.get_l1_penalty(ind=0) 296 | elif self.hint is "l1mean": 297 | return (1. - sim) * self.get_l1_penalty(ind=1) 298 | elif self.hint is "l2sum": 299 | return (1. - sim) * self.get_l2_penalty(ind=0) 300 | elif self.hint is "l2mean": 301 | return (1. - sim) * self.get_l2_penalty(ind=1) 302 | elif self.hint is "arccos": 303 | return (1. - sim) * self.get_arc_cosine_penalty() 304 | elif self.hint is "l1sumcos": 305 | return (1. 
- sim) * ( 306 | self.get_l1_penalty(ind=0) + self.get_arc_cosine_penalty()) 307 | elif self.hint is "l1meancos": 308 | return (1. - sim) * ( 309 | self.get_l1_penalty(ind=1) + self.get_arc_cosine_penalty()) 310 | elif self.hint is "l2sumcos": 311 | return (1. - sim) * ( 312 | self.get_l2_penalty(ind=0) + self.get_arc_cosine_penalty()) 313 | elif self.hint is "l2meancos": 314 | return (1. - sim) * ( 315 | self.get_l2_penalty(ind=0) + self.get_arc_cosine_penalty()) 316 | elif self.hint is "contrastive": 317 | return self.get_contrastive(sim, margin) 318 | elif self.hint is "divergence": 319 | return self.get_divergence(sim, margin) 320 | else: 321 | raise ValueError("self.hint uknonw!!!!") 322 | 323 | 324 | class LeNetConvPoolLayer_hint(HiddenLayer): 325 | """Pool Layer of a convolutional network """ 326 | 327 | def __init__(self, rng, input, input1, input2, input_vl, 328 | filter_shape, image_shape, poolsize=(2, 2), 329 | activation=T.tanh, hint="l1mean", 330 | use_hint=False, 331 | intended_to_be_corrupted=False, 332 | corrupt_input_l=0., 333 | use_sparsity=False, 334 | use_sparsity_in_pred=False, 335 | use_unsupervised=False, 336 | use_batch_normalization=False): 337 | """ 338 | Allocate a LeNetConvPoolLayer with shared variable internal parameters. 339 | 340 | :type rng: numpy.random.RandomState 341 | :param rng: a random number generator used to initialize weights 342 | 343 | :type input: theano.tensor.dtensor4 344 | :param input: symbolic image tensor, of shape image_shape 345 | 346 | :type filter_shape: tuple or list of length 4 347 | :param filter_shape: (number of filters, num input feature maps, 348 | filter height, filter width) 349 | 350 | :type image_shape: tuple or list of length 4 351 | :param image_shape: (batch size, num input feature maps, 352 | image height, image width) 353 | 354 | :type poolsize: tuple or list of length 2 355 | :param poolsize: the downsampling (pooling) factor (#rows, #cols) 356 | """ 357 | 358 | assert hint is not None 359 | assert image_shape[1] == filter_shape[1] 360 | self.corrupt_input_l = sharedX_value(corrupt_input_l, name="cor_l") 361 | self.intended_to_be_corrupted = intended_to_be_corrupted 362 | self.rng = np.random.RandomState(123) 363 | self.theano_rng = RandomStreams(self.rng.randint(2 ** 30)) 364 | self.input = input 365 | # keep track of model input 366 | self.input = input 367 | self.input1 = input1 # x1 368 | self.input2 = input2 # x2 369 | self.input_vl = input_vl # bn input used for validation. 370 | self.sparser = None 371 | self.activation = activation 372 | self.hint = hint 373 | self.use_hint = use_hint 374 | self.use_sparsity = use_sparsity 375 | self.use_sparsity_in_pred = use_sparsity_in_pred 376 | self.use_unsupervised = use_unsupervised 377 | self.ae = None # no need for cnn... for now. 378 | self.use_batch_normalization = use_batch_normalization 379 | self.bn = None 380 | # the bn is applied before the pooling. (and after the linear op.) 
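        # For a 'valid' convolution (no padding, unit stride), each output
        # feature map has spatial size (input_size - filter_size + 1) per
        # dimension; e.g. a 28x28 input with 5x5 filters gives 24x24 maps.
        # This is what map_size_h / map_size_w compute below.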
381 | # output_shape = [batch size, num output maps, img height, img width] 382 | map_size_h = (image_shape[2] - filter_shape[2] + 1) 383 | map_size_w = (image_shape[3] - filter_shape[3] + 1) 384 | output_shape = [image_shape[0], filter_shape[0], map_size_h, 385 | map_size_w] 386 | if self.use_batch_normalization: 387 | self.bn = BatchNormLayer(output_shape) 388 | # assert self.use_batch_normalization is False 389 | 390 | # there are "num input feature maps * filter height * filter width" 391 | # inputs to each hidden unit 392 | fan_in = np.prod(filter_shape[1:]) 393 | # each unit in the lower layer receives a gradient from: 394 | # "num output feature maps * filter height * filter width" / 395 | # pooling size 396 | fan_out = (filter_shape[0] * np.prod(filter_shape[2:]) // 397 | np.prod(poolsize)) 398 | # initialize weights with random weights 399 | W_bound = np.sqrt(6. / (fan_in + fan_out)) 400 | self.W = theano.shared( 401 | np.asarray( 402 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 403 | dtype=theano.config.floatX 404 | ), 405 | name="W", 406 | borrow=True 407 | ) 408 | 409 | # the bias is a 1D tensor -- one bias per output feature map 410 | b_values = np.zeros((filter_shape[0],), dtype=theano.config.floatX) 411 | self.b = theano.shared(value=b_values, name="b", borrow=True) 412 | 413 | # convolve input feature maps with filters 414 | conv_out = conv2d( 415 | input=self.input, 416 | filters=self.W, 417 | filter_shape=filter_shape, 418 | input_shape=image_shape 419 | ) 420 | 421 | conv_out1 = conv2d( 422 | input=self.input1, 423 | filters=self.W, 424 | filter_shape=filter_shape, 425 | input_shape=image_shape 426 | ) 427 | conv_out2 = conv2d( 428 | input=self.input2, 429 | filters=self.W, 430 | filter_shape=filter_shape, 431 | input_shape=image_shape 432 | ) 433 | conv_out_vl = conv2d( 434 | input=self.input_vl, 435 | filters=self.W, 436 | filter_shape=filter_shape, 437 | input_shape=image_shape 438 | ) 439 | # BN 440 | if self.bn is not None: 441 | conv_out = self.bn.get_output_for( 442 | conv_out, deterministic=False, 443 | batch_norm_use_averages=False, 444 | batch_norm_update_averages=True) 445 | conv_out1 = self.bn.get_output_for( 446 | conv_out1, deterministic=False, 447 | batch_norm_use_averages=False, 448 | batch_norm_update_averages=False) 449 | conv_out2 = self.bn.get_output_for( 450 | conv_out2, deterministic=False, 451 | batch_norm_use_averages=False, 452 | batch_norm_update_averages=False) 453 | conv_out_vl = self.bn.get_output_for( 454 | conv_out_vl, deterministic=False, 455 | batch_norm_use_averages=False, 456 | batch_norm_update_averages=True) 457 | # pool each feature map individually, using maxpooling 458 | pooled_out = pool.pool_2d( 459 | input=conv_out, 460 | ds=poolsize, 461 | ignore_border=True 462 | ) 463 | pooled_out1 = pool.pool_2d( 464 | input=conv_out1, 465 | ds=poolsize, 466 | ignore_border=True 467 | ) 468 | pooled_out2 = pool.pool_2d( 469 | input=conv_out2, 470 | ds=poolsize, 471 | ignore_border=True 472 | ) 473 | pooled_out_vl = pool.pool_2d( 474 | input=conv_out_vl, 475 | ds=poolsize, 476 | ignore_border=True 477 | ) 478 | # add the bias term. Since the bias is a vector (1D array), we first 479 | # reshape it to a tensor of shape (1, n_filters, 1, 1). 
Each bias will 480 | # thus be broadcasted across mini-batches and feature map 481 | # width & height 482 | self.output = activation( 483 | pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) 484 | 485 | self.output1_non_fl = activation( 486 | pooled_out1 + self.b.dimshuffle('x', 0, 'x', 'x')) 487 | self.output2_non_fl = activation( 488 | pooled_out2 + self.b.dimshuffle('x', 0, 'x', 'x')) 489 | self.output_vl = activation( 490 | pooled_out_vl + self.b.dimshuffle('x', 0, 'x', 'x')) 491 | 492 | self.output1 = self.output1_non_fl.flatten(2) 493 | self.output2 = self.output2_non_fl.flatten(2) 494 | if self.sparser is None: 495 | dim_h = int((image_shape[2] - filter_shape[1] + 1) / poolsize[0]) 496 | dim_w = int((image_shape[3] - filter_shape[1] + 1) / poolsize[1]) 497 | dim_out = filter_shape[0] * dim_h * dim_w 498 | s_values = np.ones( 499 | (dim_out), 500 | dtype=theano.config.floatX) 501 | self.sparser = theano.shared(value=s_values, name='sparser', 502 | borrow=True) 503 | 504 | # store parameters of this layer 505 | self.params = [self.W, self.b] 506 | 507 | 508 | class LogisticRegressionLayer(Layer): 509 | """ 510 | Multi-class logistic regression layer. 511 | The logistic regression is fully described by a weight matrix ::math:`W` 512 | and a bias vector ::math: `b`. Classification is done by projecting data 513 | points onto a set of hyperplanes, the distance to which is used to 514 | determine a class membership probablity. 515 | """ 516 | def __init__(self, input, n_in, n_out, is_binary=False, threshold=0.4, 517 | rng=None): 518 | """ 519 | Initialize the parameters of the logistic regression. 520 | :type input: theano.tensor.TensorType 521 | :param input: symbolic variable that describes the input of the 522 | architecture (one minibatch) 523 | :type n_in: int 524 | :param n_in: number of input units, the dimension of the space in which 525 | the datapoints lie 526 | :type n_out: int 527 | :param n_out: number of output units, the dimension of the space in 528 | which the labels lie (number of classes) 529 | """ 530 | self.activation = T.nnet.sigmoid 531 | self.threshold = threshold 532 | super(LogisticRegressionLayer, self).__init__( 533 | input, 534 | n_in, 535 | n_out, 536 | self.activation, 537 | rng) 538 | 539 | self.reset_layer() 540 | 541 | self.is_binary = is_binary 542 | if n_out == 1: 543 | self.is_binary = True 544 | # The number of classes 545 | self.n_classes_seen = np.zeros(n_out) 546 | # The number of the wrong classification madefor the class i 547 | self.n_wrong_classif_made = np.zeros(n_out) 548 | 549 | self.reset_conf_mat() 550 | 551 | # Compute vector class-membership probablities in symbolic form 552 | # self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W)+ self.b) 553 | self.p_y_given_x = self.get_class_memberships(self.input) 554 | 555 | if not self.is_binary: 556 | # Compute prediction as class whose probability is maximal 557 | # in symbolic form 558 | self.y_decision = T.argmax(self.p_y_given_x, axis=1) 559 | else: 560 | # If the probability is greater than the specified threshold 561 | # assign to the class 1, otherwise it is 0. Which alos can be 562 | # checked if p(y=1|x) > threshold. 563 | self.y_decision = T.gt(T.flatten(self.p_y_given_x), self.threshold) 564 | 565 | self.params = [self.W, self.b] 566 | 567 | def reset_conf_mat(self): 568 | """ 569 | Reset the confusion matrix. 
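        The confusion matrix is an (n_out, n_out) array of integers in
        which entry (i, j) counts the examples of true class i that were
        predicted as class j (see update_conf_mat).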
570 | """ 571 | self.conf_mat = np.zeros(shape=(self.n_out, self.n_out), 572 | dtype=np.dtype(int)) 573 | 574 | def negative_log_likelihood(self, y): 575 | """ 576 | Return the mean of the negative log-likelihood of the prediction 577 | of this model under a given target distribution. 578 | .. math:: 579 | \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) = 580 | \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|} 581 | \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\ 582 | \ell (\theta=\{W,b\}, \mathcal{D}) 583 | 584 | :type y: theano.tensor.TensorType 585 | :param y: corresponds to a vector that gives for each example 586 | the correct label. 587 | Note: We use the mean instead of the sum so that the learning rate 588 | is less dependent of the batch size. 589 | """ 590 | if self.is_binary: 591 | return -T.mean(T.log(self.p_y_given_x)) 592 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 593 | 594 | def crossentropy_categorical(self, y): 595 | """ 596 | Find the categorical cross entropy. 597 | """ 598 | return T.mean(T.nnet.categorical_crossentropy(self.p_y_given_x, y)) 599 | 600 | def crossentropy(self, y): 601 | """ 602 | use the theano nnet cross entropy function. Return the mean. 603 | Note: self.p_y_given_x is (batch_size, 1) but y is (batch_size,). 604 | In order to establish the compliance, we should flatten the 605 | p_y_given_x. 606 | """ 607 | return T.mean( 608 | T.nnet.binary_crossentropy(T.flatten(self.p_y_given_x), y)) 609 | 610 | def get_class_memberships(self, x): 611 | lin_activation = T.dot(x, self.W) + self.b 612 | if self.is_binary: 613 | # return the sigmoid value 614 | return T.nnet.sigmoid(lin_activation) 615 | # else retunr the softmax 616 | return T.nnet.softmax(lin_activation) 617 | 618 | def update_conf_mat(self, y, p_y_given_x): 619 | """ 620 | Update the confusion matrix with the given true labels and estimated 621 | labels. 622 | """ 623 | if self.n_out == 1: 624 | y_decision = (p_y_given_x > self.threshold) 625 | else: 626 | y_decision = np.argmax(p_y_given_x, axis=1) 627 | for i in xrange(y.shape[0]): 628 | self.conf_mat[y[i]][y_decision[i]] += 1 629 | 630 | def errors(self, y): 631 | """ 632 | returns a float representing the number of errors in the minibatch 633 | over the total number of examples of the minibatch. Zero one loss 634 | over the size of the minibatch. 635 | 636 | :type y: theano.tensor.TensorType 637 | :param y: corresponds to a vector that gives for each example the 638 | correct label. 639 | """ 640 | if y.ndim != self.y_decision.ndim: 641 | raise TypeError("y should have the same shape as self.y_decision", 642 | ('y', y.type, "y_decision", self.y_decision.type)) 643 | if y.dtype.startswith('int') or y.dtype.startswith('uint'): 644 | # The T.neq operator returns a vector of 0s and 1s, where: 645 | # 1 represents a mistake in classification 646 | return T.mean(T.neq(self.y_decision, y)) 647 | else: 648 | raise NotImplementedError() 649 | 650 | def raw_prediction_errors(self, y): 651 | """ 652 | Returns a binary array where each each element indicates if the 653 | corresponding sample has been correctly classified (0) or not (1) in 654 | the minibatch. 655 | 656 | :type y: theano.tensor.TensorType 657 | :param y: corresponds to a vector that gives for each example the 658 | correct label. 
659 | """ 660 | if y.ndim != self.y_decision.ndim: 661 | raise TypeError("y should have the same shape as self.y_decision", 662 | ('y', y.type, "y_decision", self.y_decision.type)) 663 | if y.dtype.startswith('int') or y.dtype.startswith('uint'): 664 | # The T.neq operator returns a vector of 0s and 1s, where: 665 | # 1 represents a mistake in classification 666 | return T.neq(self.y_decision, y) 667 | else: 668 | raise NotImplementedError() 669 | 670 | def error_per_calss(self, y): 671 | """ 672 | Return an array where each value is the error for the corresponding 673 | classe in the minibatch. 674 | 675 | :type y: theano.tensor.TensorType 676 | :param y: corresponds to a vector that gives for each example the 677 | correct label. 678 | """ 679 | if y.ndim != self.y_decision.ndim: 680 | raise TypeError("y should have the same shape as self.y_decision", 681 | ('y', y.type, "y_decision", self.y_decision.type)) 682 | if y.dtype.startswith('int') or y.dtype.startswith('uint'): 683 | y_decision_res = T.neq(self.y_decision, y) 684 | for (i, y_decision_r) in enumerate(y_decision_res): 685 | self.n_classes_seen[y[i]] += 1 686 | if y_decision_r: 687 | self.n_wrong_classif_made[y[i]] += 1 688 | pred_per_class = self.n_wrong_classif_made / self.n_classes_seen 689 | return T.mean(y_decision_res), pred_per_class 690 | else: 691 | raise NotImplementedError() 692 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | from theano import tensor as T 2 | import theano 3 | import numpy 4 | from theano.tensor.signal import downsample 5 | from theano.tensor.nnet import conv 6 | 7 | from layer import HiddenLayer 8 | from layer import LeNetConvPoolLayer_hint 9 | 10 | 11 | def relu(x): 12 | return T.switch(x > 0, x, 0) 13 | 14 | 15 | def sharedX_value(value, name=None, borrow=None, dtype=None): 16 | """Share a single value after transforming it to floatX type. 17 | value: a value 18 | name: variable name (str) 19 | borrow: boolean 20 | dtype: the type of the value when shared. default: theano.config.floatX 21 | """ 22 | if dtype is None: 23 | dtype = theano.config.floatX 24 | return theano.shared( 25 | theano._asarray(value, dtype=dtype), name=name, borrow=borrow) 26 | 27 | 28 | class IdentityHiddenLayer(object): 29 | """ 30 | This is the identity layer. It takes the input and give it back as output. 31 | We will be using this layer just after the last convolution layer to applay 32 | a dropout. 33 | """ 34 | def __init__(self, rng, input): 35 | self.input = input 36 | self.W = None 37 | self.b = None 38 | self.params = [] 39 | self.output = input 40 | 41 | 42 | def dropout_from_layer(rng, layer_output, p): 43 | """ 44 | p: float. The probablity of dropping a unit. 45 | """ 46 | srng = theano.tensor.shared_randomstreams.RandomStreams( 47 | rng.randint(99999)) 48 | one = T.constant(1) 49 | retain_prob = one - p 50 | mask = srng.binomial(n=1, p=retain_prob, size=layer_output.shape, 51 | dtype=layer_output.dtype) 52 | output = layer_output * mask 53 | 54 | return output 55 | 56 | 57 | def localResponseNormalizationCrossChannel(incoming, alpha=1e-4, 58 | k=2, beta=0.75, n=5): 59 | """ 60 | Implement the local response normalization cross the channels described 61 | in , 62 | A.Krizhevsky et al. sec.3.3. 
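    Each activation a_i (at a fixed spatial position) is rescaled as
        a_i / (k + alpha * sum_j a_j ** 2) ** beta,
    where the sum runs over the n channels centred on channel i; this is
    the loop over `input_sqr` below.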
63 | Reference of the code: 64 | https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/ 65 | normalization.py 66 | https://github.com/lisa-lab/pylearn2/blob/master/pylearn2/expr/normalize.py 67 | Parameters: 68 | incomping: The feature maps. (output of the convolution layer). 69 | alpha: float scalar 70 | k: float scalr 71 | beta: float scalar 72 | n: integer: number of adjacent channels. Must be odd. 73 | """ 74 | if n % 2 == 0: 75 | raise NotImplementedError("Works only with odd n") 76 | 77 | input_shape = incoming.shape 78 | half_n = n // 2 79 | input_sqr = T.sqr(incoming) 80 | b, ch, r, c = input_shape 81 | extra_channels = T.alloc(0., b, ch + 2*half_n, r, c) 82 | input_sqr = T.set_subtensor(extra_channels[:, half_n:half_n+ch, :, :], 83 | input_sqr) 84 | scale = k 85 | for i in range(n): 86 | scale += alpha * input_sqr[:, i:i+ch, :, :] 87 | scale = scale ** beta 88 | 89 | return incoming / scale 90 | 91 | 92 | class LRNCCIdentityLayer(IdentityHiddenLayer): 93 | def __init__(self, input, alpha=1e-4, k=2, beta=0.75, n=5): 94 | super(LRNCCIdentityLayer, self).__init__(rng=None, input=input) 95 | self.output = localResponseNormalizationCrossChannel( 96 | incoming=self.output, alpha=alpha, k=k, beta=beta, n=n) 97 | 98 | 99 | class DropoutIdentityHiddenLayer(IdentityHiddenLayer): 100 | def __init__(self, rng, input, dropout_rate, rescale): 101 | """ 102 | rescale: Boolean. Can be only used when applying dropout. 103 | """ 104 | if rescale: 105 | one = T.constant(1) 106 | retain_prob = one - dropout_rate 107 | input /= retain_prob 108 | 109 | super(DropoutIdentityHiddenLayer, self).__init__(rng=rng, input=input) 110 | if dropout_rate > 0.: 111 | self.output = dropout_from_layer(rng, self.output, p=dropout_rate) 112 | 113 | 114 | class DropoutHiddenLayer(HiddenLayer): 115 | def __init__(self, rng, input, n_in, n_out, dropout_rate, rescale, 116 | W=None, b=None, b_v=0., activation=None): 117 | """ 118 | rescale: Boolean. Can be only used when applying dropout. 119 | """ 120 | if rescale: 121 | one = T.constant(1) 122 | retain_prob = one - dropout_rate 123 | input /= retain_prob 124 | 125 | super(DropoutHiddenLayer, self).__init__( 126 | input=input, n_in=n_in, n_out=n_out, W=W, b=b, 127 | activation=activation, rng=rng) 128 | if dropout_rate > 0.: 129 | self.output = dropout_from_layer(rng, self.output, p=dropout_rate) 130 | 131 | 132 | class LeNetConvPoolLayer(object): 133 | def __init__(self, rng, input, filter_shape, image_shape, 134 | poolsize=(2, 2), maxout=False, poolmaxoutfactor=2, 135 | W=None, b=None, b_v=0., stride=(1, 1), LRN={ 136 | "app": False, "before": False, "alpha": 1e-4, "k": 2, 137 | "beta": 0.75, "n": 5}): 138 | """ 139 | Input: 140 | maxout: Boolean. Indicates if to do or not a maxout. 141 | poolmaxoutfactor: How many feature maps to maxout. The number of 142 | input feature maps must be a multiple of poolmaxoutfactor. 143 | allow_dropout_conv: Boolean. Allow or not the dropout in conv. 144 | layer. This maybe helpful when we want to use dropout only 145 | for fully connected layers. 146 | LRN: tuple (a, b) of booleans. a: apply or not the local response 147 | normalization. b: before (True) or after (False) the pooling. 148 | b_v: float. The initial value of the bias. 
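        Note: as used in this implementation, LRN is passed as a dict with
        the keys "app" (apply the normalization or not), "before" (apply
        it before (True) or after (False) the pooling) and the
        hyper-parameters "alpha", "k", "beta" and "n", as in the default
        value shown in the signature above.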
149 | """ 150 | self.LRNCCIdentityLayer = None 151 | if maxout: 152 | assert poolmaxoutfactor == 2 153 | assert image_shape[1] == filter_shape[1] 154 | self.input = input 155 | 156 | fan_in = numpy.prod(filter_shape[1:]) 157 | fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / 158 | numpy.prod(poolsize)) 159 | # initialize weights with random weights 160 | W_bound = numpy.sqrt(6. / (fan_in + fan_out)) 161 | if W is None: 162 | W = theano.shared( 163 | numpy.asarray( 164 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 165 | dtype=theano.config.floatX 166 | ), 167 | name="w_conv", 168 | borrow=True 169 | ) 170 | if b is None: 171 | b_v = ( 172 | numpy.ones( 173 | (filter_shape[0],)) * b_v).astype(theano.config.floatX) 174 | b = theano.shared(value=b_v, name="b_conv", borrow=True) 175 | 176 | self.W = W 177 | self.b = b 178 | conv_out = conv.conv2d( 179 | input=input, 180 | filters=self.W, 181 | filter_shape=filter_shape, 182 | image_shape=image_shape, 183 | subsample=stride 184 | ) 185 | # Local reponse normalization 186 | if LRN["app"] and LRN["before"]: 187 | self.LRNCCIdentityLayer = LRNCCIdentityLayer( 188 | conv_out, alpha=LRN["alpha"], k=LRN["k"], beta=LRN["beta"], 189 | n=LRN["n"]) 190 | conv_out = self.LRNCCIdentityLayer.output 191 | print "LRN BEFORE pooling ..." 192 | 193 | if maxout: 194 | z = T.add(conv_out, self.b.dimshuffle('x', 0, 'x', 'x')) 195 | s = None 196 | for i in range(filter_shape[0]/poolmaxoutfactor): 197 | t = z[:, i::poolmaxoutfactor, :, :] 198 | if s is None: 199 | s = t 200 | else: 201 | s = T.maximum(s, t) 202 | z = s 203 | if poolsize not in [None, (1, 1)]: 204 | pooled_out = downsample.max_pool_2d( 205 | input=z, 206 | ds=poolsize, 207 | ignore_border=True 208 | ) 209 | self.output = pooled_out 210 | else: 211 | self.output = z 212 | else: 213 | if poolsize not in [None, (1, 1)]: 214 | pooled_out = downsample.max_pool_2d( 215 | input=conv_out, 216 | ds=poolsize, 217 | ignore_border=True 218 | ) 219 | self.output = relu( 220 | pooled_out + self.b.dimshuffle('x', 0, 'x', 'x')) 221 | print "RELU..." 222 | else: 223 | # simple relu 224 | term = conv_out + self.b.dimshuffle('x', 0, 'x', 'x') 225 | self.output = T.switch(term > 0, term, 0 * term) 226 | print "RELU..." 227 | 228 | # Local reponse normalization 229 | if LRN["app"] and not LRN["before"]: 230 | self.LRNCCIdentityLayer = LRNCCIdentityLayer( 231 | self.output, alpha=LRN["alpha"], k=LRN["k"], beta=LRN["beta"], 232 | n=LRN["n"]) 233 | self.output = self.LRNCCIdentityLayer.output 234 | print "LRN AFTER activation(of pooling)..." 
235 | 236 | self.params = [self.W, self.b] 237 | 238 | 239 | class DropoutLeNetConvPoolLayer(LeNetConvPoolLayer): 240 | def __init__(self, rng, input, filter_shape, image_shape, dropout_rate, 241 | rescale, poolsize=(2, 2), stride=(1, 1), 242 | LRN={ 243 | "app": False, "before": False, "alpha": 1e-4, "k": 2, 244 | "beta": 0.75, "n": 5}, 245 | maxout=False, poolmaxoutfactor=2, W=None, b=None, b_v=0.): 246 | if rescale: 247 | one = T.constant(1) 248 | retain_prob = one - dropout_rate 249 | input /= retain_prob 250 | super(DropoutLeNetConvPoolLayer, self).__init__( 251 | rng=rng, input=input, filter_shape=filter_shape, 252 | image_shape=image_shape, poolsize=poolsize, stride=stride, 253 | LRN=LRN, maxout=maxout, poolmaxoutfactor=poolmaxoutfactor, 254 | W=W, b=b, b_v=b_v) 255 | if dropout_rate > 0.: 256 | self.output = dropout_from_layer(rng, self.output, p=dropout_rate) 257 | -------------------------------------------------------------------------------- /learning_rate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | 4 | 5 | class AnnealedLearningRate(object): 6 | """A callback to adjust the learning rate on each freq (batch or epoch). 7 | 8 | The learning rate will be annealed by 1/t at each freq. 9 | Parameters: 10 | anneal_start: int 11 | the epoch when to start annealing. 12 | """ 13 | def __init__(self, anneal_start, freq='epoch'): 14 | self._initialized = False 15 | self._count = 0. 16 | self._anneal_start = anneal_start 17 | self.freq = freq 18 | 19 | def __call__(self, learning_rate): 20 | """Updates the learning rate according to the annealing schedule. 21 | 22 | """ 23 | if not self._initialized: 24 | self._base = learning_rate.get_value() 25 | self._initialized = True 26 | self._count += 1 27 | learning_rate.set_value( 28 | np.cast[theano.config.floatX](self.get_current_learning_rate())) 29 | 30 | def get_current_learning_rate(self): 31 | """Calculate the current learning rate according to the annealing 32 | schedule. 33 | 34 | """ 35 | return self._base * min(1, self._anneal_start / self._count) 36 | 37 | 38 | class ExponentialDecayLearningRate(object): 39 | """ 40 | This anneals the learning rate by dviding it by decay_factor after 41 | each update (freq='batch'). 42 | 43 | lr = lr * decay_factor**(-t) 44 | Parameters: 45 | decay_factor: float 46 | de the decay factor 47 | min_lr: float 48 | The lr will be fixed to min_lr when it's reached. 49 | """ 50 | def __init__(self, decay_factor, min_lr): 51 | self._count = 0 52 | self._min_reached = False 53 | self.min_lr = min_lr 54 | self.decay_factor = decay_factor 55 | self.freq = 'batch' 56 | 57 | def __call__(self, learning_rate): 58 | """Update the learning rate according to the exponential decay 59 | schedule. 
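        For example, with decay_factor=1.02 and a base rate of 0.1, the
        rate after t updates is 0.1 * 1.02 ** (-t); once it falls to
        min_lr, it stays fixed at that value.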
60 | 
61 |         """
62 |         if self._count == 0.:
63 |             self._base_lr = learning_rate.get_value()
64 |         self._count += 1
65 | 
66 |         if not self._min_reached:
67 |             new_lr = self._base_lr * (self.decay_factor ** (-self._count))
68 |             if new_lr <= self.min_lr:
69 |                 self._min_reached = True
70 |                 new_lr = self.min_lr
71 |         else:
72 |             new_lr = self.min_lr
73 | 
74 |         learning_rate.set_value(np.cast[theano.config.floatX](new_lr))
75 | 
-------------------------------------------------------------------------------- /learning_rule.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jan 21 08:28:21 2015
4 | 
5 | @author: Soufiane Belharbi
6 | """
7 | import theano
8 | import theano.tensor as T
9 | import numpy as np
10 | from collections import OrderedDict
11 | 
12 | from tools import sharedX_value, sharedX_mtx
13 | from tools import floatX
14 | 
15 | 
16 | def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
17 |     """Max weight norm constraints and gradient clipping
18 | 
19 |     This takes a TensorVariable and rescales it so that incoming weight
20 |     norms are below a specified constraint value. Vectors violating the
21 |     constraint are rescaled so that they are within the allowed range.
22 | 
23 |     Parameters
24 |     ----------
25 |     tensor_var : TensorVariable
26 |         Theano expression for update, gradient, or other quantity.
27 |     max_norm : scalar
28 |         This value sets the maximum allowed value of any norm in
29 |         `tensor_var`.
30 |     norm_axes : sequence (list or tuple)
31 |         The axes over which to compute the norm. This overrides the
32 |         default norm axes defined for the number of dimensions
33 |         in `tensor_var`. When this is not specified and `tensor_var` is a
34 |         matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
35 |         5D tensor, it is set to a tuple listing all axes but axis 0. The
36 |         former default is useful for working with dense layers, the latter
37 |         is useful for 1D, 2D and 3D convolutional layers.
38 |         (Optional)
39 |     epsilon : scalar, optional
40 |         Value used to prevent numerical instability when dividing by
41 |         very small or zero norms.
42 | 
43 |     Credit:
44 |     https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py
45 | 
46 |     Returns
47 |     -------
48 |     TensorVariable
49 |         Input `tensor_var` with rescaling applied to weight vectors
50 |         that violate the specified constraints.
51 | 
52 | 
53 |     Notes
54 |     -----
55 |     When `norm_axes` is not specified, the axes over which the norm is
56 |     computed depend on the dimensionality of the input variable. If it is
57 |     2D, it is assumed to come from a dense layer, and the norm is computed
58 |     over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
59 |     convolutional layer and the norm is computed over all trailing axes
60 |     beyond axis 0. For other uses, you should explicitly specify the axes
61 |     over which to compute the norm using `norm_axes`.
62 |     """
63 |     ndim = tensor_var.ndim
64 | 
65 |     if norm_axes is not None:
66 |         sum_over = tuple(norm_axes)
67 |     elif ndim == 2:  # DenseLayer
68 |         sum_over = (0,)
69 |     elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
70 |         sum_over = tuple(range(1, ndim))
71 |     else:
72 |         raise ValueError(
73 |             "Unsupported tensor dimensionality {}."
74 | "Must specify `norm_axes`".format(ndim) 75 | ) 76 | 77 | dtype = np.dtype(theano.config.floatX).type 78 | norms = T.sqrt(T.sum(T.sqr(tensor_var), axis=sum_over, keepdims=True)) 79 | target_norms = T.clip(norms, 0, dtype(max_norm)) 80 | constrained_output = \ 81 | (tensor_var * (target_norms / (dtype(epsilon) + norms))) 82 | 83 | return constrained_output 84 | 85 | 86 | class LearningRule(): 87 | """ A `LearningRule` is a class that calculates the new parameters value 88 | using: 89 | a learning rate, the current parameters value and the current gradient. 90 | 91 | """ 92 | def get_updates(self, learning_rate, params, grads, lr_scalers): 93 | """ Compute the current updates for the parameters. 94 | 95 | """ 96 | 97 | raise NotImplementedError( 98 | str(type(self)) + " does not implement get_updates.") 99 | 100 | 101 | class Momentum(LearningRule): 102 | """Implementation of the momentum as in the method described in section 103 | 9 of [1]:'A Practical Guide to Training Restricted Boltzmann Machines', 104 | bu Geoffrey Hinton.(https://www.cs.toronto.edu/~hinton/absps/guideTR.pdf) 105 | We implemented alos the formula presented in Imagenet paper: 106 | , 107 | A.Krizhevsky et al. . 108 | More details in: 109 | [2]'On the importance of initialization and momentum in deep learning', 110 | I. Sutskever et al. 111 | [3]'Advances in optimizating recurrent networks', Y. Bengio et al. 112 | 113 | The model's parametes are updated such as: 114 | velocity_(t+1) := momentum * velocity_t - 115 | learning_rate * d cost / d param_t 116 | param_(t+1) := param_t + velocity_(t+1) 117 | 118 | Parameters: 119 | init_momentum: float 120 | Initial value of the momentum coefficient. It remains fisex unless 121 | used with 'MomentumAdjuster'. 122 | nesterov_momentum: boolean 123 | If True, uses the accelerated momentum technique described in [2,3] 124 | max_colm_norm: Boolean. The incoming weight vector corresponding to 125 | each hidden unit is constrained to have a maximum squared length of 126 | max_norm. 127 | max_norm: Float. The maximum norm. 128 | """ 129 | def __init__(self, init_momentum, nesterov_momentum=False, 130 | imagenet=False, imagenetDecay=5e-4, max_colm_norm=False, 131 | max_norm=15.0): 132 | assert init_momentum >= 0., 'The initial momentum should be >=0.' 133 | assert init_momentum < 1., 'The initial momentum should be < 1.' 134 | 135 | self.momentum = sharedX_value(value=init_momentum, name="momentum", 136 | borrow=True) 137 | self.nesterov_momentum = nesterov_momentum 138 | self._first_time = True 139 | self.velocity = None # tracks the velocity at the previous time 140 | self.imagenet = imagenet 141 | self.imagenetDecay = sharedX_value(value=imagenetDecay, 142 | name="imagenetDecay", 143 | borrow=True) 144 | self.max_colm_norm = max_colm_norm 145 | self.max_norm = max_norm 146 | 147 | def get_updates(self, learning_rate, params, grads, lr_scalers): 148 | """ 149 | get the updates (params, and velocity) 150 | """ 151 | # the initial velocity is zero. 
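        # The velocity buffers are allocated lazily, one shared variable
        # per parameter, the first time get_updates() is called.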
152 | if self._first_time: 153 | self.velocity = [ 154 | sharedX_mtx( 155 | param.get_value() * 0., 156 | name='vel_'+param.name, borrow=True) for param in params] 157 | 158 | updates = [] 159 | for (param, grad, vel, lr_sc) in zip( 160 | params, grads, self.velocity, lr_scalers): 161 | lr_scaled = learning_rate * lr_sc 162 | if self.imagenet: 163 | new_vel = self.momentum * vel -\ 164 | lr_scaled * self.imagenetDecay * param - lr_scaled * grad 165 | else: 166 | new_vel = self.momentum * vel - lr_scaled * grad 167 | 168 | updates.append((vel, new_vel)) 169 | inc = new_vel 170 | # this is the equivalence of NAG in [3].3.5, eq [7]. 171 | # It helps to avoid calculating the new grad(param+vel_(t-1)). 172 | # The only different from the paper is: momentum_(t) 173 | # which it's set to momentum_(t-1). If you develop the final inc, 174 | # you will find that it's equivalent to eq.[7] mentioned above. 175 | if self.nesterov_momentum: 176 | inc = self.momentum * new_vel - lr_scaled * grad 177 | 178 | new_param = param + inc 179 | if self.max_colm_norm and param.name in ["W", "w"]: 180 | new_param_final = norm_constraint(tensor_var=new_param, 181 | max_norm=self.max_norm) 182 | else: 183 | new_param_final = new_param 184 | updates.append((param, new_param_final)) 185 | 186 | # add the velocity updates to updates 187 | 188 | return updates 189 | 190 | 191 | class MomentumLinearAdjusterOverEpoch(object): 192 | """A callback to adjust linearly the momentum on each frequence (EPOCH). 193 | It adjusts the momentum based on the number of the epochs seen. 194 | 195 | Parameters: 196 | final_momentum: float 197 | The momentum coefficient to use at the end of the learning. 198 | start: int 199 | The epoch on wich to start growing the momentum. 200 | saturate: int 201 | The epoch on wich to momentum should reach its final value. 202 | 203 | """ 204 | def __init__(self, final_momentum, start, saturate): 205 | assert saturate >= start, "The momentum can not saturate before it "\ 206 | "starts increasing. Please set a saturation value higher than the"\ 207 | " start value." 208 | self._initialized = False 209 | self._count = 0 210 | self.saturate = saturate 211 | self.final_momentum = final_momentum 212 | self.start = start 213 | self.freq = 'epoch' # it works only on epochs 214 | self._first_time = True 215 | 216 | def __call__(self, learning_rule, seen_epochs): 217 | """Update the momentum according to the number of the epochs already 218 | seen. 219 | 220 | Parameters: 221 | trainingAlgorithm: instance of 222 | training_algorithm.trainingAlgorithm, 223 | the current algorithm used for training the model. 224 | """ 225 | # check 226 | if not hasattr(learning_rule, 'momentum'): 227 | raise ValueError( 228 | str(type(self))+' works only when the learning_rule ' 229 | 'specified in the training algorithm has the attribute ' 230 | '. For examples: "sarco.learning_rule.Momentum"') 231 | 232 | self._count = seen_epochs 233 | self._apply_momentum(learning_rule) 234 | 235 | def _apply_momentum(self, learning_rule): 236 | """Apply the momentum. 237 | """ 238 | 239 | momentum = learning_rule.momentum 240 | if not self._initialized: 241 | self._init_momentum = momentum.get_value() 242 | self._initialized = True 243 | momentum.set_value( 244 | np.cast[theano.config.floatX](self.get_current_momentum())) 245 | 246 | def get_current_momentum(self): 247 | """Return the current momentum with the desired schedule. 
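        The momentum grows linearly from its initial value at epoch
        `start` to `final_momentum` at epoch `saturate`:
            coef = clip((epoch - start) / (saturate - start), 0, 1)
            momentum = init_momentum * (1 - coef) + coef * final_momentum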
248 | 249 | """ 250 | w = self.saturate - self.start 251 | if w == 0: 252 | # saturate=start, jump straighforward to the final momentum value 253 | # if we exceeded the saturation, return the final momentum 254 | if self._count >= self.saturate: 255 | return self.final_momentum 256 | else: 257 | # else: (we didn't reach yet the saturation point), 258 | # return the initial momentum 259 | return self._init_momentum 260 | 261 | coef = float(self._count - self.start) / float(w) 262 | if coef < 0.: 263 | coef = 0. # no effect 264 | if coef > 1.: 265 | coef = 1. 266 | 267 | cu_m = self._init_momentum * (1 - coef) + coef * self.final_momentum 268 | 269 | return cu_m 270 | 271 | 272 | class AdaDelta(LearningRule): 273 | """Implement the ADADELTA algorithm of [1] to update the parameters 274 | of the model. 275 | Parameters: 276 | decay: float 277 | Decay rate in [1]. 278 | 279 | Caution: the parameter 'epsilon' in [1] is the learning rate. 280 | So It would be better to use a small learning rate 281 | (maybe fixed all the learning process [we will see] 282 | [1]:'AdaDelta: An Adaptive Learning Rate Method', Zeiler M. ) 283 | """ 284 | def __init__(self, decay=0.95, max_colm_norm=False, max_norm=15.0): 285 | assert decay >= 0., 'The decay parameter in ' + str(type(self)) +\ 286 | ' must be >= 0.' 287 | assert decay < 1., 'The decay parameter in ' + str(type(self)) +\ 288 | ' must be < 1.' 289 | self.decay = decay 290 | self._first_time = True 291 | self.mean_square_grad = None 292 | self.mean_squar_dx = None 293 | self.max_colm_norm = max_colm_norm 294 | self.max_norm = max_norm 295 | 296 | def get_updates(self, learning_rate, params, grads, lr_scalers): 297 | """Compute the AdaDelta updates of the model's parameters. 298 | 299 | param_t := param_(t-1) + AdaDelta_update_t 300 | """ 301 | if self._first_time: 302 | self.mean_square_grad = [ 303 | sharedX_mtx( 304 | param.get_value() * 0., 305 | name='mean_square_grad_'+param.name, 306 | borrow=True) for param in params] 307 | self.mean_squar_dx = [ 308 | sharedX_mtx( 309 | param.get_value() * 0., 310 | name='mean_square_dx_'+param.name, 311 | borrow=True) for param in params] 312 | self._first_time = False 313 | 314 | updates = [] 315 | for (param, grad, mean_square_grad, mean_squar_dx, lr_sc) in zip( 316 | params, grads, self.mean_square_grad, self.mean_squar_dx, 317 | lr_scalers): 318 | # Calculate the running average gradient: E[g^2]_t 319 | new_mean_square_grad = ( 320 | self.decay * mean_square_grad + (1 - self.decay) * T.sqr(grad)) 321 | 322 | # The update: delta_x_t 323 | lr_scaled = learning_rate * lr_sc 324 | epsilon = lr_scaled 325 | rms_dx_t_1 = T.sqrt(mean_squar_dx + epsilon) 326 | rms_grad_t = T.sqrt(new_mean_square_grad + epsilon) 327 | delta_x_t = - (rms_dx_t_1 / rms_grad_t) * grad 328 | # Compute: E[delta_x^2]_t 329 | new_mean_square_dx = ( 330 | self.decay * mean_squar_dx + 331 | (1 - self.decay) * T.sqr(delta_x_t)) 332 | 333 | # update the params 334 | new_param = param + delta_x_t 335 | # Send for the update 336 | updates.append((mean_square_grad, new_mean_square_grad)) 337 | updates.append((mean_squar_dx, new_mean_square_dx)) 338 | if self.max_colm_norm and param.name in ["W", "w"]: 339 | new_param_final = norm_constraint(tensor_var=new_param, 340 | max_norm=self.max_norm) 341 | else: 342 | new_param_final = new_param 343 | updates.append((param, new_param_final)) 344 | 345 | return updates 346 | 347 | 348 | class AdaGrad(LearningRule): 349 | """Implement the AdaGrad algorithm of [1] to update the parameters of 350 | the model. 
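    Each parameter accumulates the sum of its squared gradients,
        G_t = G_{t-1} + g_t ** 2,
    and is updated with
        delta_t = -(lr / sqrt(G_t)) * g_t,
    as computed in get_updates below.

    A minimal usage sketch (the names `lr`, `params`, `grads`, `cost`,
    `x` and `y` stand for the caller's own shared learning rate,
    parameter list, gradient list and Theano graph variables; they are
    not defined here):
        rule = AdaGrad()
        updates = rule.get_updates(lr, params, grads, [1.] * len(params))
        train_fn = theano.function([x, y], cost, updates=updates)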
351 | 352 | For more details on how to implement AdGrad, see [2], §2. 353 | [1]:'Adaptive subgradient methods for online learning and 354 | stochastic optimization.', Duchi et al. 355 | [2]:'Notes on AdaDrad', Chris Dyer. 356 | (link: http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf) 357 | 358 | """ 359 | def __init__(self, max_colm_norm=False, max_norm=15.0): 360 | self._first_time = True 361 | self.sum_square_grad = None 362 | self.max_colm_norm = max_colm_norm 363 | self.max_norm = max_norm 364 | 365 | def get_updates(self, learning_rate, params, grads, lr_scalers): 366 | """Compute the AdaDelta updates of the model's parameters. 367 | 368 | param_t := param_(t-1) + AdaDelta_update_t 369 | """ 370 | if self._first_time: 371 | self.sum_square_grad = [ 372 | sharedX_mtx( 373 | param.get_value() * 0., 374 | name='sum_square_grad_'+param.name, 375 | borrow=True) for param in params] 376 | self._first_time = False 377 | 378 | updates = [] 379 | for (param, grad, sum_square_grad, lr_sc) in zip( 380 | params, grads, self.sum_square_grad, lr_scalers): 381 | # Calculate the running average gradient: E[g^2]_t 382 | new_sum_square_grad = sum_square_grad + T.sqr(grad) 383 | 384 | # The update: delta_x_t 385 | lr_scaled = learning_rate * lr_sc 386 | epsilon = lr_scaled 387 | sqrt_sum_grad_t = T.sqrt(new_sum_square_grad) 388 | delta_x_t = - (epsilon / sqrt_sum_grad_t) * grad 389 | 390 | # update the params 391 | new_param = param + delta_x_t 392 | # Send for the update 393 | updates.append((sum_square_grad, new_sum_square_grad)) 394 | if self.max_colm_norm and param.name in ["W", "w"]: 395 | new_param_final = norm_constraint(tensor_var=new_param, 396 | max_norm=self.max_norm) 397 | else: 398 | new_param_final = new_param 399 | updates.append((param, new_param_final)) 400 | 401 | return updates 402 | 403 | 404 | class RMSProp(LearningRule): 405 | """Implements the RMSProp learning rule as described in [1]. 406 | 407 | The RMSProp rule was described in [1]. The idea is similar to the 408 | AdaDelta, 409 | which consists of dividing the learning rate for a weight by a running 410 | average of the magintudes of recent graidients of that weight. 411 | Parameters: 412 | decay: float 413 | Decay constant similar to the one used in AdaDelta, and Momentum. 414 | max_scaling: float 415 | Restrict the RMSProp gradient scaling coefficient to values below 416 | 'max_scaling' to avoid a learning rate too small (almost zero). 417 | 418 | [1]: 'Neural Networks for Machine Learning, Lecture 6a Overview of 419 | mini-­‐batch gradient descent', a lecture by Hinton et al. 420 | (http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) 421 | """ 422 | def __init__(self, decay=0.9, max_scaling=1e5, max_colm_norm=False, 423 | max_norm=15.0): 424 | assert 0. <= decay < 1., 'decay must be: 0. <= decay < 1' 425 | assert max_scaling > 0., 'max_scaling must be > 0.' 426 | self.decay = sharedX_value(decay, name='decay', borrow=True) 427 | self.epsilon = 1. / max_scaling 428 | self.mean_square_grads = None 429 | self._first_time = True 430 | self.max_colm_norm = max_colm_norm 431 | self.max_norm = max_norm 432 | 433 | def get_updates(self, learning_rate, params, grads, lr_scalers): 434 | """Compute the parameters' updates. 
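        Each parameter keeps a running average of its squared gradients,
            E[g^2]_t = decay * E[g^2]_{t-1} + (1 - decay) * g_t ** 2,
        and the update is
            delta_t = -lr * g_t / max(sqrt(E[g^2]_t), 1 / max_scaling).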
435 | 436 | """ 437 | if self._first_time: 438 | self.mean_square_grads = [ 439 | sharedX_mtx( 440 | param.get_value() * 0., 441 | name='mean_square_grad_'+param.name, 442 | borrow=True) for param in params] 443 | self._first_time = False 444 | updates = [] 445 | for (param, grad, mean_square_grad, lr_sc) in zip( 446 | params, grads, self.mean_square_grads, lr_scalers): 447 | new_mean_square_grad = ( 448 | self.decay * mean_square_grad + (1-self.decay) * T.sqr(grad)) 449 | # the update 450 | rms_grad_t = T.sqrt(new_mean_square_grad) 451 | rms_grad_t = T.maximum(rms_grad_t, self.epsilon) 452 | lr_scaled = learning_rate * lr_sc 453 | delta_x_t = - lr_scaled * grad / rms_grad_t 454 | 455 | new_param = param + delta_x_t 456 | # updates 457 | if self.max_colm_norm and param.name in ["W", "w"]: 458 | new_param_final = norm_constraint(tensor_var=new_param, 459 | max_norm=self.max_norm) 460 | else: 461 | new_param_final = new_param 462 | updates.append((param, new_param_final)) 463 | updates.append((mean_square_grad, new_mean_square_grad)) 464 | 465 | return updates 466 | 467 | 468 | class Adam(LearningRule): 469 | """ 470 | Implement Adaptive Moment Estimation. 471 | Adam updates implemented as in [1]_. 472 | Parameters: 473 | beta1 : float 474 | Exponential decay rate for the first moment estimates. 475 | beta2 : float 476 | Exponential decay rate for the second moment estimates. 477 | epsilon : float 478 | Constant for numerical stability. 479 | Credit: 480 | https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py 481 | References 482 | ---------- 483 | .. [1] Kingma, Diederik, and Jimmy Ba (2014): 484 | Adam: A Method for Stochastic Optimization. 485 | arXiv preprint arXiv:1412.6980. 486 | """ 487 | def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, 488 | max_colm_norm=False, max_norm=15.0): 489 | self.beta1 = sharedX_value(beta1, name='beta1', borrow=True) 490 | self.beta2 = sharedX_value(beta2, name='beta2', borrow=True) 491 | self.epsilon = sharedX_value(epsilon, name='epsilon', borrow=True) 492 | self.max_colm_norm = max_colm_norm 493 | self.max_norm = max_norm 494 | 495 | def get_updates(self, learning_rate, params, grads, lr_scalers): 496 | """Compute the parameters' updates. 497 | 498 | """ 499 | t_prev = theano.shared(floatX(0.)) 500 | updates = OrderedDict() 501 | 502 | # Using theano constant to prevent upcasting of float32 503 | one = T.constant(1) 504 | 505 | t = t_prev + 1 506 | a_t = learning_rate*T.sqrt(one-self.beta2**t)/(one-self.beta1**t) 507 | 508 | for param, g_t in zip(params, grads): 509 | value = param.get_value(borrow=True) 510 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 511 | broadcastable=param.broadcastable) 512 | v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 513 | broadcastable=param.broadcastable) 514 | 515 | m_t = self.beta1*m_prev + (one-self.beta1)*g_t 516 | v_t = self.beta2*v_prev + (one-self.beta2)*g_t**2 517 | step = a_t*m_t/(T.sqrt(v_t) + self.epsilon) 518 | 519 | updates[m_prev] = m_t 520 | updates[v_prev] = v_t 521 | new_param = param - step 522 | if self.max_colm_norm and param.name in ["W", "w"]: 523 | new_param_final = norm_constraint(tensor_var=new_param, 524 | max_norm=self.max_norm) 525 | else: 526 | new_param_final = new_param 527 | updates[param] = new_param_final 528 | 529 | updates[t_prev] = t 530 | 531 | return updates 532 | 533 | 534 | class Adamax(LearningRule): 535 | """ 536 | Adamax updates. 537 | Adamax updates implemented as in [1]_. 
This is a variant of of the Adam 538 | algorithm based on the infinity norm. 539 | Parameters: 540 | beta1 : float 541 | Exponential decay rate for the first moment estimates. 542 | beta2 : float 543 | Exponential decay rate for the weighted infinity norm estimates. 544 | epsilon : float 545 | Constant for numerical stability. 546 | Credit: 547 | https://github.com/Lasagne/Lasagne/blob/master/lasagne/updates.py 548 | 549 | References 550 | ---------- 551 | .. [1] Kingma, Diederik, and Jimmy Ba (2014): 552 | Adam: A Method for Stochastic Optimization. 553 | arXiv preprint arXiv:1412.6980. 554 | """ 555 | def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, 556 | max_colm_norm=False, max_norm=15.0): 557 | self.beta1 = sharedX_value(beta1, name='beta1', borrow=True) 558 | self.beta2 = sharedX_value(beta2, name='beta2', borrow=True) 559 | self.epsilon = sharedX_value(epsilon, name='epsilon', borrow=True) 560 | self.max_colm_norm = max_colm_norm 561 | self.max_norm = max_norm 562 | 563 | def get_updates(self, learning_rate, params, grads, lr_scalers): 564 | """Compute the parameters' updates. 565 | 566 | """ 567 | t_prev = theano.shared(floatX(0.)) 568 | updates = OrderedDict() 569 | 570 | # Using theano constant to prevent upcasting of float32 571 | one = T.constant(1) 572 | 573 | t = t_prev + 1 574 | a_t = learning_rate/(one-self.beta1**t) 575 | 576 | for param, g_t in zip(params, grads): 577 | value = param.get_value(borrow=True) 578 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 579 | broadcastable=param.broadcastable) 580 | u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 581 | broadcastable=param.broadcastable) 582 | 583 | m_t = self.beta1*m_prev + (one-self.beta1)*g_t 584 | u_t = T.maximum(self.beta2*u_prev, abs(g_t)) 585 | step = a_t*m_t/(u_t + self.epsilon) 586 | 587 | updates[m_prev] = m_t 588 | updates[u_prev] = u_t 589 | new_param = param - step 590 | if self.max_colm_norm and param.name in ["W", "w"]: 591 | new_param_final = norm_constraint(tensor_var=new_param, 592 | max_norm=self.max_norm) 593 | else: 594 | new_param_final = new_param 595 | updates[param] = new_param_final 596 | 597 | updates[t_prev] = t 598 | 599 | return updates 600 | -------------------------------------------------------------------------------- /mnist_manip.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.datasets import cifar10 3 | import numpy as np 4 | import os 5 | import sys 6 | import matplotlib.pyplot as plt 7 | import cPickle as pkl 8 | from tools import add_noise 9 | from tools import add_cifar_10 10 | from scipy import ndimage 11 | 12 | 13 | def repeat_it(x, y, nbr): 14 | out_x, out_y = None, None 15 | for i in range(nbr): 16 | gen = add_noise(x) 17 | if out_x is None: 18 | out_x = gen 19 | out_y = y 20 | else: 21 | out_x = np.vstack((out_x, gen)) 22 | out_y = np.hstack((out_y, y)) 23 | return out_x, out_y 24 | 25 | 26 | def repeat_it_cifar(x, y, nbr, x_cifar): 27 | out_x, out_y = None, None 28 | for i in range(nbr): 29 | gen = add_cifar_10(x, x_cifar) 30 | if out_x is None: 31 | out_x = gen 32 | out_y = y 33 | else: 34 | out_x = np.vstack((out_x, gen)) 35 | out_y = np.hstack((out_y, y)) 36 | return out_x, out_y 37 | 38 | # MNIST + noise 39 | #path_data = "./data/mnist.pkl" 40 | #f = open(path_data, 'r') 41 | #train, valid, test = pkl.load(f) 42 | #trainx, trainy = train[0], train[1] 43 | #validx, validy = valid[0], valid[1] 44 | #testx, testy = test[0], test[1] 45 | # 46 | ## random noise 47 
| #times_tr, times_vl, times_ts = 2, 2, 5 48 | # 49 | #trainx_noise, trainy_new = repeat_it(trainx, trainy, times_tr) 50 | #validx_noise, validy_new = repeat_it(validx, validy, times_vl) 51 | #testx_noise, testy_new = repeat_it(testx, testy, times_ts) 52 | # 53 | #stuff = [(trainx_noise, trainy_new), (validx_noise, validy_new), 54 | # (testx_noise, testy_new)] 55 | #with open("./data/mnist_noise.pkl", "w") as f: 56 | # pkl.dump(stuff, f) 57 | #path = "./data/mnist_noise/" 58 | #if not os.path.exists(path): 59 | # os.makedirs(path) 60 | #for k in range(trainx_noise.shape[0]): 61 | # fig = plt.figure() 62 | # plt.imshow(trainx_noise[k].reshape(28, 28), cmap='gray') 63 | # fig.savefig(path + str(k) + ".png") 64 | # # blurred 65 | # if k == 10: 66 | # break 67 | 68 | 69 | # MNIST + cifar 10. 70 | path_data = "./data/mnist.pkl" 71 | f = open(path_data, 'r') 72 | train, valid, test = pkl.load(f) 73 | trainx, trainy = train[0], train[1] 74 | validx, validy = valid[0], valid[1] 75 | testx, testy = test[0], test[1] 76 | (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = cifar10.load_data() 77 | 78 | # random noise 79 | times_tr, times_vl, times_ts = 2, 2, 5 80 | 81 | trainx_noise, trainy_new = repeat_it_cifar(trainx, trainy, times_tr, x_train_cifar[:40000]) 82 | validx_noise, validy_new = repeat_it_cifar(validx, validy, times_vl, x_train_cifar[40000:]) 83 | testx_noise, testy_new = repeat_it_cifar(testx, testy, times_ts, x_test_cifar) 84 | 85 | stuff = [(trainx_noise, trainy_new), (validx_noise, validy_new), 86 | (testx_noise, testy_new)] 87 | with open("./data/mnist_img.pkl", "w") as f: 88 | pkl.dump(stuff, f) 89 | path = "./data/mnist_img/" 90 | if not os.path.exists(path): 91 | os.makedirs(path) 92 | for k in range(trainx_noise.shape[0]): 93 | fig = plt.figure() 94 | plt.imshow(trainx_noise[k].reshape(28, 28), cmap='gray') 95 | fig.savefig(path + str(k) + ".png") 96 | # blurred 97 | if k == 10: 98 | break -------------------------------------------------------------------------------- /non_linearities.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | def relu(x): 5 | return T.switch(x > 0, x, 0) 6 | 7 | 8 | class NonLinearity: 9 | RELU = "rectifier" 10 | TANH = "tanh" 11 | SIGMOID = "sigmoid" 12 | SOFTMAX = "softmax" 13 | 14 | 15 | def softmax(x): 16 | return T.exp(x)/(T.exp(x).sum(1, keepdims=True)) 17 | 18 | 19 | def get_non_linearity_fn(nonlinearity): 20 | if nonlinearity == NonLinearity.SIGMOID: 21 | return T.nnet.sigmoid 22 | elif nonlinearity == NonLinearity.RELU: 23 | return relu 24 | elif nonlinearity == NonLinearity.TANH: 25 | return T.tanh 26 | elif nonlinearity == NonLinearity.SOFTMAX: 27 | return softmax # T.nnet.softmax 28 | elif nonlinearity is None: 29 | return None 30 | 31 | 32 | def get_non_linearity_str(nonlinearity): 33 | if nonlinearity == T.nnet.sigmoid: 34 | return NonLinearity.SIGMOID 35 | elif nonlinearity == relu: 36 | return NonLinearity.RELU 37 | elif nonlinearity == T.tanh: 38 | return NonLinearity.TANH 39 | elif nonlinearity == T.nnet.softmax: 40 | return None # we do not use any non-linearity. 41 | elif nonlinearity == softmax: 42 | return None # we do not use any non-linearity. 
43 | elif nonlinearity is None: 44 | return None 45 | else: 46 | raise ValueError("Unknown non-linearity") 47 | 48 | 49 | class CostType: 50 | MeanSquared = "MeanSquaredCost" 51 | CrossEntropy = "CrossEntropy" 52 | NegativeLogLikelihood = "NegativelogLikelihood" 53 | -------------------------------------------------------------------------------- /normalization.py: -------------------------------------------------------------------------------- 1 | # Based on: https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/ 2 | # normalization.py#L120-L320 3 | import theano 4 | import numpy as np 5 | from theano import tensor as T 6 | 7 | 8 | class BatchNormLayer(object): 9 | """ Implementation of batch normalization from the paper: 10 | Ioffe, Sergey and Szegedy, Christian (2015): 11 | Batch Normalization: Accelerating Deep Network Training by Reducing 12 | Internal Covariate Shift. http://arxiv.org/abs/1502.03167. 13 | """ 14 | def __init__(self, input_shape, axes='auto', epsilon=1e-4, alpha=0.1, 15 | beta=0., gamma=0, mean=0, inv_std=1): 16 | self.input_shape = input_shape 17 | if axes == 'auto': 18 | # default normalizationover lla but the not the second axis. 19 | axes = (0,) + tuple(range(2, len(self.input_shape))) 20 | elif isinstance(axes, int): 21 | axes = (axes,) 22 | self.axes = axes 23 | self.epsilon = epsilon 24 | self.alpha = alpha 25 | 26 | # create params 27 | shape = [size for axis, size in enumerate(self.input_shape) 28 | if axis not in self.axes] 29 | if any(size is None for size in shape): 30 | raise ValueError("BatchNormLayer needs specified input shape for " 31 | "all axes not normalized over.") 32 | if beta is None: 33 | self.beta = 0. 34 | else: 35 | value = np.ones(shape, dtype=theano.config.floatX) * beta 36 | self.beta = theano.shared(value=value.astype(theano.config.floatX), 37 | name="beta", borrow=True) 38 | 39 | if gamma is None: 40 | self.gamma = 0. 41 | else: 42 | value = np.ones(shape, dtype=theano.config.floatX) * gamma 43 | self.gamma = theano.shared( 44 | value=value.astype(theano.config.floatX), 45 | name="gamma", borrow=True) 46 | 47 | value = np.ones(shape, dtype=theano.config.floatX) * mean 48 | self.mean = theano.shared(value=value.astype(theano.config.floatX), 49 | name="mean", borrow=True) 50 | 51 | value = np.ones(shape, dtype=theano.config.floatX) * inv_std 52 | self.inv_std = theano.shared(value=value.astype(theano.config.floatX), 53 | name="inv_std", borrow=True) 54 | self.params = [self.beta, self.gamma] 55 | self.stats = [self.mean, self.inv_std] 56 | 57 | def get_output_for(self, input, deterministic=False, 58 | batch_norm_use_averages=None, 59 | batch_norm_update_averages=None): 60 | input_mean = input.mean(self.axes) 61 | input_inv_std = T.inv(T.sqrt(input.var(self.axes) + self.epsilon)) 62 | 63 | # decide whether to use the sotred averages or mini-batch statistics 64 | if batch_norm_use_averages is None: 65 | batch_norm_use_averages = deterministic 66 | use_averages = batch_norm_use_averages 67 | 68 | if use_averages: 69 | mean = self.mean 70 | inv_std = self.inv_std 71 | else: 72 | mean = input_mean 73 | inv_std = input_inv_std 74 | 75 | # decide whether to update the stored averages 76 | if batch_norm_update_averages is None: 77 | batch_norm_update_averages = not deterministic 78 | update_averages = batch_norm_update_averages 79 | 80 | if update_averages: 81 | # Trick: To update the stored statistics, we create memory-aliased 82 | # clones of the stored statistics. 
83 | running_mean = theano.clone(self.mean, share_inputs=False) 84 | running_inv_std = theano.clone(self.inv_std, share_inputs=False) 85 | # set a default update for them 86 | running_mean.default_update = ((1 - self.alpha) * running_mean + 87 | self.alpha * input_mean) 88 | running_inv_std.default_update = ((1 - self.alpha) * 89 | running_inv_std + 90 | self.alpha * input_inv_std) 91 | # and make sure they end up in the graph without participating in 92 | # the computation (this way their default_update will be collected 93 | # and applied, but the computation will be optimized away): 94 | mean += 0 * running_mean 95 | inv_std += 0 * running_inv_std 96 | # prepare dimshuffle pattern inserting broadcastable axes as needed 97 | param_axes = iter(range(input.ndim - len(self.axes))) 98 | pattern = ['x' if input_axis in self.axes 99 | else next(param_axes) 100 | for input_axis in range(input.ndim)] 101 | 102 | # apply dimshuffle pattern to all parameters 103 | beta = 0 if self.beta is None else self.beta.dimshuffle(pattern) 104 | gamma = 1 if self.gamma is None else self.gamma.dimshuffle(pattern) 105 | mean = mean.dimshuffle(pattern) 106 | inv_std = inv_std.dimshuffle(pattern) 107 | 108 | # normalize 109 | normalized = (input - mean) * (gamma * inv_std) + beta 110 | return normalized 111 | -------------------------------------------------------------------------------- /outputjobs/.readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the output of the [Slurm](https://slurm.schedmd.com/) jobs. -------------------------------------------------------------------------------- /p100.sl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Slurm submission script, 4 | # GPU job 5 | # CRIHAN v 1.00 - Jan 2017 6 | # support@criann.fr 7 | 8 | # Not shared resources 9 | #SBATCH --share 10 | 11 | # Job name 12 | #SBATCH -J "lenet" 13 | 14 | # Batch output file 15 | #SBATCH --output ./outputjobs/lenet.o%J 16 | 17 | # Batch error file 18 | #SBATCH --error ./outputjobs/lenet.e%J 19 | 20 | # GPUs architecture and number 21 | # ---------------------------- 22 | # Partition (submission class) 23 | #SBATCH --partition gpu_p100 24 | 25 | # GPUs per compute node 26 | # gpu:4 (maximum) for gpu_k80 27 | # gpu:2 (maximum) for gpu_p100 28 | #SBATCH --gres gpu:1 29 | # ---------------------------- 30 | 31 | # Job time (hh:mm:ss) 32 | #SBATCH --time 24:00:00 33 | 34 | # MPI task maximum memory (MB) 35 | #SBATCH --mem-per-cpu 3000 36 | # ---------------------------- 37 | 38 | #SBATCH --mail-type ALL 39 | # User e-mail address 40 | #SBATCH --mail-user soufiane.belharbi@insa-rouen.fr 41 | 42 | # environments 43 | # --------------------------------- 44 | module load cuda/8.0 45 | module load python/2.7.12 46 | # --------------------------------- 47 | 48 | cd $LOCAL_WORK_DIR/workspace/code/class-invariance-hint/ 49 | 50 | # THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python trainLenet.py lenet_0_1000_3_0_0_1_0_0_True_False_False_False_False.yaml 51 | 52 | -------------------------------------------------------------------------------- /plot_paper.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import cPickle as pkl 4 | from keras.datasets import cifar10 5 | from tools import add_noise 6 | from tools import add_cifar_10 7 | import copy 8 | 9 | 10 | path_data = "./data/out.pkl" 11 | f = open(path_data, 
'r') 12 | train, valid, test = pkl.load(f) 13 | trainx, trainy = train[0], train[1] 14 | print trainy 15 | (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = cifar10.load_data() 16 | # with open("./data/cifar10_data.pkl", 'w') as fx: 17 | # pkl.dump(((x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar)), fx) 18 | #with open("./data/cifar10_data.pkl", 'r') as fx: 19 | # (x_train_cifar, y_train_cifar), (x_test_cifar, y_test_cifar) = pkl.load(fx) 20 | 21 | trainx_noise = add_noise(copy.deepcopy(trainx)) 22 | ind = [0, 1, 6, 3, 7] 23 | trainx_img = add_cifar_10(copy.deepcopy(trainx), x_train_cifar[ind], sh=False) 24 | for k in range(trainx_img.shape[0]): 25 | fig = plt.figure() 26 | plt.imshow(trainx_img[k].reshape(28, 28), cmap='gray') 27 | fig.savefig("./data/"+ str(k) + ".png") 28 | x = np.vstack((trainx, trainx_noise, trainx_img)) 29 | # Plot 30 | fig, axes = plt.subplots(3, 5, figsize=(12, 6), 31 | subplot_kw={'xticks': [], 'yticks': []}) 32 | 33 | 34 | i = 0 35 | for ax in axes.flat: 36 | print x_train_cifar.shape 37 | # img = x_train_cifar[i, :, :, 0].reshape(32, 32) 38 | img = x[i, :].reshape(28, 28) 39 | ax.imshow(img, cmap='gray', interpolation="bilinear") 40 | ax.set_aspect("auto") 41 | ax.set_xticklabels([]) 42 | ax.set_yticklabels([]) 43 | i += 1 44 | 45 | fig.subplots_adjust(hspace=0.01, wspace=0.01) 46 | fig.savefig("./data/samples.eps", format="eps", dpi=300) 47 | -------------------------------------------------------------------------------- /submit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | sbatch ./jobs/10_1000_0.sl 3 | sbatch ./jobs/10_1000_1.sl 4 | sbatch ./jobs/10_1000_2.sl 5 | sbatch ./jobs/10_1000_3.sl 6 | sbatch ./jobs/10_1000_4.sl 7 | sbatch ./jobs/10_1000_5.sl 8 | sbatch ./jobs/10_1000_6.sl 9 | -------------------------------------------------------------------------------- /train3_bin.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import numpy as np 3 | import theano.tensor as T 4 | import os 5 | import sys 6 | import datetime as DT 7 | import shutil 8 | import inspect 9 | import theano 10 | import warnings 11 | import yaml 12 | 13 | from tools import ModelMLP 14 | from tools import NonLinearity 15 | from tools import split_data_to_minibatchs_eval 16 | from tools import sharedX_value 17 | from tools import theano_fns 18 | from learning_rule import AdaDelta 19 | from learning_rule import RMSProp 20 | from learning_rule import Momentum 21 | from tools import evaluate_model 22 | from tools import collect_stats_epoch 23 | from tools import plot_stats 24 | from tools import train_one_epoch 25 | from tools import train_one_epoch_alter 26 | from tools import to_categorical 27 | from tools import plot_classes 28 | from tools import chunks 29 | from tools import plot_penalty_vl 30 | from tools import plot_debug_grad 31 | from tools import plot_debug_ratio_grad 32 | 33 | 34 | # Parse the yaml config. 35 | config_path = "./config_yaml/" 36 | with open(config_path + sys.argv[1], 'r') as fy: 37 | config_exp = yaml.load(fy) 38 | 39 | x_classes = 2 40 | 41 | cs = [1, 7] 42 | debug_code = config_exp["debug_code"] 43 | 44 | if debug_code: 45 | warnings.warn("YOU ARE IN DEBUG MODE! 
YOUR CODE WILL TAKE MORE TIME!!!!!") 46 | 47 | 48 | def standerize(d, mu=None, sigma=None): 49 | if mu is None: 50 | mu = np.mean(d, axis=0) 51 | sigma = np.std(d, axis=0) 52 | if sigma.nonzero()[0].shape[0] == 0: 53 | raise Exception("std found to be zero!!!!") 54 | norm_d = (d - mu) / sigma 55 | 56 | return norm_d, mu, sigma 57 | 58 | 59 | def get_class_c(x, y, c, nbr): 60 | ind = np.argwhere(y == c) 61 | x_out = x[ind] 62 | y_out = y[ind] 63 | x_out = x_out.reshape(x_out.shape[0], x.shape[1]) 64 | y_out = y_out.reshape(y_out.shape[0],) 65 | 66 | if nbr is not None: 67 | x_out = x_out[:nbr, :] 68 | y_out = y_out[:nbr] 69 | return x_out, y_out 70 | 71 | 72 | def get_data(cs, x, y, nbr, shuffle=False): 73 | datax, datay = None, None 74 | for c in cs: 75 | xx, yy = get_class_c(x, y, c, nbr) 76 | 77 | if datax is None: 78 | datax = xx 79 | datay = yy 80 | else: 81 | datax = np.vstack((datax, xx)) 82 | datay = np.hstack((datay, yy)) 83 | # suffle 84 | if shuffle: 85 | megaxy = np.hstack((datax, datay.reshape(datay.size, 1))) 86 | for i in range(100): 87 | np.random.shuffle(megaxy) 88 | datax = megaxy[:, :-1] 89 | datay = megaxy[:, -1] 90 | else: 91 | return datax, datay 92 | return datax, datay 93 | 94 | 95 | def rename_classes(y): 96 | un = np.unique(y) 97 | un = np.array(sorted(un)) 98 | y_out = y * 0 99 | for u, re in zip(un, range(un.size)): 100 | ind = np.argwhere(y == u) 101 | y_out[ind] = re 102 | return y_out 103 | 104 | 105 | def get_inter_output(model, l_tst, testx_sh): 106 | i_x_vl = T.lvector("ixtst") 107 | 108 | eval_fn_tst = theano.function( 109 | [i_x_vl], 110 | [l.output for l in model.layers], 111 | givens={model.x: testx_sh[i_x_vl]}) 112 | output_v = [ 113 | eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 114 | nbr_layers = len(output_v[0]) 115 | 116 | l_val = [] 117 | for l in range(nbr_layers): 118 | tmp = None 119 | for k in output_v: 120 | if tmp is None: 121 | tmp = k[l] 122 | else: 123 | tmp = np.vstack((tmp, k[l])) 124 | l_val.append(tmp) 125 | 126 | return l_val 127 | 128 | 129 | def generate_2d_checkboard(x_born, y_born, s, ss): 130 | """x_born: [-1, 1], y_born:[-1, 1], s=10, ss=20 131 | """ 132 | linex = np.linspace(x_born[0], x_born[1], s, endpoint=False) 133 | liney = np.linspace(y_born[0], y_born[1], s, endpoint=False) 134 | x, y = [], [] 135 | start_y = True 136 | for ix in range(linex.size - 1): 137 | lx, lxnext = linex[ix], linex[ix+1] 138 | for iy in range(liney.size - 1): 139 | ly, lynext = liney[iy], liney[iy+1] 140 | linexx = np.linspace(lx, lxnext, ss, endpoint=False) 141 | lineyy = np.linspace(ly, lynext, ss, endpoint=False) 142 | xv, yv = np.meshgrid(linexx, lineyy) 143 | for i in range(xv.shape[0]): 144 | for j in range(yv.shape[0]): 145 | x.append([xv[i, j], yv[i, j]]) 146 | y.append(start_y) 147 | start_y = not start_y 148 | 149 | y = np.array(y) * 1. 150 | x = np.array(x) 151 | mega = np.hstack((x, y.reshape(y.size, 1))) 152 | for i in range(500): 153 | np.random.shuffle(mega) 154 | print i 155 | x = mega[:, :-1] 156 | y = mega[:, -1] 157 | print x.shape, y.shape 158 | fig = plot_classes(y, x, "", 0., "generated 2D: checkboard.") 159 | fig.savefig("data/2d/cb2d_generated.png", bbox_inches='tight') 160 | return x, y 161 | 162 | 163 | def create_tr_vl_ts_cb(path): 164 | if not os.path.exists(path): 165 | os.makedirs(path) 166 | x, y = generate_2d_checkboard([-1, 1], [-1, 1], 10, 20) 167 | nbr = x.shape[0] 168 | l1 = int(nbr*2/3.) 169 | l2 = int(nbr * ((2/3.) 
+ 1/6.)) 170 | trainx, trainy = x[:l1, :], y[:l1] 171 | validx, validy = x[l1:l2, :], y[l1:l2] 172 | testx, testy = x[l2:, :], y[l2:] 173 | trfig = plot_classes(trainy, trainx, "", 0., "g.tr 2D: checkboard.") 174 | vlfig = plot_classes(validy, validx, "", 0., "g.vl 2D: checkboard.") 175 | tsfig = plot_classes(testy, testx, "", 0., "g.tst 2D: checkboard.") 176 | trfig.savefig(path + "/traingfig.png", bbox_inches='tight') 177 | vlfig.savefig(path + "/validfig.png", bbox_inches='tight') 178 | tsfig.savefig(path + "/testfig.png", bbox_inches='tight') 179 | # dump 180 | with open(path+"/cb.pkl", "w") as f: 181 | stuff = {"trainx": trainx, "trainy": trainy, 182 | "validx": validx, "validy": validy, 183 | "testx": testx, "testy": testy} 184 | pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 185 | 186 | # create_tr_vl_ts_cb("data/2d") 187 | 188 | 189 | def generate_2d_data_bin(nbr, mn1, cov1, mn2, cov2): 190 | """Generate 2D points using multivariate normal distribution. 191 | nbr: number of samples per class.""" 192 | x1 = np.random.multivariate_normal(mn1, cov1, nbr) 193 | x2 = np.random.multivariate_normal(mn2, cov2, nbr) 194 | y1 = np.zeros((nbr, 1), dtype=np.float32) 195 | y2 = np.ones((nbr, 1), dtype=np.float32) 196 | x = np.vstack((x1, x2)) 197 | y = np.vstack((y1, y2)) 198 | print x.shape, y.shape 199 | mega = np.hstack((x, y.reshape(y.size, 1))) 200 | for i in range(100): 201 | np.random.shuffle(mega) 202 | x = mega[:, :-1] 203 | y = mega[:, -1] 204 | fig = plot_classes(y, x, "", 0., "generated 2D: multivariate normal.") 205 | return x, y, fig 206 | 207 | 208 | def generate_all_2d_data(path): 209 | if not os.path.exists(path): 210 | os.makedirs(path) 211 | mn1, cov1 = [1, 0], [[1, -0.5], [-0.5, 1]] 212 | mn2, cov2 = [4, 0], [[1, 0], [0, 1]] 213 | trainx, trainy, trainfig = generate_2d_data_bin(25000, mn1, 214 | cov1, mn2, cov2) 215 | minx = np.min(trainx, axis=0) 216 | maxx = np.max(trainx, axis=0) 217 | 218 | trainx = (trainx - minx)/(maxx - minx) 219 | trainfig = plot_classes(trainy, trainx, "", 0., 220 | "generated 2D: multivariate normal.") 221 | validx, validy, validfig = generate_2d_data_bin(5000, mn1, 222 | cov1, mn2, cov2) 223 | validx = (validx - minx)/(maxx - minx) 224 | validfig = plot_classes(trainy, trainx, "", 0., 225 | "generated 2D: multivariate normal.") 226 | testx, testy, testfig = generate_2d_data_bin(5000, mn1, 227 | cov1, mn2, cov2) 228 | trainfig.savefig(path + "/traingfig.png", bbox_inches='tight') 229 | validfig.savefig(path + "/validfig.png", bbox_inches='tight') 230 | testfig.savefig(path + "/testfig.png", bbox_inches='tight') 231 | 232 | 233 | def generate_nested_circles(n): 234 | limits = [0, 1./3, 2./3, 1, 2] 235 | np.random.seed(0) 236 | X = np.random.rand(n, 2)*2-1 237 | Xd = np.sqrt((X**2).sum(axis=1)) 238 | Y = np.zeros((n, ), dtype='bool') 239 | classe = True 240 | for b1, b2 in zip(limits[:-1], limits[1:]): 241 | (idx, ) = np.nonzero(np.logical_and(b1 < Xd, Xd <= b2)) 242 | Y[idx] = classe 243 | classe = not classe 244 | Y = Y.astype(np.float32) 245 | mega = np.hstack((X, Y.reshape(Y.size, 1))) 246 | for i in range(500): 247 | np.random.shuffle(mega) 248 | print i 249 | x = mega[:, :-1] 250 | y = mega[:, -1] 251 | print x.shape, y.shape 252 | fig = plot_classes(y, x, "", 0., "generated 2D: nested circles.") 253 | fig.savefig("data/nestedcircle/nc_generated.png", bbox_inches='tight') 254 | return x, y 255 | 256 | 257 | def create_tr_vl_ts_nc(path, n): 258 | """nested circles""" 259 | if not os.path.exists(path): 260 | os.makedirs(path) 261 | x, y = 
generate_nested_circles(n) 262 | nbr = x.shape[0] 263 | l1 = int(nbr*2/3.) 264 | l2 = int(nbr * ((2/3.) + 1/6.)) 265 | trainx, trainy = x[:l1, :], y[:l1] 266 | validx, validy = x[l1:l2, :], y[l1:l2] 267 | testx, testy = x[l2:, :], y[l2:] 268 | trfig = plot_classes( 269 | trainy, trainx, "", 0., "g.tr 2D: nested circles." + str(l1)) 270 | vlfig = plot_classes( 271 | validy, validx, "", 0., "g.vl 2D: nested circles." + str(l2-l1)) 272 | tsfig = plot_classes( 273 | testy, testx, "", 0., "g.tst 2D: nested circles." + str(y.size - l2)) 274 | trfig.savefig(path + "/traingfig.png", bbox_inches='tight') 275 | vlfig.savefig(path + "/validfig.png", bbox_inches='tight') 276 | tsfig.savefig(path + "/testfig.png", bbox_inches='tight') 277 | # dump 278 | with open(path+"/nc.pkl", "w") as f: 279 | stuff = {"trainx": trainx, "trainy": trainy, 280 | "validx": validx, "validy": validy, 281 | "testx": testx, "testy": testy} 282 | pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 283 | 284 | 285 | # def knn1(model, l_tst, testx_sh, l_tr, trainx_sh): 286 | 287 | # create_tr_vl_ts_nc("data/nestedcircle", 50000) 288 | 289 | # DATA MNIST 290 | # ============================================================================= 291 | path_data = "data/mnist.pkl" 292 | f = open(path_data, 'r') 293 | train, valid, test = pkl.load(f) 294 | trainx, trainy = train[0], train[1] 295 | validx, validy = valid[0], valid[1] 296 | testx, testy = test[0], test[1] 297 | 298 | # How much to take for training? 299 | nbr_sup = config_exp["nbr_sup"] 300 | run = config_exp["run"] 301 | print "RUN:", run 302 | print "SUP: ", nbr_sup 303 | # get the data of each class 304 | # train 305 | nbx = nbr_sup / len(cs) 306 | trainx, trainy = get_data(cs, trainx, trainy, nbx, True) 307 | validx, validy = get_data(cs, validx, validy, None, False) 308 | testx, testy = get_data(cs, testx, testy, None, False) 309 | 310 | # convert the name of the classes from the ream name to: 0, 1, 2, ... 311 | testy_int = testy 312 | trainy = rename_classes(trainy) 313 | validy = rename_classes(validy) 314 | testy = rename_classes(testy) 315 | with open("data/mnist_bin17.pkl", "w") as f17: 316 | stuff = ((trainx, trainy), (validx, validy), (testx, testy)) 317 | print trainx.shape, validx.shape, testx.shape 318 | pkl.dump(stuff, f17) 319 | sys.exit() 320 | # ============================================================================= 321 | 322 | # DATA MNIST -- end 323 | 324 | # DATA CHECKBOARD 325 | #============================================================================== 326 | # p_data = "nestedcircle" 327 | # path_data = "data/" + p_data + "/nc.pkl" 328 | # f = open(path_data, 'r') 329 | # stuff = pkl.load(f) 330 | # trainx, trainy = stuff["trainx"], stuff["trainy"] 331 | # validx, validy = stuff["validx"], stuff["validy"] 332 | # testx, testy = stuff["testx"], stuff["testy"] 333 | # 334 | # 335 | # # How much to take for training? 336 | # nbr_sup = int(sys.argv[6]) 337 | # run = int(sys.argv[7]) 338 | # print "RUN:", run 339 | # print "SUP: ", nbr_sup 340 | # trainx, trainy = trainx[:nbr_sup, :], trainy[:nbr_sup] 341 | # trfig = plot_classes( 342 | # trainy, trainx, "", 0., "tr 2D: nested circles." 
+ str(nbr_sup)) 343 | # trfig.savefig("data/" + p_data + "/trfig_" + str(nbr_sup) + ".png", 344 | # bbox_inches='tight') 345 | # 346 | # testy_int = testy 347 | #============================================================================== 348 | 349 | # DATA CHECKBOARD --end 350 | 351 | # Prepare the pre-shuffling 352 | if not os.path.exists("data/" + str(nbr_sup)): 353 | os.makedirs("data/" + str(nbr_sup)) 354 | trainx_tmp = trainx 355 | trainy_tmp = trainy 356 | 357 | big_mtx = np.hstack((trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 358 | print "Going to shuffle the train data. It takes some time ..." 359 | period = 200 360 | i = 0 361 | #for k in xrange(5000): 362 | # np.random.shuffle(big_mtx) 363 | # if k % period == 0: 364 | # trainx_tmp2 = big_mtx[:, 0:trainx_tmp.shape[1]] 365 | # trainy_tmp2 = big_mtx[:, -1] 366 | # stuff = {"x": trainx_tmp2, "y": trainy_tmp2} 367 | # print k 368 | # with open("data/"+str(nbr_sup) + "/" + str(i) + ".pkl", 'w') as f: 369 | # pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 370 | # i += 1 371 | 372 | #with open("data/"+str(nbr_sup) + "/0.pkl") as f: 373 | # stuff = pkl.load(f) 374 | # trainx, trainy = stuff["x"], stuff["y"] 375 | # share over gpu: we can store the whole mnist over the gpu. 376 | # Train 377 | trainx_sh = theano.shared(trainx.astype(theano.config.floatX), 378 | name="trainx", borrow=True) 379 | trainlabels_sh = theano.shared(trainy.astype(theano.config.floatX), 380 | name="trainlabels", borrow=True) 381 | trainy_sh = theano.shared(to_categorical(trainy, x_classes).astype( 382 | theano.config.floatX), name="trainy", borrow=True) 383 | 384 | # valid 385 | validx_sh = theano.shared(validx.astype(theano.config.floatX), 386 | name="validx", borrow=True) 387 | validlabels_sh = theano.shared(validy.astype(theano.config.floatX), 388 | name="validlabels", borrow=True) 389 | # 390 | input = T.fmatrix("x") 391 | input1 = T.fmatrix("x1") 392 | input2 = T.fmatrix("x2") 393 | rng = np.random.RandomState(23455) 394 | # Architecture 395 | nhid_l0 = 300 396 | nhid_l1 = 200 397 | nhid_l2 = 100 398 | 399 | nbr_classes = x_classes 400 | h_ind = config_exp["h_ind"] 401 | h_ind = [int(tt) for tt in h_ind] 402 | 403 | assert len(h_ind) == 4 404 | h0, h1, h2, h3, h4, h5, h6, h7, h8 = None, None, None, None, None, None, None,\ 405 | None, None 406 | l_v = [] 407 | for xx in h_ind: 408 | print xx 409 | if int(xx) == 1: 410 | l_v.append(True) 411 | elif int(xx) == 0: 412 | l_v.append(False) 413 | else: 414 | raise ValueError("Error in applying hint: 0/1") 415 | 416 | hint_type = "l2mean" # "l1mean" 417 | print l_v 418 | corrupt_input_l = config_exp["corrupt_input_l"] 419 | if corrupt_input_l != 0.: 420 | warnings.warn( 421 | "YOU ASKED TO USE DENOISING PROCESS OVER THE INPUTS OF THE FIRST LAYER" 422 | ) 423 | if not config_exp["hint"]: 424 | raise ValueError( 425 | "You asked for densoing process but you are not using the penalty") 426 | start_corrupting = config_exp["start_corrupting"] 427 | warnings.warn( 428 | "CORRUPTION WILL START AFTER:" + str(start_corrupting) + " epochs!!!!!!") 429 | use_sparsity = config_exp["use_sparsity"] 430 | use_sparsity_in_pred = config_exp["use_sparsity_in_pred"] 431 | print "Use sparsity: ", use_sparsity 432 | print "Use sparsity in pred:", use_sparsity_in_pred 433 | layer0 = { 434 | "rng": rng, 435 | "n_in": trainx.shape[1], 436 | "n_out": nhid_l0, 437 | "W": None, 438 | "b": None, 439 | "activation": NonLinearity.SIGMOID, 440 | "hint": hint_type, 441 | "use_hint": l_v[0], 442 | "intended_to_be_corrupted": True, 443 
| "corrupt_input_l": corrupt_input_l, 444 | "use_sparsity": use_sparsity, 445 | "use_sparsity_in_pred": use_sparsity_in_pred 446 | } 447 | 448 | layer1 = { 449 | "rng": rng, 450 | "n_in": nhid_l0, 451 | "n_out": nhid_l1, 452 | "W": None, 453 | "b": None, 454 | "activation": NonLinearity.SIGMOID, 455 | "hint": hint_type, 456 | "use_hint": l_v[1], 457 | "use_sparsity": use_sparsity, 458 | "use_sparsity_in_pred": use_sparsity_in_pred 459 | } 460 | 461 | layer2 = { 462 | "rng": rng, 463 | "n_in": nhid_l1, 464 | "n_out": nhid_l2, 465 | "W": None, 466 | "b": None, 467 | "activation": NonLinearity.SIGMOID, 468 | "hint": hint_type, 469 | "use_hint": l_v[2], 470 | "use_sparsity": use_sparsity, 471 | "use_sparsity_in_pred": use_sparsity_in_pred 472 | } 473 | 474 | #layer3 = { 475 | # "rng": rng, 476 | # "n_in": nhid_l2, 477 | # "n_out": nhid_l3, 478 | # "W": None, 479 | # "b": None, 480 | # "activation": NonLinearity.SIGMOID, 481 | # "hint": l_v[3] 482 | # } 483 | # 484 | #layer4 = { 485 | # "rng": rng, 486 | # "n_in": nhid_l3, 487 | # "n_out": nhid_l4, 488 | # "W": None, 489 | # "b": None, 490 | # "activation": NonLinearity.SIGMOID, 491 | # "hint": l_v[4] 492 | # } 493 | # 494 | #layer5 = { 495 | # "rng": rng, 496 | # "n_in": nhid_l4, 497 | # "n_out": nhid_l5, 498 | # "W": None, 499 | # "b": None, 500 | # "activation": NonLinearity.SIGMOID, 501 | # "hint": l_v[5] 502 | # } 503 | # 504 | #layer6 = { 505 | # "rng": rng, 506 | # "n_in": nhid_l5, 507 | # "n_out": nhid_l6, 508 | # "W": None, 509 | # "b": None, 510 | # "activation": NonLinearity.SIGMOID, 511 | # "hint": l_v[6] 512 | # } 513 | # 514 | #layer7 = { 515 | # "rng": rng, 516 | # "n_in": nhid_l6, 517 | # "n_out": nhid_l7, 518 | # "W": None, 519 | # "b": None, 520 | # "activation": NonLinearity.SIGMOID, 521 | # "hint": l_v[7] 522 | # } 523 | 524 | output_layer = { 525 | "rng": rng, 526 | "n_in": nhid_l2, 527 | "n_out": nbr_classes, 528 | "W": None, 529 | "b": None, 530 | "activation": NonLinearity.SOFTMAX, 531 | "hint": hint_type, 532 | "use_hint": l_v[3], 533 | "use_sparsity": False, 534 | "use_sparsity_in_pred": False 535 | } 536 | layers = [layer0, layer1, layer2, output_layer] 537 | l1, l2 = 0., 0. 
538 | reg_bias = True 539 | margin = sharedX_value(1., name="margin") 540 | similair = theano.shared(np.array([0, 1], dtype=theano.config.floatX), 541 | name="sim") 542 | model = ModelMLP(layers, input, input1, input2, 543 | trainx_sh, trainlabels_sh, trainy_sh, 544 | validx_sh, validlabels_sh, margin, similair, 545 | l1_reg=l1, l2_reg=l2, 546 | reg_bias=reg_bias) 547 | 548 | size_model = str(trainx.shape[1]) +\ 549 | '_'.join([str(l["n_in"]) for l in layers]) + "_" + str(nbr_classes) 550 | path_model_init_params = "init_params/" + size_model + ".pkl" 551 | if not os.path.isfile(path_model_init_params): 552 | model.save_params(path_model_init_params, catched=False) 553 | else: 554 | model.set_params_vals(path_model_init_params) 555 | 556 | train_batch_size = 100 557 | valid_batch_size = 1000 558 | 559 | max_epochs = config_exp["max_epochs"] 560 | lr_vl = 1e-7 561 | lr = sharedX_value(lr_vl, name="lr") 562 | h_w = sharedX_value(1., name="hw") 563 | s_w = sharedX_value(1., name="sw") 564 | lambda_sparsity = sharedX_value(0., name="l_sparsity") 565 | 566 | # Compile functions: train/valid 567 | updater = AdaDelta(decay=0.95) 568 | 569 | # updater = Momentum(0.9, nesterov_momentum=False, imagenet=False, 570 | # imagenetDecay=5e-4, max_colm_norm=False) 571 | 572 | hint = config_exp["hint"] 573 | # "hint", "noHint" 574 | if hint: 575 | tag = "hint" 576 | else: 577 | tag = "noHint" 578 | 579 | norm_gsup = config_exp["norm_gsup"] 580 | norm_gh = config_exp["norm_gh"] 581 | fns = theano_fns(model, learning_rate=lr, 582 | h_w=h_w, s_w=s_w, lambda_sparsity=lambda_sparsity, 583 | updater=updater, tag=tag, 584 | max_colm_norm=False, max_norm=15.0, 585 | norm_gsup=norm_gsup, norm_gh=norm_gh) 586 | 587 | eval_fn, eval_fn_tr = fns["eval_fn"], fns["eval_fn_tr"] 588 | # Things to track during training: epoch and minibatch 589 | train_stats = {"tr_error_ep": [], "vl_error_ep": [], "tr_cost_ep": [], 590 | "tr_error_mn": [], "vl_error_mn": [], "tr_cost_mn": [], 591 | "current_nb_mb": 0, "best_epoch": 0, "best_mn": 0} 592 | 593 | names = [] 594 | for l, i in zip(layers, range(len(layers))): 595 | if l["hint"] is not None: 596 | names.append(i) 597 | debug = {"grad_sup": [], "grad_hint": [], "penalty": [], "names": names} 598 | # Eval before start training 599 | l_vl = chunks(range(validx.shape[0]), valid_batch_size) 600 | l_tr = chunks(range(trainx.shape[0]), valid_batch_size) 601 | vl_err_start = np.mean( 602 | [eval_fn(np.array(l_vl[kk])) for kk in range(len(l_vl))]) 603 | tr_err_start = np.mean( 604 | [eval_fn_tr(np.array(l_tr[kk])) for kk in range(len(l_tr))]) 605 | print vl_err_start, tr_err_start 606 | 607 | # Exp stamp 608 | time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M') 609 | tag_text = "_".join([str(l["hint"]) for l in layers]) 610 | h_exp = "_".join([str(e) for e in h_ind]) 611 | fold_exp = "exps/" + tag + "_" + h_exp + "_" + size_model + "_" + time_exp 612 | if not os.path.exists(fold_exp): 613 | os.makedirs(fold_exp) 614 | 615 | shutil.copy(inspect.stack()[0][1], fold_exp) 616 | shutil.copy(config_path+sys.argv[1], fold_exp) 617 | 618 | # Start training 619 | stop, i = False, 0 620 | div = any([l["hint"] is "contrastive" for l in layers]) 621 | shuffle_period = 1 # epochs 622 | do_shuffle = True 623 | extreme_random = config_exp["extreme_random"] 624 | if extreme_random: 625 | print "Extreme randomness." 626 | else: 627 | print "Same shuffle." 
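# kk indexes the pre-shuffled copies of the training set stored under
# data/<nbr_sup>/<kk>.pkl; it is only used to pick a file when extreme_random
# is False, and it wraps around after 240 files (see the shuffle block inside
# the training loop below).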
628 | kk = 1 629 | 630 | # TEST BEFORE START TRAINING 631 | testx_sh = theano.shared(testx.astype(theano.config.floatX), 632 | name="testx", borrow=True) 633 | testlabels_sh = theano.shared(testy.astype(theano.config.floatX), 634 | name="testlabels", borrow=True) 635 | 636 | i_x_vl = T.lvector("ixtst") 637 | y_vl = T.vector("y") 638 | error = T.mean(T.neq(T.argmax(model.output, axis=1), y_vl)) 639 | 640 | output_fn_test = [error, model.output] 641 | 642 | eval_fn_tst = theano.function( 643 | [i_x_vl], output_fn_test, 644 | givens={model.x: testx_sh[i_x_vl], 645 | y_vl: testlabels_sh[i_x_vl]}) 646 | l_tst = chunks(range(testx.shape[0]), valid_batch_size) 647 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 648 | print test_error_l[0][0] 649 | 650 | test_error = np.mean([l[0] for l in test_error_l]) 651 | print "Test error:", test_error 652 | prediction = None 653 | for l in test_error_l: 654 | if prediction is None: 655 | prediction = l[1] 656 | else: 657 | prediction = np.vstack((prediction, l[1])) 658 | 659 | 660 | with open(fold_exp+"/pred_before.pkl", "w") as fp: 661 | pkl.dump({"y": testy, "pred": prediction}, fp) 662 | 663 | 664 | # fig_scatter = plot_classes(y=testy_int, cord=prediction, names=cs, 665 | # test_error=test_error, message="BEFORE train") 666 | # fig_scatter.savefig(fold_exp+"/pred_before.png", bbox_inches='tight') 667 | 668 | 669 | while i < max_epochs: 670 | if i >= start_corrupting: 671 | warnings.warn( 672 | "SETTING THE CORRUPTION LEVEL TO:" + str(corrupt_input_l)) 673 | model.layers[0].corrupt_input_l.set_value( 674 | np.cast[theano.config.floatX](corrupt_input_l)) 675 | else: 676 | warnings.warn("SETTING THE CORRUPTION LEVEL TO: 0") 677 | model.layers[0].corrupt_input_l.set_value( 678 | np.cast[theano.config.floatX](0.)) 679 | stop = (i == max_epochs - 1) 680 | tx = DT.datetime.now() 681 | stats = train_one_epoch( 682 | model, fns, i, fold_exp, train_stats, vl_err_start, tag, 683 | train_batch_size, l_vl, l_tr, div, stop=stop, 684 | debug=debug, debug_code=debug_code) 685 | txx = DT.datetime.now() 686 | print "CORRUPTION LEVEL VALUE: " +\ 687 | str(model.layers[0].corrupt_input_l.get_value()) 688 | print "One epoch", DT.datetime.now() - tx 689 | train_stats = collect_stats_epoch(stats, train_stats) 690 | if (i % 100 == 0 or stop) and debug_code: 691 | plot_debug_grad(debug, tag_text, fold_exp, "sup") 692 | plot_penalty_vl(debug, tag_text, fold_exp) 693 | if tag == "hint": 694 | plot_debug_grad(debug, tag_text, fold_exp, "hint") 695 | plot_debug_ratio_grad(debug, fold_exp, "h/s") 696 | plot_debug_ratio_grad(debug, fold_exp, "s/h") 697 | 698 | if stop: 699 | plot_stats(train_stats, "ep", fold_exp, tag) 700 | with open(fold_exp + "/train_stats.pkl", 'w') as f_ts: 701 | pkl.dump(train_stats, f_ts) 702 | with open(fold_exp + "/train_debug.pkl", 'w') as f_ts: 703 | pkl.dump(debug, f_ts) 704 | i += 1 705 | # shuffle the data 706 | 707 | print "Going to shuffle the train data." 
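    # The reshuffled arrays are pushed back with set_value(), so the
    # GPU-resident shared variables (trainx_sh / trainlabels_sh / trainy_sh)
    # are refreshed in place without recompiling the Theano functions.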
708 | 709 | if do_shuffle and i % shuffle_period == 0 and not stop: 710 | if extreme_random: 711 | trainx_tmp = model.trainx_sh.get_value() 712 | trainy_tmp = model.trainlabels_sh.get_value() 713 | big_mtx = np.hstack( 714 | (trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 715 | for k in xrange(5): 716 | np.random.shuffle(big_mtx) 717 | trainx_tmp = big_mtx[:, 0:trainx_tmp.shape[1]] 718 | trainy_tmp = big_mtx[:, -1] 719 | else: 720 | with open("data/"+str(nbr_sup) + "/" + str(kk) + ".pkl") as f: 721 | stuff = pkl.load(f) 722 | trainx_tmp, trainy_tmp = stuff["x"], stuff["y"] 723 | model.trainlabels_sh.set_value(trainy_tmp.astype(theano.config.floatX)) 724 | model.trainy_sh.set_value( 725 | to_categorical( 726 | trainy_tmp, nbr_classes).astype(theano.config.floatX)) 727 | model.trainx_sh.set_value(trainx_tmp.astype(theano.config.floatX)) 728 | kk += 1 729 | if kk > 240: 730 | kk = 0 731 | print "Finished loading shuffled data. Updated the train set on GPU." 732 | del stats 733 | print "This part took:", DT.datetime.now() - txx 734 | # new_v = min([1., h_w.get_value() + 0.01]) 735 | # h_w.set_value(np.cast[theano.config.floatX](new_v)) 736 | # Update the importance of the hint 737 | # if i >= 1: 738 | # # new_v = min([1., h_w.get_value() + 0.1]) 739 | # h_w.set_value(np.cast[theano.config.floatX](1.)) 740 | 741 | 742 | # Perform the test 743 | # Set the model's param to the best catched ones 744 | model.set_model_to_catched_params() 745 | # share test data 746 | 747 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 748 | 749 | test_error = np.mean([l[0] for l in test_error_l]) 750 | print "Test error:", test_error 751 | 752 | prediction = None 753 | for l in test_error_l: 754 | if prediction is None: 755 | prediction = l[1] 756 | else: 757 | prediction = np.vstack((prediction, l[1])) 758 | 759 | 760 | with open(fold_exp+"/pred_after.pkl", "w") as fp: 761 | pkl.dump({"y": testy, "pred": prediction}, fp) 762 | 763 | ############################################################################## 764 | # GET INTERMEDIATE VALUE AND PLOT THEM. POSSIBLE ONLY WHEN THE INTERMEDIATE 765 | # VALUES ARE 2D. 766 | # inter_vl = get_inter_output(model, l_tst, testx_sh) 767 | # plot the intermediate values 768 | # ll = 0 769 | # for vi in inter_vl: 770 | # fig = plot_classes( 771 | # testy_int, vi, "", test_error, 772 | # "pred. 2D: mnist 1/7. layer" + str(ll)) 773 | # fig.savefig( 774 | # fold_exp + "/predinterlayer" + str(ll) + ".png", bbox_inches='tight') 775 | # ll += 1 776 | 777 | ############################################################################### 778 | 779 | # fig_scatter = plot_classes(y=testy_int, cord=prediction, names=cs, 780 | # test_error=test_error, message="AFTER train") 781 | # fig_scatter.savefig(fold_exp+"/pred_after.png", bbox_inches='tight') 782 | # save min valid 783 | vl_pathfile = "exps/" + "run_" + str(run) + "_sup_" + str(nbr_sup) + "_" +\ 784 | h_exp + "_c_l_" + str(corrupt_input_l) + "_start_at_" +\ 785 | str(start_corrupting) + "_debug_" + str(debug_code) +\ 786 | "_use_sparse_" + str(use_sparsity) + "_use_spar_pred_" +\ 787 | str(use_sparsity_in_pred) + "_" + "norm_" + str(norm_gsup) + "_" +\ 788 | str(norm_gh) + "_" + time_exp + ".txt" 789 | with open(vl_pathfile, 'w') as f: 790 | f.write("Exp. folder: " + fold_exp + "\n") 791 | f.write( 792 | "valid error:" + str( 793 | np.min(train_stats["vl_error_mn"]) * 100.) + " % \n") 794 | f.write("Test error:" + str(test_error * 100.) 
+ " % \n") 795 | shutil.copy(vl_pathfile, fold_exp) 796 | -------------------------------------------------------------------------------- /train3_new_dup.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import numpy as np 3 | import theano.tensor as T 4 | import os 5 | import sys 6 | import datetime as DT 7 | import shutil 8 | import inspect 9 | import theano 10 | import warnings 11 | import yaml 12 | 13 | from tools import ModelMLP 14 | from tools import NonLinearity 15 | from tools import split_data_to_minibatchs_eval 16 | from tools import sharedX_value 17 | from tools import theano_fns 18 | from tools import theano_fns_double_up 19 | from learning_rule import AdaDelta 20 | from learning_rule import RMSProp 21 | from learning_rule import Momentum 22 | from tools import evaluate_model 23 | from tools import collect_stats_epoch 24 | from tools import plot_stats 25 | from tools import train_one_epoch 26 | from tools import train_one_epoch_alter 27 | from tools import to_categorical 28 | from tools import plot_classes 29 | from tools import chunks 30 | from tools import plot_penalty_vl 31 | from tools import plot_debug_grad 32 | from tools import plot_debug_ratio_grad 33 | from sklearn import manifold 34 | from tools import plot_representations 35 | 36 | 37 | # Parse the yaml config. 38 | config_path = "./config_yaml/" 39 | with open(config_path + sys.argv[1], 'r') as fy: 40 | config_exp = yaml.load(fy) 41 | 42 | x_classes = 10 43 | 44 | debug_code = config_exp["debug_code"] 45 | 46 | if debug_code: 47 | warnings.warn("YOU ARE IN DEBUG MODE! YOUR CODE WILL TAKE MORE TIME!!!!!") 48 | 49 | 50 | def standerize(d, mu=None, sigma=None): 51 | if mu is None: 52 | mu = np.mean(d, axis=0) 53 | sigma = np.std(d, axis=0) 54 | if sigma.nonzero()[0].shape[0] == 0: 55 | raise Exception("std found to be zero!!!!") 56 | norm_d = (d - mu) / sigma 57 | 58 | return norm_d, mu, sigma 59 | 60 | 61 | def get_inter_output(model, l_tst, testx_sh): 62 | i_x_vl = T.lvector("ixtst") 63 | 64 | eval_fn_tst = theano.function( 65 | [i_x_vl], 66 | [l.output for l in model.layers], 67 | givens={model.x: testx_sh[i_x_vl]}) 68 | output_v = [ 69 | eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 70 | nbr_layers = len(output_v[0]) 71 | 72 | l_val = [] 73 | for l in range(nbr_layers): 74 | tmp = None 75 | for k in output_v: 76 | if tmp is None: 77 | tmp = k[l] 78 | else: 79 | tmp = np.vstack((tmp, k[l])) 80 | l_val.append(tmp) 81 | 82 | return l_val 83 | 84 | 85 | # create_tr_vl_ts_cb("data/2d") 86 | 87 | # def knn1(model, l_tst, testx_sh, l_tr, trainx_sh): 88 | 89 | # create_tr_vl_ts_nc("data/nestedcircle", 50000) 90 | 91 | # DATA MNIST 92 | # ============================================================================= 93 | path_data = "data/mnist.pkl" 94 | f = open(path_data, 'r') 95 | train, valid, test = pkl.load(f) 96 | trainx, trainy = train[0], train[1] 97 | validx, validy = valid[0], valid[1] 98 | testx, testy = test[0], test[1] 99 | 100 | # How much to take for training? 
101 | nbr_sup = config_exp["nbr_sup"] 102 | run = config_exp["run"] 103 | print "RUN:", run 104 | print "SUP: ", nbr_sup 105 | trainx, trainy = trainx[:nbr_sup], trainy[:nbr_sup] 106 | # Prepare the pre-shuffling 107 | if not os.path.exists("data/" + str(nbr_sup)): 108 | os.makedirs("data/" + str(nbr_sup)) 109 | trainx_tmp = trainx 110 | trainy_tmp = trainy 111 | 112 | print trainy.shape 113 | big_mtx = np.hstack((trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 114 | print "Going to shuffle the train data. It takes some time ..." 115 | period = 200 116 | i = 0 117 | #for k in xrange(5000): 118 | # np.random.shuffle(big_mtx) 119 | # if k % period == 0: 120 | # trainx_tmp2 = big_mtx[:, 0:trainx_tmp.shape[1]] 121 | # trainy_tmp2 = big_mtx[:, -1] 122 | # stuff = {"x": trainx_tmp2, "y": trainy_tmp2} 123 | # print k 124 | # with open("data/"+str(nbr_sup) + "/" + str(i) + ".pkl", 'w') as f: 125 | # pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 126 | # i += 1 127 | 128 | #with open("data/"+str(nbr_sup) + "/0.pkl") as f: 129 | # stuff = pkl.load(f) 130 | # trainx, trainy = stuff["x"], stuff["y"] 131 | # share over gpu: we can store the whole mnist over the gpu. 132 | # Train 133 | trainx_sh = theano.shared(trainx.astype(theano.config.floatX), 134 | name="trainx", borrow=True) 135 | trainlabels_sh = theano.shared(trainy.astype(theano.config.floatX), 136 | name="trainlabels", borrow=True) 137 | trainy_sh = theano.shared(to_categorical(trainy, x_classes).astype( 138 | theano.config.floatX), name="trainy", borrow=True) 139 | 140 | # valid 141 | validx_sh = theano.shared(validx.astype(theano.config.floatX), 142 | name="validx", borrow=True) 143 | validlabels_sh = theano.shared(validy.astype(theano.config.floatX), 144 | name="validlabels", borrow=True) 145 | # 146 | input = T.fmatrix("x") 147 | input1 = T.fmatrix("x1") 148 | input2 = T.fmatrix("x2") 149 | rng = np.random.RandomState(23455) 150 | # Architecture 151 | nhid_l0 = 1200 152 | nhid_l1 = 1200 153 | nhid_l2 = 200 154 | 155 | nbr_classes = x_classes 156 | use_batch_normalization = config_exp["use_batch_normalization"] 157 | h_ind = config_exp["h_ind"] 158 | h_ind = [int(tt) for tt in h_ind] 159 | 160 | assert len(h_ind) == 4 161 | h0, h1, h2, h3, h4, h5, h6, h7, h8 = None, None, None, None, None, None, None,\ 162 | None, None 163 | l_v = [] 164 | for xx in h_ind: 165 | print xx 166 | if int(xx) == 1: 167 | l_v.append(True) 168 | elif int(xx) == 0: 169 | l_v.append(False) 170 | else: 171 | raise ValueError("Error in applying hint: 0/1") 172 | 173 | hint_type = "l2sum" # "l1mean" 174 | print l_v 175 | corrupt_input_l = config_exp["corrupt_input_l"] 176 | if corrupt_input_l != 0.: 177 | warnings.warn( 178 | "YOU ASKED TO USE DENOISING PROCESS OVER THE INPUTS OF THE FIRST LAYER" 179 | ) 180 | if not config_exp["hint"]: 181 | raise ValueError( 182 | "You asked for densoing process but you are not using the penalty") 183 | start_corrupting = config_exp["start_corrupting"] 184 | warnings.warn( 185 | "CORRUPTION WILL START AFTER:" + str(start_corrupting) + " epochs!!!!!!") 186 | use_sparsity = config_exp["use_sparsity"] 187 | use_sparsity_in_pred = config_exp["use_sparsity_in_pred"] 188 | print "Use sparsity: ", use_sparsity 189 | print "Use sparsity in pred:", use_sparsity_in_pred 190 | use_unsupervised = config_exp["use_unsupervised"] 191 | 192 | layer0 = { 193 | "rng": rng, 194 | "n_in": trainx.shape[1], 195 | "n_out": nhid_l0, 196 | "W": None, 197 | "b": None, 198 | "activation": NonLinearity.SIGMOID, 199 | "hint": hint_type, 200 | 
"use_hint": l_v[0], 201 | "intended_to_be_corrupted": True, 202 | "corrupt_input_l": corrupt_input_l, 203 | "use_sparsity": use_sparsity, 204 | "use_sparsity_in_pred": use_sparsity_in_pred, 205 | "use_unsupervised": use_unsupervised, 206 | "use_batch_normalization": use_batch_normalization[0] 207 | } 208 | 209 | layer1 = { 210 | "rng": rng, 211 | "n_in": nhid_l0, 212 | "n_out": nhid_l1, 213 | "W": None, 214 | "b": None, 215 | "activation": NonLinearity.SIGMOID, 216 | "hint": hint_type, 217 | "use_hint": l_v[1], 218 | "use_sparsity": use_sparsity, 219 | "use_sparsity_in_pred": use_sparsity_in_pred, 220 | "use_unsupervised": use_unsupervised, 221 | "use_batch_normalization": use_batch_normalization[1] 222 | } 223 | 224 | layer2 = { 225 | "rng": rng, 226 | "n_in": nhid_l1, 227 | "n_out": nhid_l2, 228 | "W": None, 229 | "b": None, 230 | "activation": NonLinearity.SIGMOID, 231 | "hint": hint_type, 232 | "use_hint": l_v[2], 233 | "use_sparsity": use_sparsity, 234 | "use_sparsity_in_pred": use_sparsity_in_pred, 235 | "use_unsupervised": use_unsupervised, 236 | "use_batch_normalization": use_batch_normalization[2] 237 | } 238 | 239 | #layer3 = { 240 | # "rng": rng, 241 | # "n_in": nhid_l2, 242 | # "n_out": nhid_l3, 243 | # "W": None, 244 | # "b": None, 245 | # "activation": NonLinearity.SIGMOID, 246 | # "hint": l_v[3] 247 | # } 248 | # 249 | #layer4 = { 250 | # "rng": rng, 251 | # "n_in": nhid_l3, 252 | # "n_out": nhid_l4, 253 | # "W": None, 254 | # "b": None, 255 | # "activation": NonLinearity.SIGMOID, 256 | # "hint": l_v[4] 257 | # } 258 | # 259 | #layer5 = { 260 | # "rng": rng, 261 | # "n_in": nhid_l4, 262 | # "n_out": nhid_l5, 263 | # "W": None, 264 | # "b": None, 265 | # "activation": NonLinearity.SIGMOID, 266 | # "hint": l_v[5] 267 | # } 268 | # 269 | #layer6 = { 270 | # "rng": rng, 271 | # "n_in": nhid_l5, 272 | # "n_out": nhid_l6, 273 | # "W": None, 274 | # "b": None, 275 | # "activation": NonLinearity.SIGMOID, 276 | # "hint": l_v[6] 277 | # } 278 | # 279 | #layer7 = { 280 | # "rng": rng, 281 | # "n_in": nhid_l6, 282 | # "n_out": nhid_l7, 283 | # "W": None, 284 | # "b": None, 285 | # "activation": NonLinearity.SIGMOID, 286 | # "hint": l_v[7] 287 | # } 288 | 289 | output_layer = { 290 | "rng": rng, 291 | "n_in": nhid_l2, 292 | "n_out": nbr_classes, 293 | "W": None, 294 | "b": None, 295 | "activation": NonLinearity.SOFTMAX, 296 | "hint": hint_type, 297 | "use_hint": l_v[3], 298 | "use_sparsity": False, 299 | "use_sparsity_in_pred": False, 300 | "use_unsupervised": use_unsupervised, 301 | "use_batch_normalization": use_batch_normalization[3] 302 | } 303 | layers = [layer0, layer1, layer2, output_layer] 304 | l1, l2 = 0., 0. 
305 | reg_bias = True 306 | margin = sharedX_value(1., name="margin") 307 | similair = theano.shared(np.array([0, 1], dtype=theano.config.floatX), 308 | name="sim") 309 | model = ModelMLP(layers, input, input1, input2, 310 | trainx_sh, trainlabels_sh, trainy_sh, 311 | validx_sh, validlabels_sh, margin, similair, 312 | l1_reg=l1, l2_reg=l2, 313 | reg_bias=reg_bias) 314 | 315 | size_model = str(trainx.shape[1]) +\ 316 | '_'.join([str(l["n_in"]) for l in layers]) + "_" + str(nbr_classes) 317 | path_model_init_params = "init_params/" + size_model + '_' +\ 318 | str(config_exp["repet"]) + ".pkl" 319 | if not os.path.isfile(path_model_init_params): 320 | model.save_params(path_model_init_params, catched=False) 321 | else: 322 | model.set_params_vals(path_model_init_params) 323 | 324 | train_batch_size = 100 325 | valid_batch_size = 1000 326 | 327 | max_epochs = config_exp["max_epochs"] 328 | lr_vl = 1e-7 329 | lr = sharedX_value(lr_vl, name="lr") 330 | h_w = sharedX_value(config_exp["h_w"], name="hw") 331 | s_w = sharedX_value(1., name="sw") 332 | unsup_w = sharedX_value(1., name="unsw") 333 | lambda_sparsity = sharedX_value(0., name="l_sparsity") 334 | 335 | # Compile functions: train/valid 336 | updater_sup = AdaDelta(decay=0.95) 337 | updater_hint = AdaDelta(decay=0.95) 338 | updater_unsup = AdaDelta(decay=0.95) 339 | updater = {"sup": updater_sup, 'hint': updater_hint, "unsup": updater_unsup} 340 | 341 | # updater = Momentum(0.9, nesterov_momentum=False, imagenet=False, 342 | # imagenetDecay=5e-4, max_colm_norm=False) 343 | 344 | hint = config_exp["hint"] 345 | # "hint", "noHint" 346 | if hint: 347 | tag = "hint" 348 | else: 349 | tag = "noHint" 350 | 351 | norm_gsup = config_exp["norm_gsup"] 352 | norm_gh = config_exp["norm_gh"] 353 | fns = theano_fns_double_up( 354 | model, learning_rate=lr, 355 | h_w=h_w, s_w=s_w, unsup_w=unsup_w, lambda_sparsity=lambda_sparsity, 356 | updater=updater, tag=tag, 357 | max_colm_norm=False, max_norm=15.0, 358 | norm_gsup=norm_gsup, norm_gh=norm_gh) 359 | 360 | eval_fn, eval_fn_tr = fns["eval_fn"], fns["eval_fn_tr"] 361 | # Things to track during training: epoch and minibatch 362 | train_stats = {"tr_error_ep": [], "vl_error_ep": [], "tr_cost_ep": [], 363 | "tr_error_mn": [], "vl_error_mn": [], "tr_cost_mn": [], 364 | "current_nb_mb": 0, "best_epoch": 0, "best_mn": 0} 365 | 366 | names = [] 367 | for l, i in zip(layers, range(len(layers))): 368 | if l["hint"] is not None: 369 | names.append(i) 370 | debug = {"grad_sup": [], "grad_hint": [], "penalty": [], "names": names} 371 | # Eval before start training 372 | l_vl = chunks(range(validx.shape[0]), valid_batch_size) 373 | l_tr = chunks(range(trainx.shape[0]), valid_batch_size) 374 | vl_err_start = np.mean( 375 | [eval_fn(np.array(l_vl[kk])) for kk in range(len(l_vl))]) 376 | tr_err_start = np.mean( 377 | [eval_fn_tr(np.array(l_tr[kk])) for kk in range(len(l_tr))]) 378 | print vl_err_start, tr_err_start 379 | 380 | # Exp stamp 381 | time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M_%s') 382 | tag_text = "_".join([str(l["hint"]) for l in layers]) 383 | h_exp = "_".join([str(e) for e in h_ind]) 384 | fold_exp = "exps/" + tag + "_" + str(nbr_sup) + "_" + h_exp + "_" +\ 385 | size_model + "_" + time_exp 386 | if not os.path.exists(fold_exp): 387 | os.makedirs(fold_exp) 388 | 389 | shutil.copy(inspect.stack()[0][1], fold_exp) 390 | shutil.copy(config_path+sys.argv[1], fold_exp) 391 | 392 | # Start training 393 | stop, i = False, 0 394 | div = any([l["hint"] is "contrastive" for l in layers]) 395 | 
shuffle_period = 1 # epochs 396 | do_shuffle = True 397 | extreme_random = config_exp["extreme_random"] 398 | if extreme_random: 399 | print "Extreme randomness." 400 | else: 401 | print "Same shuffle." 402 | kk = 1 403 | 404 | # TEST BEFORE START TRAINING 405 | testx_sh = theano.shared(testx.astype(theano.config.floatX), 406 | name="testx", borrow=True) 407 | testlabels_sh = theano.shared(testy.astype(theano.config.floatX), 408 | name="testlabels", borrow=True) 409 | 410 | i_x_vl = T.lvector("ixtst") 411 | y_vl = T.vector("y") 412 | error = T.mean(T.neq(T.argmax(model.output, axis=1), y_vl)) 413 | 414 | output_fn_test = [error, model.output, model.layers[-2].output] 415 | 416 | eval_fn_tst = theano.function( 417 | [i_x_vl], output_fn_test, 418 | givens={model.x: testx_sh[i_x_vl], 419 | y_vl: testlabels_sh[i_x_vl]}) 420 | l_tst = chunks(range(testx.shape[0]), valid_batch_size) 421 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 422 | print test_error_l[0][0] 423 | 424 | test_error = np.mean([l[0] for l in test_error_l]) 425 | print "Test error:", test_error 426 | prediction = None 427 | for l in test_error_l: 428 | if prediction is None: 429 | prediction = l[1] 430 | else: 431 | prediction = np.vstack((prediction, l[1])) 432 | 433 | 434 | with open(fold_exp+"/pred_before.pkl", "w") as fp: 435 | pkl.dump({"y": testy, "pred": prediction}, fp) 436 | 437 | best_vl_error = np.finfo(np.float).max 438 | start_hint_epoch = config_exp["start_hint"] 439 | 440 | while i < max_epochs: 441 | if i >= start_corrupting: 442 | warnings.warn( 443 | "SETTING THE CORRUPTION LEVEL TO:" + str(corrupt_input_l)) 444 | model.layers[0].corrupt_input_l.set_value( 445 | np.cast[theano.config.floatX](corrupt_input_l)) 446 | else: 447 | warnings.warn("SETTING THE CORRUPTION LEVEL TO: 0") 448 | model.layers[0].corrupt_input_l.set_value( 449 | np.cast[theano.config.floatX](0.)) 450 | stop = (i == max_epochs - 1) 451 | tx = DT.datetime.now() 452 | stats = train_one_epoch_alter( 453 | model, fns, i, fold_exp, train_stats, vl_err_start, tag, 454 | train_batch_size, l_vl, l_tr, div, stop=stop, 455 | debug=debug, debug_code=debug_code, h_w=h_w) 456 | txx = DT.datetime.now() 457 | print "CORRUPTION LEVEL VALUE: " +\ 458 | str(model.layers[0].corrupt_input_l.get_value()) 459 | print "One epoch", DT.datetime.now() - tx 460 | train_stats = collect_stats_epoch(stats, train_stats) 461 | if (i % 100 == 0 or stop) and debug_code: 462 | plot_debug_grad(debug, tag_text, fold_exp, "sup") 463 | plot_penalty_vl(debug, tag_text, fold_exp) 464 | if tag == "hint": 465 | plot_debug_grad(debug, tag_text, fold_exp, "hint") 466 | plot_debug_ratio_grad(debug, fold_exp, "h/s") 467 | plot_debug_ratio_grad(debug, fold_exp, "s/h") 468 | 469 | if stop: 470 | plot_stats(train_stats, "ep", fold_exp, tag) 471 | with open(fold_exp + "/train_stats.pkl", 'w') as f_ts: 472 | pkl.dump(train_stats, f_ts) 473 | with open(fold_exp + "/train_debug.pkl", 'w') as f_ts: 474 | pkl.dump(debug, f_ts) 475 | i += 1 476 | # shuffle the data 477 | 478 | print "Going to shuffle the train data." 
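    # Besides the per-epoch reshuffle below, the hint weight h_w (initialised
    # from the yaml `h_w` entry) is forced to 1.0 once the epoch index exceeds
    # `start_hint`, so the contribution of the class-wise penalty can be
    # scheduled (e.g. kept at 0 until epoch `start_hint`).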
479 | 480 | if do_shuffle and i % shuffle_period == 0 and not stop: 481 | if extreme_random: 482 | trainx_tmp = model.trainx_sh.get_value() 483 | trainy_tmp = model.trainlabels_sh.get_value() 484 | big_mtx = np.hstack( 485 | (trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 486 | for k in xrange(5): 487 | np.random.shuffle(big_mtx) 488 | trainx_tmp = big_mtx[:, 0:trainx_tmp.shape[1]] 489 | trainy_tmp = big_mtx[:, -1] 490 | else: 491 | with open("data/"+str(nbr_sup) + "/" + str(kk) + ".pkl") as f: 492 | stuff = pkl.load(f) 493 | trainx_tmp, trainy_tmp = stuff["x"], stuff["y"] 494 | model.trainlabels_sh.set_value(trainy_tmp.astype(theano.config.floatX)) 495 | model.trainy_sh.set_value( 496 | to_categorical( 497 | trainy_tmp, nbr_classes).astype(theano.config.floatX)) 498 | model.trainx_sh.set_value(trainx_tmp.astype(theano.config.floatX)) 499 | kk += 1 500 | if kk > 240: 501 | kk = 0 502 | print "Finished loading shuffled data. Updated the train set on GPU." 503 | del stats 504 | print "This part took:", DT.datetime.now() - txx 505 | 506 | print "MIN VALID ", np.min(train_stats["vl_error_mn"]), " *********" 507 | # # If there was no improvement... 508 | if (i > start_hint_epoch) and hint: 509 | # new_v = min([1., h_w.get_value() + 0.01]) 510 | new_v = 1. 511 | h_w.set_value(np.cast[theano.config.floatX](new_v)) 512 | # print "NO IMPROV. PUSHING THE NET..............................." 513 | # best_vl_error = np.min(train_stats["vl_error_mn"]) 514 | # Update the importance of the hint 515 | # if i >= 1: 516 | # # new_v = min([1., h_w.get_value() + 0.1]) 517 | # h_w.set_value(np.cast[theano.config.floatX](1.)) 518 | 519 | 520 | # Perform the test 521 | # Set the model's param to the best catched ones 522 | model.set_model_to_catched_params() 523 | # share test data 524 | 525 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))] 526 | train_error_l = [eval_fn_tst(np.array(l_tr[kkk])) for kkk in range(len(l_tr))] 527 | 528 | test_error = np.mean([l[0] for l in test_error_l]) 529 | print "Test error:", test_error 530 | # Train 531 | 532 | # Test 533 | # last hidden layer representations. 
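# The block below is meant to collect the penultimate-layer outputs
# (model.layers[-2].output, the third element returned by eval_fn_tst) batch
# by batch and pickle them with labels and images for the t-SNE plots. Note
# that the stacking loops iterate over `k` but index `l[2]`, the variable left
# over from the earlier prediction loop.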
534 | with open(fold_exp+"/last_hidden_rep_test.pkl", "w") as fhr: 535 | stuff_hrep_tst = None 536 | for k in test_error_l: 537 | if stuff_hrep_tst is None: 538 | stuff_hrep_tst = l[2] 539 | else: 540 | stuff_hrep_tst = np.vstack((stuff_hrep_tst, l[2])) 541 | 542 | stuff_hrep_tr = None 543 | for k in train_error_l: 544 | if stuff_hrep_tr is None: 545 | stuff_hrep_tr = l[2] 546 | else: 547 | stuff_hrep_tr = np.vstack((stuff_hrep_tr, l[2])) 548 | pkl.dump( 549 | {"x_hint_repr_tst": stuff_hrep_tst, "y_tst": testy, "ximg_tst": testx, 550 | "x_hint_repr_tr": stuff_hrep_tr, "y_tr": trainy, "ximg_tr": trainx}, 551 | fhr) 552 | # plot t-SNE of the opriginal images 553 | tx0 = DT.datetime.now() 554 | tsne_original = manifold.TSNE(n_components=2, init='pca', random_state=0) 555 | X_tsne_original = tsne_original.fit_transform(testx) 556 | fig_tsne_org = plot_representations( 557 | X_tsne_original, testy, "t-SNE embedding of mnist original images.") 558 | fig_tsne_org.savefig(fold_exp+"/original_rep_test.eps", format='eps', 559 | dpi=1200, bbox_inches='tight') 560 | print "t-SNE of original images took:", DT.datetime.now() - tx0 561 | # plot t-SNE of the prediction 562 | tx0 = DT.datetime.now() 563 | tsne_lasthidden_rep = manifold.TSNE(n_components=2, init='pca', 564 | random_state=0) 565 | X_tsne_lhrep = tsne_original.fit_transform(stuff_hrep_tst) 566 | fig_tsne_lhrep = plot_representations( 567 | X_tsne_lhrep, testy, 568 | "t-SNE embedding of the last hidden representation of the MLP" + 569 | "applied over mnist.") 570 | fig_tsne_lhrep.savefig(fold_exp+"/lasth_rep_mlp_test.eps", format='eps', 571 | dpi=1200, bbox_inches='tight') 572 | print "t-SNE of hidden representation took:", DT.datetime.now() - tx0 573 | prediction = None 574 | for l in test_error_l: 575 | if prediction is None: 576 | prediction = l[1] 577 | else: 578 | prediction = np.vstack((prediction, l[1])) 579 | 580 | 581 | with open(fold_exp+"/pred_after.pkl", "w") as fp: 582 | pkl.dump({"y": testy, "pred": prediction}, fp) 583 | 584 | ############################################################################## 585 | # GET INTERMEDIATE VALUE AND PLOT THEM. POSSIBLE ONLY WHEN THE INTERMEDIATE 586 | # VALUES ARE 2D. 587 | # inter_vl = get_inter_output(model, l_tst, testx_sh) 588 | # plot the intermediate values 589 | # ll = 0 590 | # for vi in inter_vl: 591 | # fig = plot_classes( 592 | # testy_int, vi, "", test_error, 593 | # "pred. 2D: mnist 1/7. layer" + str(ll)) 594 | # fig.savefig( 595 | # fold_exp + "/predinterlayer" + str(ll) + ".png", bbox_inches='tight') 596 | # ll += 1 597 | 598 | ############################################################################### 599 | 600 | # fig_scatter = plot_classes(y=testy_int, cord=prediction, names=cs, 601 | # test_error=test_error, message="AFTER train") 602 | # fig_scatter.savefig(fold_exp+"/pred_after.png", bbox_inches='tight') 603 | # save min valid 604 | vl_pathfile = "exps/" + "run_" + str(run) + "_sup_" + str(nbr_sup) + "_" +\ 605 | h_exp + "_c_l_" + str(corrupt_input_l) + "_start_at_" +\ 606 | str(start_corrupting) + "_debug_" + str(debug_code) +\ 607 | "_use_sparse_" + str(use_sparsity) + "_use_spar_pred_" +\ 608 | str(use_sparsity_in_pred) + "_" + "norm_" + str(norm_gsup) + "_" +\ 609 | str(norm_gh) + "_" + time_exp + ".txt" 610 | with open(vl_pathfile, 'w') as f: 611 | f.write("Exp. folder: " + fold_exp + "\n") 612 | f.write( 613 | "valid error:" + str( 614 | np.min(train_stats["vl_error_mn"]) * 100.) + " % \n") 615 | f.write("Test error:" + str(test_error * 100.) 
+ " % \n") 616 | shutil.copy(vl_pathfile, fold_exp) 617 | -------------------------------------------------------------------------------- /trainLenet.py: -------------------------------------------------------------------------------- 1 | import cPickle as pkl 2 | import numpy as np 3 | import theano.tensor as T 4 | import os 5 | import sys 6 | import datetime as DT 7 | import shutil 8 | import inspect 9 | import theano 10 | import warnings 11 | 12 | 13 | from tools import LeNet 14 | from tools import NonLinearity 15 | from tools import split_data_to_minibatchs_eval 16 | from tools import sharedX_value 17 | from tools import theano_fns 18 | from tools import theano_fns_double_up 19 | from learning_rule import AdaDelta 20 | from learning_rule import RMSProp 21 | from learning_rule import Momentum 22 | from tools import evaluate_model 23 | from tools import collect_stats_epoch 24 | from tools import plot_stats 25 | from tools import train_one_epoch 26 | from tools import train_one_epoch_alter 27 | from tools import to_categorical 28 | from tools import chunks 29 | from tools import plot_penalty_vl 30 | from tools import plot_debug_grad 31 | from tools import plot_debug_ratio_grad 32 | import yaml 33 | from sklearn import manifold 34 | from tools import plot_representations 35 | 36 | 37 | # Parse the yaml config. 38 | config_path = "./config_yaml/" 39 | with open(config_path + sys.argv[1], 'r') as fy: 40 | config_exp = yaml.load(fy) 41 | 42 | x_classes = 10 43 | debug_code = config_exp["debug_code"] 44 | if debug_code: 45 | warnings.warn("YOU ARE IN DEBUG MODE! YOUR CODE WILL TAKE MORE TIME!!!!!") 46 | 47 | 48 | def standerize(d, mu=None, sigma=None): 49 | if mu is None: 50 | mu = np.mean(d, axis=0) 51 | sigma = np.std(d, axis=0) 52 | if sigma.nonzero()[0].shape[0] == 0: 53 | raise Exception("std found to be zero!!!!") 54 | norm_d = (d - mu) / sigma 55 | 56 | return norm_d, mu, sigma 57 | 58 | path_data = "data/mnist.pkl" 59 | f = open(path_data, 'r') 60 | train, valid, test = pkl.load(f) 61 | trainx, trainy = train[0], train[1] 62 | validx, validy = valid[0], valid[1] 63 | testx, testy = test[0], test[1] 64 | # Rehape 3D 65 | validx = validx.reshape((validx.shape[0], 1, 28, 28)) 66 | testx = testx.reshape((testx.shape[0], 1, 28, 28)) 67 | 68 | # How much to take for training? 69 | nbr_sup = config_exp["nbr_sup"] 70 | run = config_exp["run"] 71 | print "RUN:", run 72 | print "SUP: ", nbr_sup 73 | trainx, trainy = trainx[:nbr_sup], trainy[:nbr_sup] 74 | # Prepare the pre-shuffling 75 | if not os.path.exists("data/" + str(nbr_sup)): 76 | os.makedirs("data/" + str(nbr_sup)) 77 | trainx_tmp = trainx 78 | trainy_tmp = trainy 79 | 80 | # big_mtx = np.hstack((trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1))) 81 | print "Going to shuffle the train data. It takes some time ..." 82 | period = 200 83 | i = 0 84 | #for k in xrange(5000): 85 | # np.random.shuffle(big_mtx) 86 | # if k % period == 0: 87 | # trainx_tmp2 = big_mtx[:, 0:trainx_tmp.shape[1]] 88 | # trainy_tmp2 = big_mtx[:, -1] 89 | # stuff = {"x": trainx_tmp2, "y": trainy_tmp2} 90 | # print k 91 | # with open("data/"+str(nbr_sup) + "/" + str(i) + ".pkl", 'w') as f: 92 | # pkl.dump(stuff, f, protocol=pkl.HIGHEST_PROTOCOL) 93 | # i += 1 94 | 95 | #with open("data/"+str(nbr_sup) + "/0.pkl") as f: 96 | # stuff = pkl.load(f) 97 | # trainx, trainy = stuff["x"], stuff["y"] 98 | # share over gpu: we can store the whole mnist over the gpu. 
99 | # Train 100 | trainx = trainx.reshape((trainx.shape[0], 1, 28, 28)) 101 | trainx_sh = theano.shared(trainx.astype(theano.config.floatX), 102 | name="trainx", borrow=True) 103 | trainlabels_sh = theano.shared(trainy.astype(theano.config.floatX), 104 | name="trainlabels", borrow=True) 105 | trainy_sh = theano.shared(to_categorical(trainy, 10).astype( 106 | theano.config.floatX), name="trainy", borrow=True) 107 | # trainy_sh = T.cast(trainy_sh, 'int32') 108 | 109 | # valid 110 | validx_sh = theano.shared(validx.astype(theano.config.floatX), 111 | name="validx", borrow=True) 112 | validlabels_sh = theano.shared(validy.astype(theano.config.floatX), 113 | name="validlabels", borrow=True) 114 | # 115 | input = T.tensor4("x") 116 | input1 = T.tensor4("x1") 117 | input2 = T.tensor4("x2") 118 | rng = np.random.RandomState(23455) 119 | 120 | nbr_classes = x_classes 121 | use_batch_normalization = config_exp["use_batch_normalization"] 122 | h_ind = config_exp["h_ind"] 123 | h_ind = [int(tt) for tt in h_ind] 124 | 125 | assert len(h_ind) == 4 126 | 127 | l_v = [] 128 | for xx in h_ind: 129 | print xx 130 | if int(xx) == 1: 131 | l_v.append(True) 132 | elif int(xx) == 0: 133 | l_v.append(False) 134 | else: 135 | raise ValueError("Error in applying hint: 0/1") 136 | 137 | hint_type = "l2sum" 138 | print l_v 139 | corrupt_input_l = config_exp["corrupt_input_l"] 140 | if corrupt_input_l != 0.: 141 | warnings.warn( 142 | "YOU ASKED TO USE DENOISING PROCESS OVER THE INPUTS OF THE FIRST LAYER" 143 | ) 144 | if not config_exp["hint"]: 145 | raise ValueError( 146 | "You asked for densoing process but you are not using the penalty") 147 | start_corrupting = config_exp["start_corrupting"] 148 | warnings.warn( 149 | "CORRUPTION WILL START AFTER:" + str(start_corrupting) + " epochs!!!!!!") 150 | use_sparsity = config_exp["use_sparsity"] 151 | use_sparsity_in_pred = config_exp["use_sparsity_in_pred"] 152 | print "Use sparsity: ", use_sparsity 153 | print "Use sparsity in pred:", use_sparsity_in_pred 154 | use_unsupervised = config_exp["use_unsupervised"] 155 | layer0 = { 156 | "rng": rng, 157 | "n_in": 1, 158 | "n_out": 20, 159 | "W": None, 160 | "b": None, 161 | "activation": NonLinearity.TANH, 162 | "hint": hint_type, 163 | "use_hint": l_v[0], 164 | "intended_to_be_corrupted": True, 165 | "corrupt_input_l": corrupt_input_l, 166 | "use_sparsity": use_sparsity, 167 | "use_sparsity_in_pred": use_sparsity_in_pred, 168 | "use_unsupervised": use_unsupervised, 169 | "use_batch_normalization": use_batch_normalization[0] 170 | } 171 | 172 | layer1 = { 173 | "rng": rng, 174 | "n_in": 20, 175 | "n_out": 50, 176 | "W": None, 177 | "b": None, 178 | "activation": NonLinearity.TANH, 179 | "hint": hint_type, 180 | "use_hint": l_v[1], 181 | "use_sparsity": use_sparsity, 182 | "use_sparsity_in_pred": use_sparsity_in_pred, 183 | "use_unsupervised": use_unsupervised, 184 | "use_batch_normalization": use_batch_normalization[1] 185 | } 186 | 187 | layer2 = { 188 | "rng": rng, 189 | "n_in": 50*4*4, 190 | "n_out": 500, 191 | "W": None, 192 | "b": None, 193 | "activation": NonLinearity.TANH, 194 | "hint": hint_type, 195 | "use_hint": l_v[2], 196 | "use_sparsity": use_sparsity, 197 | "use_sparsity_in_pred": use_sparsity_in_pred, 198 | "use_unsupervised": use_unsupervised, 199 | "use_batch_normalization": use_batch_normalization[2] 200 | } 201 | 202 | 203 | output_layer = { 204 | "rng": rng, 205 | "n_in": 500, 206 | "n_out": nbr_classes, 207 | "W": None, 208 | "b": None, 209 | "activation": NonLinearity.SOFTMAX, 210 | "hint": 
hint_type, 211 | "use_hint": l_v[3], 212 | "use_sparsity": False, 213 | "use_sparsity_in_pred": False, 214 | "use_unsupervised": use_unsupervised, 215 | "use_batch_normalization": use_batch_normalization[3] 216 | } 217 | layers = [layer0, layer1, layer2, output_layer] 218 | l1, l2 = 0., 0. 219 | margin = sharedX_value(1., name="margin") 220 | similair = theano.shared(np.array([0, 1], dtype=theano.config.floatX), 221 | name="sim") 222 | train_batch_size = 100 223 | valid_batch_size = train_batch_size 224 | model = LeNet(layers, input, input1, input2, 225 | trainx_sh, trainlabels_sh, trainy_sh, 226 | validx_sh, validlabels_sh, margin, similair, 227 | l1_reg=l1, l2_reg=l2, 228 | reg_bias=False, 229 | batch_size=None) 230 | 231 | size_model = str(trainx.shape[1]) +\ 232 | '_'.join([str(l["n_in"]) for l in layers]) + "_" + str(nbr_classes) 233 | path_model_init_params = "init_params/" + size_model + '_' +\ 234 | str(config_exp["repet"]) + ".pkl" 235 | if not os.path.isfile(path_model_init_params): 236 | model.save_params(path_model_init_params, catched=False) 237 | else: 238 | model.set_params_vals(path_model_init_params) 239 | 240 | 241 | max_epochs = config_exp["max_epochs"] 242 | lr_vl = 1e-7 243 | lr = sharedX_value(lr_vl, name="lr") 244 | h_w = sharedX_value(config_exp["h_w"], name="hw") 245 | s_w = sharedX_value(1., name="sw") 246 | unsup_w = sharedX_value(1., name="unsw") 247 | lambda_sparsity = sharedX_value(1e-3, name="l_sparsity") 248 | 249 | # Compile functions: train/valid 250 | updater_sup = AdaDelta(decay=0.95) 251 | updater_hint = AdaDelta(decay=0.95) 252 | updater_unsup = AdaDelta(decay=0.95) 253 | updater = {"sup": updater_sup, 'hint': updater_hint, "unsup": updater_unsup} 254 | 255 | # updater = Momentum(0.9, nesterov_momentum=False, imagenet=False, 256 | # imagenetDecay=5e-4, max_colm_norm=False) 257 | 258 | hint = config_exp["hint"] 259 | # "hint", "noHint" 260 | if hint: 261 | tag = "hint" 262 | else: 263 | tag = "noHint" 264 | 265 | norm_gsup = config_exp["norm_gsup"] 266 | norm_gh = config_exp["norm_gh"] 267 | fns = theano_fns_double_up( 268 | model, learning_rate=lr, 269 | h_w=h_w, s_w=s_w, unsup_w=unsup_w, lambda_sparsity=lambda_sparsity, 270 | updater=updater, tag=tag, 271 | max_colm_norm=False, max_norm=15.0, 272 | norm_gsup=norm_gsup, norm_gh=norm_gh) 273 | 274 | eval_fn, eval_fn_tr = fns["eval_fn"], fns["eval_fn_tr"] 275 | # Things to track during training: epoch and minibatch 276 | train_stats = {"tr_error_ep": [], "vl_error_ep": [], "tr_cost_ep": [], 277 | "tr_error_mn": [], "vl_error_mn": [], "tr_cost_mn": [], 278 | "current_nb_mb": 0, "best_epoch": 0, "best_mn": 0} 279 | 280 | names = [] 281 | for l, i in zip(layers, range(len(layers))): 282 | if l["hint"] is not None: 283 | names.append(i) 284 | debug = {"grad_sup": [], "grad_hint": [], "penalty": [], "names": names} 285 | # Eval before start training 286 | l_vl = chunks(range(validx.shape[0]), valid_batch_size) 287 | l_tr = chunks(range(trainx.shape[0]), valid_batch_size) 288 | vl_err_start = np.mean( 289 | [eval_fn(np.array(l_vl[kk])) for kk in range(len(l_vl))]) 290 | tr_err_start = np.mean( 291 | [eval_fn_tr(np.array(l_tr[kk])) for kk in range(len(l_tr))]) 292 | print vl_err_start, tr_err_start 293 | 294 | # Exp stamp 295 | time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M_%s') 296 | tag_text = "_".join([str(l["hint"]) for l in layers]) 297 | h_exp = "_".join([str(e) for e in h_ind]) 298 | fold_exp = "exps/lenet_" + tag + "_" + str(nbr_sup) + "_" + h_exp + "_" +\ 299 | size_model + "_" + time_exp 300 | 
if not os.path.exists(fold_exp):
301 |     os.makedirs(fold_exp)
302 | 
303 | shutil.copy(inspect.stack()[0][1], fold_exp)
304 | shutil.copy(config_path+sys.argv[1], fold_exp)
305 | 
306 | # Start training
307 | stop, i = False, 0
308 | div = any([l["hint"] == "contrastive" for l in layers])
309 | shuffle_period = 1  # epochs
310 | do_shuffle = True
311 | extreme_random = config_exp["extreme_random"]
312 | if extreme_random:
313 |     print "Extreme randomness."
314 | else:
315 |     print "Same shuffle."
316 | kk = 1
317 | start_hint_epoch = config_exp["start_hint"]
318 | 
319 | while i < max_epochs:
320 |     if i >= start_corrupting:
321 |         warnings.warn(
322 |             "SETTING THE CORRUPTION LEVEL TO:" + str(corrupt_input_l))
323 |         model.layers[0].corrupt_input_l.set_value(
324 |             np.cast[theano.config.floatX](corrupt_input_l))
325 |     else:
326 |         warnings.warn("SETTING THE CORRUPTION LEVEL TO: 0")
327 |         model.layers[0].corrupt_input_l.set_value(
328 |             np.cast[theano.config.floatX](0.))
329 |     stop = (i == max_epochs - 1)
330 |     tx = DT.datetime.now()
331 |     stats = train_one_epoch_alter(
332 |         model, fns, i, fold_exp, train_stats, vl_err_start, tag,
333 |         train_batch_size, l_vl, l_tr, div, stop=stop,
334 |         debug=debug, debug_code=debug_code)
335 |     txx = DT.datetime.now()
336 |     print "CORRUPTION LEVEL VALUE: " +\
337 |         str(model.layers[0].corrupt_input_l.get_value())
338 |     print "One epoch", DT.datetime.now() - tx
339 |     train_stats = collect_stats_epoch(stats, train_stats)
340 |     if (i % 100 == 0 or stop) and debug_code:
341 |         plot_debug_grad(debug, tag_text, fold_exp, "sup")
342 |         plot_penalty_vl(debug, tag_text, fold_exp)
343 |         if tag == "hint":
344 |             plot_debug_grad(debug, tag_text, fold_exp, "hint")
345 |             plot_debug_ratio_grad(debug, fold_exp, "h/s")
346 |             plot_debug_ratio_grad(debug, fold_exp, "s/h")
347 | 
348 |     if stop:
349 |         plot_stats(train_stats, "ep", fold_exp, tag)
350 |         with open(fold_exp + "/train_stats.pkl", 'w') as f_ts:
351 |             pkl.dump(train_stats, f_ts)
352 |         with open(fold_exp + "/train_debug.pkl", 'w') as f_ts:
353 |             pkl.dump(debug, f_ts)
354 |     i += 1
355 |     # shuffle the data
356 | 
357 |     print "Going to shuffle the train data."
358 | 
359 |     if do_shuffle and i % shuffle_period == 0 and not stop:
360 |         if extreme_random:
361 |             trainx_tmp = model.trainx_sh.get_value()
362 |             trainx_tmp = trainx_tmp.reshape((trainx_tmp.shape[0], 28*28))
363 |             trainy_tmp = model.trainlabels_sh.get_value()
364 |             big_mtx = np.hstack(
365 |                 (trainx_tmp, trainy_tmp.reshape(trainy_tmp.size, 1)))
366 |             for k in xrange(5):
367 |                 np.random.shuffle(big_mtx)
368 |             trainx_tmp = big_mtx[:, 0:trainx_tmp.shape[1]]
369 |             trainy_tmp = big_mtx[:, -1]
370 |         else:
371 |             with open("data/"+str(nbr_sup) + "/" + str(kk) + ".pkl") as f:
372 |                 stuff = pkl.load(f)
373 |                 trainx_tmp, trainy_tmp = stuff["x"], stuff["y"]
374 |         trainx_tmp = trainx_tmp.reshape((trainx_tmp.shape[0], 1, 28, 28))
375 |         model.trainlabels_sh.set_value(trainy_tmp.astype(theano.config.floatX))
376 |         model.trainy_sh.set_value(
377 |             to_categorical(
378 |                 trainy_tmp, nbr_classes).astype(theano.config.floatX))
379 |         # model.trainy_sh = T.cast(model.trainy_sh, 'int32')
380 |         model.trainx_sh.set_value(trainx_tmp.astype(theano.config.floatX))
381 |         kk += 1
382 |         if kk > 240:
383 |             kk = 0
384 |         print "Finished loading shuffled data. Updated the train set on GPU."
385 |     del stats
386 |     print "This part took:", DT.datetime.now() - txx
387 |     if (i > start_hint_epoch) and hint:
388 |         # new_v = min([1., h_w.get_value() + 0.1])
389 |         new_v = 1.
390 |         h_w.set_value(np.cast[theano.config.floatX](new_v))
391 |     # Update the importance of the hint
392 |     # if i >= 1:
393 |     #    # new_v = min([1., h_w.get_value() + 0.1])
394 |     #    h_w.set_value(np.cast[theano.config.floatX](1.))
395 | 
396 | 
397 | # Perform the test
398 | # Set the model's params to the best ones caught during training.
399 | model.set_model_to_catched_params()
400 | # share test data
401 | testx_sh = theano.shared(testx.astype(theano.config.floatX),
402 |                          name="testx", borrow=True)
403 | testlabels_sh = theano.shared(testy.astype(theano.config.floatX),
404 |                               name="testlabels", borrow=True)
405 | 
406 | i_x_vl = T.lvector("ixtst")
407 | y_vl = T.vector("y")
408 | error = T.mean(T.neq(T.argmax(model.output, axis=1), y_vl))
409 | 
410 | output_fn_test = [error, model.output, model.layers[-2].output]
411 | 
412 | eval_fn_tst = theano.function(
413 |     [i_x_vl], output_fn_test,
414 |     givens={model.x: testx_sh[i_x_vl],
415 |             y_vl: testlabels_sh[i_x_vl]})
416 | l_tst = chunks(range(testx.shape[0]), valid_batch_size)
417 | test_error_l = [eval_fn_tst(np.array(l_tst[kkk])) for kkk in range(len(l_tst))]
418 | train_error_l = [eval_fn_tst(np.array(l_tr[kkk])) for kkk in range(len(l_tr))]
419 | 
420 | test_error = np.mean([l[0] for l in test_error_l])
421 | print "Test error:", test_error
422 | 
423 | # Test
424 | # last hidden layer representations.
425 | with open(fold_exp+"/last_hidden_rep_test.pkl", "w") as fhr:
426 |     stuff_hrep_tst = None
427 |     for k in test_error_l:  # each entry: (error, prediction, last-hidden rep.) of one minibatch.
428 |         if stuff_hrep_tst is None:
429 |             stuff_hrep_tst = k[2]
430 |         else:
431 |             stuff_hrep_tst = np.vstack((stuff_hrep_tst, k[2]))
432 | 
433 |     stuff_hrep_tr = None
434 |     for k in train_error_l:
435 |         if stuff_hrep_tr is None:
436 |             stuff_hrep_tr = k[2]
437 |         else:
438 |             stuff_hrep_tr = np.vstack((stuff_hrep_tr, k[2]))
439 |     pkl.dump(
440 |         {"x_hint_repr_tst": stuff_hrep_tst, "y_tst": testy,
441 |          "ximg_tst": testx.reshape((testx.shape[0], 28*28)),
442 |          "x_hint_repr_tr": stuff_hrep_tr, "y_tr": trainy,
443 |          "ximg_tr": trainx.reshape((trainx.shape[0], 28*28))},
444 |         fhr)
445 | # plot t-SNE of the original images
446 | tx0 = DT.datetime.now()
447 | tsne_original = manifold.TSNE(n_components=2, init='pca', random_state=0)
448 | X_tsne_original = tsne_original.fit_transform(
449 |     testx.reshape((testx.shape[0], 28*28)))
450 | fig_tsne_org = plot_representations(
451 |     X_tsne_original, testy, "t-SNE embedding of mnist original images.")
452 | fig_tsne_org.savefig(fold_exp+"/original_rep_test.eps", format='eps',
453 |                      dpi=1200, bbox_inches='tight')
454 | print "t-SNE of original images took:", DT.datetime.now() - tx0
455 | # plot t-SNE of the last hidden representation
456 | tx0 = DT.datetime.now()
457 | tsne_lasthidden_rep = manifold.TSNE(n_components=2, init='pca',
458 |                                     random_state=0)
459 | X_tsne_lhrep = tsne_lasthidden_rep.fit_transform(stuff_hrep_tst)
460 | fig_tsne_lhrep = plot_representations(
461 |     X_tsne_lhrep, testy,
462 |     "t-SNE embedding of the last hidden representation of the LeNet " +
463 |     "applied over mnist.")
464 | fig_tsne_lhrep.savefig(fold_exp+"/lasth_rep_mlp_test.eps", format='eps',
465 |                        dpi=1200, bbox_inches='tight')
466 | print "t-SNE of hidden representation took:", DT.datetime.now() - tx0
467 | # save min valid
468 | vl_pathfile = "exps/" + "LeNet_run_" + str(run) + "_sup_" + str(nbr_sup) +\
469 |     "_" + h_exp + "_c_l_" + str(corrupt_input_l) + "_start_at_" +\
470 |     str(start_corrupting) + "_debug_" + str(debug_code) +\
471 |     "_use_sparse_" + str(use_sparsity) + "_use_spar_pred_" +\
472 |     str(use_sparsity_in_pred) + "_" + time_exp + ".txt"
473 | with open(vl_pathfile, 'w') as f:
474 |     f.write("Exp. folder: " + fold_exp + "\n")
475 |     f.write(
476 |         "valid error:" + str(
477 |             np.min(train_stats["vl_error_mn"]) * 100.) + " % \n")
478 |     f.write("Test error:" + str(test_error * 100.) + " % \n")
479 | shutil.copy(vl_pathfile, fold_exp)
480 | 
--------------------------------------------------------------------------------
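Both training scripts above dump their main artifacts as pickle files inside the experiment folder they create under `exps/`, notably `train_stats.pkl` (training/validation statistics) and `last_hidden_rep_test.pkl` (last-hidden-layer representations). The snippet below is a minimal sketch, not part of the repository, showing how those files can be reloaded for inspection; the folder name is a hypothetical placeholder, and the keys are the ones written by the scripts above.

```python
# Minimal sketch (not part of the repository): reload artifacts dumped by the
# training scripts. Replace the placeholder with the "exps/..." folder created
# by your own run.
import cPickle as pkl
import numpy as np

fold_exp = "exps/REPLACE_WITH_YOUR_EXPERIMENT_FOLDER"  # hypothetical placeholder

# Per-minibatch/epoch statistics collected during training.
with open(fold_exp + "/train_stats.pkl") as f:
    train_stats = pkl.load(f)
print "best valid error:", np.min(train_stats["vl_error_mn"]) * 100., "%"

# Last-hidden-layer representations of the test and train sets.
with open(fold_exp + "/last_hidden_rep_test.pkl") as f:
    rep = pkl.load(f)
print "test representation shape:", rep["x_hint_repr_tst"].shape
print "train representation shape:", rep["x_hint_repr_tr"].shape
```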