├── README.md ├── LICENSE ├── mnist_small_validation.py ├── proper_batch_norm.py ├── l2hinge_classifier.py ├── incremental_annealer.py ├── mnist_highwaylrdiagdropoutbnl2hl.py ├── highwaylrdropoutbn_layer.py ├── models │   ├── highwaylrdiagdropoutbnl2hl-l2reg3e-3-n1024-d256-T5-p02-03-03-05-b20-a3e-3-autoannealing.log │   └── highwaylrdropoutbnl2hl-l2reg3e-3-n1024-d256-T5-p02-03-03-05-b20-a3e-3-autoannealing.log └── highwaylrdiagdropoutbn_layer.py /README.md: -------------------------------------------------------------------------------- 1 | # lowrank-highwaynetwork 2 | Low-rank highway networks 3 | 4 | Highway Networks with low-rank matrix factorization 5 | 6 | Paper: http://arxiv.org/abs/1603.03116 7 | 8 | A reparametrization of Highway Networks (Srivastava et al. 2015, http://arxiv.org/abs/1505.00387) in which the hidden layer matrices are constrained to be low-rank. This reduces the parameter count, and with it memory usage and the amount of training data needed. 9 | 10 | This code is based on and depends on Deepy (Shu 2015). To run it, download Deepy from https://github.com/zomux/deepy and copy these files into the deepy/experiments/highway_networks/ directory. 11 | 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 uaca 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | 23 | -------------------------------------------------------------------------------- /mnist_small_validation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | import logging 6 | import tempfile 7 | import os 8 | import gzip 9 | import urllib 10 | import cPickle 11 | 12 | from deepy.dataset import Dataset 13 | import numpy as np 14 | 15 | logging = logging.getLogger(__name__) 16 | 17 | MNIST_URL = "http://deeplearning.net/data/mnist/mnist.pkl.gz" 18 | 19 | class MnistDatasetSmallValid(Dataset): 20 | 21 | def __init__(self): 22 | self._target_size = 10 23 | logging.info("loading mnist data") 24 | path = os.path.join(tempfile.gettempdir(), "mnist.pkl.gz") 25 | if not os.path.exists(path): 26 | logging.info("downloading mnist data") 27 | urllib.urlretrieve(MNIST_URL, path) 28 | self._train_set, self._valid_set, self._test_set = cPickle.load(gzip.open(path, 'rb')) 29 | 30 | # Move all but the last 1000 validation examples into the training set 31 | train_set_x = np.vstack((self._train_set[0], self._valid_set[0][:-1000])) 32 | train_set_y = np.hstack((self._train_set[1], self._valid_set[1][:-1000])) 33 | valid_set_x = self._valid_set[0][-1000:] 34 | valid_set_y = self._valid_set[1][-1000:] 35 | self._train_set = (train_set_x, train_set_y) 36 | self._valid_set = (valid_set_x, valid_set_y) 37 | 38 | logging.info("[mnist small validation] training data size: %d" % len(self._train_set[0])) 39 | logging.info("[mnist small validation] valid data size: %d" % len(self._valid_set[0])) 40 | logging.info("[mnist small validation] test data size: %d" % len(self._test_set[0])) 41 | 42 | def train_set(self): 43 | data, target = self._train_set 44 | return zip(data, np.array(target)) 45 | 46 | def valid_set(self): 47 | data, target = self._valid_set 48 | return zip(data, np.array(target)) 49 | 50 | def test_set(self): 51 | data, target = self._test_set 52 | return zip(data, np.array(target)) 53 | -------------------------------------------------------------------------------- /proper_batch_norm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # by Antonio Valerio Miceli Barone 5 | 6 | import numpy as np 7 | import theano.tensor as T 8 | 9 | from deepy.layers import NeuralLayer 10 | from deepy.utils import FLOATX 11 | 12 | class BatchNormalization(NeuralLayer): 13 | """ 14 | Batch normalization.
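Normalizes each feature to zero mean and unit variance over the current mini-batch, then applies an optional learned scale S and bias B. Running estimates of the per-feature mean and standard deviation are maintained as exponential moving averages (rate tau) during training and used in place of the batch statistics at test time.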
15 | http://arxiv.org/pdf/1502.03167v3.pdf 16 | """ 17 | def __init__(self, with_scale=True, with_bias=True, epsilon=1e-4, tau=0.1): 18 | super(BatchNormalization, self).__init__("norm") 19 | self.with_scale = with_scale 20 | self.with_bias = with_bias 21 | self.epsilon = epsilon 22 | self.tau = tau 23 | 24 | def setup(self): 25 | if self.with_scale: 26 | self.S = self.create_vector(self.input_dim, "S") 27 | self.S.set_value(np.ones(self.input_dim, dtype=FLOATX)) 28 | self.register_parameters(self.S) 29 | if self.with_bias: 30 | self.B = self.create_bias(self.input_dim, "B") 31 | self.register_parameters(self.B) 32 | self.Mean = self.create_vector(self.input_dim, "Mean") 33 | self.Std = self.create_vector(self.input_dim, "Std") 34 | self.Std.set_value(np.ones(self.input_dim, dtype=FLOATX)) # start the running std at one, as in the highway layers below 35 | self.register_free_parameters(self.Mean, self.Std) # keep the running statistics with the model, mirroring the highway layers 36 | 37 | def output(self, x): 38 | x_mean = T.mean(x, axis=0) # statistics of the current mini-batch 39 | x_std = T.std(x, axis=0) 40 | rv = (x - x_mean) / (x_std + self.epsilon) 41 | if self.with_scale: 42 | rv = rv * self.S 43 | if self.with_bias: 44 | rv = rv + self.B 45 | 46 | new_mean = self.tau * x_mean + (1.0 - self.tau) * self.Mean # exponential moving averages for test time 47 | new_std = self.tau * x_std + (1.0 - self.tau) * self.Std 48 | self.register_training_updates((self.Mean, new_mean), 49 | (self.Std, new_std)) 50 | 51 | return rv 52 | 53 | def test_output(self, x): 54 | rv = (x - self.Mean) / (self.Std + self.epsilon) 55 | if self.with_scale: 56 | rv = rv * self.S 57 | if self.with_bias: 58 | rv = rv + self.B 59 | return rv 60 | -------------------------------------------------------------------------------- /l2hinge_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # by Antonio Valerio Miceli Barone 5 | 6 | from deepy.networks import NeuralNetwork 7 | from deepy.utils import FLOATX, EPSILON, CrossEntropyCost 8 | import theano.tensor as T 9 | 10 | class L2HingeNeuralClassifier(NeuralNetwork): 11 | """ 12 | Classifier network with squared hinge loss.
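The integer target k is recoded as a one-hot vector rescaled to {-1, +1}, and the loss is the mean of max(0, 1 - y * k)^2 over classes and examples; the error monitor reports the percentage of examples whose argmax output differs from k. Optionally, an L2 penalty on the last layer's weights is added to the training cost.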
13 | http://arxiv.org/abs/1306.0239 14 | """ 15 | 16 | def __init__(self, input_dim, config=None, input_tensor=None, last_layer_l2_regularization = 0.0): 17 | super(L2HingeNeuralClassifier, self).__init__(input_dim, config=config, input_tensor=input_tensor) 18 | self.last_layer_l2_regularization = last_layer_l2_regularization 19 | 20 | def setup_variables(self): 21 | super(L2HingeNeuralClassifier, self).setup_variables() 22 | 23 | self.k = T.ivector('k') 24 | self.target_variables.append(self.k) 25 | 26 | def _cost_func(self, y): 27 | #y = T.clip(y, EPSILON, 1.0 - EPSILON) 28 | #return CrossEntropyCost(y, self.k).get() 29 | 30 | k_onehot = T.eye(y.shape[1])[self.k] # one-hot encode the integer targets 31 | k_centered = 2.0 * k_onehot - 1.0 # recode {0, 1} to {-1, +1} 32 | loss = T.mean(T.sqr(T.maximum(0.0, 1.0 - y*k_centered))) 33 | return loss 34 | 35 | def _error_func(self, y): 36 | return 100 * T.mean(T.neq(T.argmax(y, axis=1), self.k)) 37 | 38 | @property 39 | def cost(self): 40 | last_layer_W = self.layers[-1].W 41 | regularization_cost = 0.0 if (self.last_layer_l2_regularization == 0.0) else self.last_layer_l2_regularization * last_layer_W.norm(2) / last_layer_W.size # penalizes the (unsquared) L2 norm, normalized by the matrix size 42 | return self._cost_func(self.output) + regularization_cost 43 | 44 | @property 45 | def test_cost(self): 46 | return self._cost_func(self.test_output) 47 | 48 | def prepare_training(self): 49 | self.training_monitors.append(("err", self._error_func(self.output))) 50 | self.testing_monitors.append(("err", self._error_func(self.test_output))) 51 | super(L2HingeNeuralClassifier, self).prepare_training() 52 | 53 | def predict(self, x): 54 | return self.compute(x).argmax(axis=1) 55 | 56 | -------------------------------------------------------------------------------- /incremental_annealer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import theano 5 | import numpy as np 6 | from deepy.utils import FLOATX 7 | 8 | import logging as loggers 9 | logging = loggers.getLogger(__name__) 10 | 11 | class LearningRateAnnealer(object): 12 | """ 13 | Learning rate annealer. 14 | """ 15 | 16 | def __init__(self, trainer, patience=5, anneal_times=4): 17 | """ 18 | :type trainer: deepy.trainers.trainers.NeuralTrainer 19 | """ 20 | self._trainer = trainer 21 | self._iter = -1 22 | self._annealed_iter = -1 23 | self._patience = patience 24 | self._anneal_times = anneal_times 25 | self._annealed_times = 0 26 | self._learning_rate = self._trainer.config.learning_rate 27 | if isinstance(self._learning_rate, float): 28 | raise Exception("use LearningRateAnnealer.learning_rate to wrap the value in the config.") 29 | 30 | def invoke(self): 31 | """ 32 | Run one annealing step; return True when training should stop. 33 | """ 34 | self._iter += 1 35 | if self._iter - max(self._trainer.best_iter, self._annealed_iter) >= self._patience: 36 | if self._annealed_times >= self._anneal_times: 37 | logging.info("learning rate annealed %d times, ending training" % self._annealed_times) 38 | return True 39 | else: 40 | self._trainer.set_params(*self._trainer.best_params) 41 | self._learning_rate.set_value(self._learning_rate.get_value() * 0.5) 42 | self._annealed_times += 1 43 | self._annealed_iter = self._iter 44 | logging.info("annealed learning rate to %f" % self._learning_rate.get_value()) 45 | return False 46 | 47 | @staticmethod 48 | def learning_rate(value=0.01): 49 | """ 50 | Wrap learning rate.
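Returns a Theano shared variable holding the value, so that an annealer can decay the learning rate in place while training runs.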
51 | """ 52 | return theano.shared(np.array(value, dtype=FLOATX), name="learning_rate") 53 | 54 | 55 | class IncrementalLearningRateAnnealer(object): 56 | 57 | def __init__(self, trainer, iter_start_annealing=10, annealing_rate=0.9): 58 | self.iter_start_annealing = iter_start_annealing 59 | self.annealing_rate = np.array(annealing_rate, dtype=FLOATX) 60 | self._trainer = trainer 61 | self._learning_rate = self._trainer.config.learning_rate 62 | self._iter = -1 63 | if type(self._learning_rate) == float: 64 | raise Exception("use LearningRateAnnealer.learning_rate to wrap the value in the config.") 65 | 66 | 67 | def invoke(self): 68 | self._iter += 1 69 | if self._iter >= self.iter_start_annealing: 70 | self._learning_rate.set_value(self._learning_rate.get_value() * self.annealing_rate) 71 | logging.info("setting learning rate to %f" % self._learning_rate.get_value()) 72 | return False 73 | -------------------------------------------------------------------------------- /mnist_highwaylrdiagdropoutbnl2hl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | . 6 | """ 7 | 8 | import logging, os 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | from deepy.dataset import MnistDataset, MiniBatches 12 | #from deepy.networks import NeuralClassifier 13 | from deepy.layers import Dense, Softmax, Dropout, Activation 14 | from deepy.trainers import MomentumTrainer, LearningRateAnnealer, ScheduledLearningRateAnnealer, AdamTrainer 15 | from deepy.conf import TrainerConfig 16 | from deepy.utils import XavierGlorotInitializer, OrthogonalInitializer 17 | from highwaylrdropoutbn_layer import HighwayLayerLRDropoutBatchNorm 18 | from highwaylrdiagdropoutbn_layer import HighwayLayerLRDiagDropoutBatchNorm 19 | from proper_batch_norm import BatchNormalization 20 | from l2hinge_classifier import L2HingeNeuralClassifier 21 | 22 | import sys 23 | sys.path.append(os.path.dirname(__file__)) 24 | from mnist_small_validation import MnistDatasetSmallValid 25 | from incremental_annealer import IncrementalLearningRateAnnealer 26 | 27 | #model_path = os.path.join(os.path.dirname(__file__), "models", "highwaylrdropoutbnl2hl-l2reg1e-5-n1024-d256-T5-p02-03-03-05-b100-a3e-3-autoannealing.gz") 28 | model_path = os.path.join(os.path.dirname(__file__), "models", "highwaylrdiagdropoutbnl2hl-l2reg1e-5-n1024-d256-T5-p02-03-03-05-b100-a3e-3-autoannealing.gz") 29 | 30 | if __name__ == '__main__': 31 | dropout_p_0 = 0.2 32 | dropout_p_h_0 = 0.3 33 | dropout_p_h_1 = 0.3 34 | dropout_p_2 = 0.5 35 | T = 5 36 | n = 1024 37 | d = 256 38 | gate_bias = -1.0 39 | activation = 'relu' 40 | #l2_reg = 0.001 41 | l2_reg = 1e-5 42 | init = XavierGlorotInitializer() 43 | model = L2HingeNeuralClassifier(input_dim=28*28, last_layer_l2_regularization = l2_reg) 44 | model.stack(Dropout(p=dropout_p_0), Dense(n, init=init, disable_bias=True), BatchNormalization(), Activation(activation)) 45 | #model.stack(Dropout(p=dropout_p_0), BatchNormalization()) 46 | 47 | for _ in range(T): 48 | #model.stack(HighwayLayerLRDropoutBatchNorm(activation=activation, gate_bias=gate_bias, projection_dim=d, d_p_0 = dropout_p_h_0, d_p_1 = dropout_p_h_1, init=init)) 49 | model.stack(HighwayLayerLRDiagDropoutBatchNorm(activation=activation, gate_bias=gate_bias, projection_dim=d, d_p_0 = dropout_p_h_0, d_p_1 = dropout_p_h_1, init=init, quasi_ortho_init=True)) 50 | #model.stack(BatchNormalization(),Dropout(p=dropout_p_2), Dense(10, init=init)) 51 | 
model.stack(Dropout(p=dropout_p_2), Dense(10, init=init)) 52 | 53 | 54 | learning_rate_start = 3e-3 55 | #learning_rate_target = 3e-7 56 | #learning_rate_epochs = 100 57 | #learning_rate_decay = (learning_rate_target / learning_rate_start) ** (1.0 / learning_rate_epochs) 58 | conf = TrainerConfig() 59 | conf.learning_rate = LearningRateAnnealer.learning_rate(learning_rate_start) 60 | #conf.gradient_clipping = 1 61 | conf.patience = 20 62 | #conf.gradient_tolerance = 5 63 | conf.avoid_nan = True 64 | conf.min_improvement = 1e-10 65 | 66 | #trainer = MomentumTrainer(model) 67 | trainer = AdamTrainer(model, conf) 68 | 69 | mnist = MiniBatches(MnistDataset(), batch_size=100) 70 | #mnist = MiniBatches(MnistDatasetSmallValid(), batch_size=100) 71 | 72 | #trainer.run(mnist, controllers=[IncrementalLearningRateAnnealer(trainer, 0, learning_rate_decay)]) 73 | trainer.run(mnist, controllers=[LearningRateAnnealer(trainer, 3, 14)]) 74 | logging.info('Setting best parameters for testing.') 75 | trainer.set_params(*trainer.best_params) 76 | trainer._run_test(-1, mnist.test_set()) 77 | 78 | model.save_params(model_path) 79 | -------------------------------------------------------------------------------- /highwaylrdropoutbn_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # by Antonio Valerio Miceli Barone 5 | 6 | import numpy as np 7 | from deepy.layers import NeuralLayer 8 | from deepy.utils import build_activation, global_theano_rand, FLOATX 9 | import theano.tensor as T 10 | 11 | class HighwayLayerLRDropoutBatchNorm(NeuralLayer): 12 | """ 13 | Low-rank Highway network layer with internal dropout and batch normalization. 14 | 15 | Extends http://arxiv.org/abs/1505.00387. 
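Each of the candidate transform h and the gate t replaces its input_dim x input_dim weight matrix with a rank-projection_dim factorization W = W_l W_r; dropout is applied to the layer input and to the projected representation, and each matrix product is batch-normalized before the scale S and bias B are applied. The output is the usual highway mixture y = h * t + x * (1 - t).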
16 | """ 17 | 18 | def __init__(self, activation='relu', init=None, gate_bias=-5, projection_dim=10, d_p_0 = 0.3, d_p_1 = 0.3, epsilon=1e-4, tau=0.1): 19 | super(HighwayLayerLRDropoutBatchNorm, self).__init__("highwayLRDropoutBatchNorm") 20 | self.activation = activation 21 | self.init = init 22 | self.gate_bias = gate_bias 23 | self.projection_dim = projection_dim 24 | self.d_p_0 = d_p_0 25 | self.d_p_1 = d_p_1 26 | self.epsilon = epsilon 27 | self.tau = tau 28 | 29 | def setup(self): 30 | self.output_dim = self.input_dim 31 | self._act = build_activation(self.activation) 32 | self.W_hl = self.create_weight(self.input_dim, self.projection_dim, "hl", initializer=self.init) 33 | self.W_tl = self.create_weight(self.input_dim, self.projection_dim, "tl", initializer=self.init) 34 | self.W_hr = self.create_weight(self.projection_dim, self.input_dim, "hr", initializer=self.init) 35 | self.W_tr = self.create_weight(self.projection_dim, self.input_dim, "tr", initializer=self.init) 36 | self.B_h = self.create_bias(self.input_dim, "h") 37 | self.B_t = self.create_bias(self.input_dim, "t", value=self.gate_bias) 38 | 39 | self.S_h = self.create_vector(self.input_dim, "S_h") 40 | self.S_t = self.create_vector(self.input_dim, "S_t") 41 | self.S_h.set_value(np.ones(self.input_dim, dtype=FLOATX)) 42 | self.S_t.set_value(np.ones(self.input_dim, dtype=FLOATX)) 43 | 44 | self.register_parameters(self.W_hl, self.B_h, self.W_tl, self.B_t, self.W_hr, self.W_tr, self.S_h, self.S_t) 45 | 46 | self.Mean_hl = self.create_vector(self.projection_dim, "Mean_hl") 47 | self.Mean_tl = self.create_vector(self.projection_dim, "Mean_tl") 48 | self.Mean_hr = self.create_vector(self.input_dim, "Mean_hr") 49 | self.Mean_tr = self.create_vector(self.input_dim, "Mean_tr") 50 | self.Std_hl = self.create_vector(self.projection_dim, "Std_hl") 51 | self.Std_tl = self.create_vector(self.projection_dim, "Std_tl") 52 | self.Std_hr = self.create_vector(self.input_dim, "Std_hr") 53 | self.Std_tr = self.create_vector(self.input_dim, "Std_tr") 54 | self.Std_hl.set_value(np.ones(self.projection_dim, dtype=FLOATX)) 55 | self.Std_tl.set_value(np.ones(self.projection_dim, dtype=FLOATX)) 56 | self.Std_hr.set_value(np.ones(self.input_dim, dtype=FLOATX)) 57 | self.Std_tr.set_value(np.ones(self.input_dim, dtype=FLOATX)) 58 | 59 | self.register_free_parameters(self.Mean_hl, self.Mean_tl, self.Mean_hr, self.Mean_tr, self.Std_hl, self.Std_tl, self.Std_hr, self.Std_tr) 60 | 61 | def output(self, x): 62 | d_0 = global_theano_rand.binomial(x.shape, p=1-self.d_p_0, dtype=FLOATX) 63 | d_1 = global_theano_rand.binomial((x.shape[0], self.projection_dim), p=1-self.d_p_1, dtype=FLOATX) 64 | 65 | tl_raw = T.dot(x * d_0, self.W_tl) 66 | hl_raw = T.dot(x * d_0, self.W_hl) 67 | tl_mean = T.mean(tl_raw, axis=0) 68 | hl_mean = T.mean(hl_raw, axis=0) 69 | tl_std = T.std(tl_raw, axis=0) 70 | hl_std = T.std(hl_raw, axis=0) 71 | tl = (tl_raw - tl_mean) / (tl_std + self.epsilon) 72 | hl = (hl_raw - hl_mean) / (hl_std + self.epsilon) 73 | new_Mean_tl = self.tau * tl_mean + (1.0 - self.tau) * self.Mean_tl 74 | new_Mean_hl = self.tau * hl_mean + (1.0 - self.tau) * self.Mean_hl 75 | new_Std_tl = self.tau * tl_std + (1.0 - self.tau) * self.Std_tl 76 | new_Std_hl = self.tau * hl_std + (1.0 - self.tau) * self.Std_hl 77 | 78 | tr_raw = (tl * d_1).dot(self.W_tr) 79 | hr_raw = (hl * d_1).dot(self.W_hr) 80 | tr_mean = T.mean(tr_raw, axis=0) 81 | hr_mean = T.mean(hr_raw, axis=0) 82 | tr_std = T.std(tr_raw, axis=0) 83 | hr_std = T.std(hr_raw, axis=0) 84 | tr = (tr_raw - tr_mean) / 
(tr_std + self.epsilon) 85 | hr = (hr_raw - hr_mean) / (hr_std + self.epsilon) 86 | new_Mean_tr = self.tau * tr_mean + (1.0 - self.tau) * self.Mean_tr 87 | new_Mean_hr = self.tau * hr_mean + (1.0 - self.tau) * self.Mean_hr 88 | new_Std_tr = self.tau * tr_std + (1.0 - self.tau) * self.Std_tr 89 | new_Std_hr = self.tau * hr_std + (1.0 - self.tau) * self.Std_hr 90 | 91 | t = T.nnet.sigmoid(tr * self.S_t + self.B_t) 92 | h = self._act(hr * self.S_h + self.B_h) 93 | rv = h * t + x * (1 - t) 94 | 95 | self.register_training_updates((self.Mean_tl, new_Mean_tl), 96 | (self.Mean_hl, new_Mean_hl), 97 | (self.Mean_tr, new_Mean_tr), 98 | (self.Mean_hr, new_Mean_hr), 99 | (self.Std_tl, new_Std_tl), 100 | (self.Std_hl, new_Std_hl), 101 | (self.Std_tr, new_Std_tr), 102 | (self.Std_hr, new_Std_hr)) 103 | 104 | return rv 105 | 106 | def test_output(self, x): 107 | d_0 = 1.0 - self.d_p_0 108 | d_1 = 1.0 - self.d_p_1 109 | 110 | tl_raw = T.dot(x * d_0, self.W_tl) 111 | hl_raw = T.dot(x * d_0, self.W_hl) 112 | tl = (tl_raw - self.Mean_tl) / (self.Std_tl + self.epsilon) 113 | hl = (hl_raw - self.Mean_hl) / (self.Std_hl + self.epsilon) 114 | 115 | tr_raw = (tl * d_1).dot(self.W_tr) 116 | hr_raw = (hl * d_1).dot(self.W_hr) 117 | tr = (tr_raw - self.Mean_tr) / (self.Std_tr + self.epsilon) 118 | hr = (hr_raw - self.Mean_hr) / (self.Std_hr + self.epsilon) 119 | 120 | t = T.nnet.sigmoid(tr * self.S_t + self.B_t) 121 | h = self._act(hr * self.S_h + self.B_h) 122 | rv = h * t + x * (1 - t) 123 | 124 | return rv 125 | 126 | -------------------------------------------------------------------------------- /models/highwaylrdiagdropoutbnl2hl-l2reg3e-3-n1024-d256-T5-p02-03-03-05-b20-a3e-3-autoannealing.log: -------------------------------------------------------------------------------- 1 | # deepy version: 0.1.7 2 | [2016/03/09 11:46:43] test (iter=1) J=395314171084.80 err=88.18 3 | [2016/03/09 11:46:48] valid (iter=1) J=401897281781.76 err=88.36 * 4 | [2016/03/09 11:50:23] monitor (iter=1) J=0.06 err=9.00 5 | [2016/03/09 11:50:30] valid (iter=2) J=0.03 err=3.81 * 6 | [2016/03/09 11:54:07] monitor (iter=2) J=0.06 err=6.00 7 | [2016/03/09 11:54:13] valid (iter=3) J=0.02 err=2.81 * 8 | [2016/03/09 11:57:55] monitor (iter=3) J=0.02 err=2.00 9 | [2016/03/09 11:58:04] valid (iter=4) J=0.02 err=2.51 * 10 | [2016/03/09 12:02:13] monitor (iter=4) J=0.02 err=3.00 11 | [2016/03/09 12:02:20] valid (iter=5) J=0.02 err=2.33 * 12 | [2016/03/09 12:05:52] monitor (iter=5) J=0.03 err=3.00 13 | [2016/03/09 12:05:59] valid (iter=6) J=0.02 err=2.20 * 14 | [2016/03/09 12:09:34] monitor (iter=6) J=0.02 err=2.00 15 | [2016/03/09 12:09:41] valid (iter=7) J=0.02 err=2.17 * 16 | [2016/03/09 12:13:14] monitor (iter=7) J=0.03 err=4.00 17 | [2016/03/09 12:13:21] valid (iter=8) J=0.02 err=1.97 * 18 | [2016/03/09 12:16:56] monitor (iter=8) J=0.02 err=3.00 19 | [2016/03/09 12:17:02] valid (iter=9) J=0.02 err=2.02 20 | [2016/03/09 12:20:43] monitor (iter=9) J=0.01 err=1.00 21 | [2016/03/09 12:20:53] valid (iter=10) J=0.02 err=1.70 * 22 | [2016/03/09 12:25:04] monitor (iter=10) J=0.01 err=2.00 23 | [2016/03/09 12:25:11] test (iter=11) J=0.01 err=1.78 24 | [2016/03/09 12:25:19] valid (iter=11) J=0.01 err=1.84 * 25 | [2016/03/09 12:28:49] monitor (iter=11) J=0.01 err=0.00 26 | [2016/03/09 12:28:56] valid (iter=12) J=0.02 err=1.74 27 | [2016/03/09 12:32:31] monitor (iter=12) J=0.02 err=3.00 28 | [2016/03/09 12:32:38] valid (iter=13) J=0.02 err=1.67 29 | [2016/03/09 12:36:12] monitor (iter=13) J=0.00 err=0.00 30 | [2016/03/09 12:36:18] valid (iter=14) J=0.02 
err=1.64 31 | [2016/03/09 12:39:54] monitor (iter=14) J=0.01 err=1.00 32 | [2016/03/09 12:40:02] valid (iter=15) J=0.02 err=2.16 33 | [2016/03/09 12:44:02] monitor (iter=15) J=0.01 err=1.00 34 | [2016/03/09 12:44:12] valid (iter=16) J=0.01 err=1.55 * 35 | [2016/03/09 12:48:05] monitor (iter=16) J=0.01 err=1.00 36 | [2016/03/09 12:48:12] valid (iter=17) J=0.01 err=1.37 * 37 | [2016/03/09 12:51:40] monitor (iter=17) J=0.03 err=4.00 38 | [2016/03/09 12:51:47] valid (iter=18) J=0.01 err=1.60 39 | [2016/03/09 12:55:19] monitor (iter=18) J=0.00 err=0.00 40 | [2016/03/09 12:55:26] valid (iter=19) J=0.02 err=1.51 41 | [2016/03/09 12:58:58] monitor (iter=19) J=0.00 err=1.00 42 | [2016/03/09 12:59:05] valid (iter=20) J=0.01 err=1.55 43 | [2016/03/09 13:02:50] monitor (iter=20) J=0.00 err=1.00 44 | [2016/03/09 13:02:58] test (iter=21) J=0.01 err=1.43 45 | [2016/03/09 13:03:05] valid (iter=21) J=0.01 err=1.66 46 | [2016/03/09 13:07:04] monitor (iter=21) J=0.00 err=0.00 47 | [2016/03/09 13:07:12] valid (iter=22) J=0.01 err=1.53 48 | [2016/03/09 13:10:57] monitor (iter=22) J=0.01 err=3.00 49 | [2016/03/09 13:11:04] valid (iter=23) J=0.01 err=1.31 * 50 | [2016/03/09 13:14:32] monitor (iter=23) J=0.00 err=0.00 51 | [2016/03/09 13:14:39] valid (iter=24) J=0.01 err=1.34 52 | [2016/03/09 13:18:14] monitor (iter=24) J=0.01 err=2.00 53 | [2016/03/09 13:18:21] valid (iter=25) J=0.01 err=1.31 54 | [2016/03/09 13:21:53] monitor (iter=25) J=0.00 err=1.00 55 | [2016/03/09 13:21:59] valid (iter=26) J=0.01 err=1.29 56 | [2016/03/09 13:25:30] monitor (iter=26) J=0.01 err=2.00 57 | [2016/03/09 13:25:37] valid (iter=27) J=0.01 err=1.39 * 58 | [2016/03/09 13:29:41] monitor (iter=27) J=0.00 err=0.00 59 | [2016/03/09 13:29:50] valid (iter=28) J=0.01 err=1.33 60 | [2016/03/09 13:33:34] monitor (iter=28) J=0.00 err=0.00 61 | [2016/03/09 13:33:40] valid (iter=29) J=0.01 err=1.31 62 | [2016/03/09 13:37:10] monitor (iter=29) J=0.00 err=1.00 63 | [2016/03/09 13:37:16] valid (iter=30) J=0.01 err=1.35 64 | [2016/03/09 13:40:50] monitor (iter=30) J=0.01 err=1.00 65 | [2016/03/09 13:40:58] test (iter=31) J=0.01 err=1.38 66 | [2016/03/09 13:41:04] valid (iter=31) J=0.01 err=1.32 * 67 | [2016/03/09 13:44:36] monitor (iter=31) J=0.01 err=2.00 68 | [2016/03/09 13:44:43] valid (iter=32) J=0.01 err=1.29 69 | [2016/03/09 13:48:12] monitor (iter=32) J=0.00 err=1.00 70 | [2016/03/09 13:48:19] valid (iter=33) J=0.01 err=1.36 71 | [2016/03/09 13:52:24] monitor (iter=33) J=0.00 err=0.00 72 | [2016/03/09 13:52:32] valid (iter=34) J=0.01 err=1.35 73 | [2016/03/09 13:56:05] monitor (iter=34) J=0.00 err=1.00 74 | [2016/03/09 13:56:12] valid (iter=35) J=0.01 err=1.31 * 75 | [2016/03/09 13:59:43] monitor (iter=35) J=0.01 err=1.00 76 | [2016/03/09 13:59:50] valid (iter=36) J=0.01 err=1.37 77 | [2016/03/09 14:03:30] monitor (iter=36) J=0.00 err=0.00 78 | [2016/03/09 14:03:37] valid (iter=37) J=0.01 err=1.35 79 | [2016/03/09 14:07:25] monitor (iter=37) J=0.00 err=0.00 80 | [2016/03/09 14:07:40] valid (iter=38) J=0.01 err=1.33 81 | [2016/03/09 14:13:49] monitor (iter=38) J=0.01 err=1.00 82 | [2016/03/09 14:13:59] valid (iter=39) J=0.01 err=1.34 83 | [2016/03/09 14:19:36] monitor (iter=39) J=0.00 err=1.00 84 | [2016/03/09 14:19:45] valid (iter=40) J=0.01 err=1.36 85 | [2016/03/09 14:25:44] monitor (iter=40) J=0.00 err=0.00 86 | [2016/03/09 14:25:56] test (iter=41) J=0.01 err=1.33 87 | [2016/03/09 14:26:07] valid (iter=41) J=0.01 err=1.36 88 | [2016/03/09 14:31:34] monitor (iter=41) J=0.00 err=0.00 89 | [2016/03/09 14:31:46] valid (iter=42) J=0.01 err=1.29 
90 | [2016/03/09 14:37:30] monitor (iter=42) J=0.01 err=1.00 91 | [2016/03/09 14:37:40] valid (iter=43) J=0.01 err=1.32 92 | [2016/03/09 14:43:21] monitor (iter=43) J=0.00 err=0.00 93 | [2016/03/09 14:43:32] valid (iter=44) J=0.01 err=1.37 94 | [2016/03/09 14:49:53] monitor (iter=44) J=0.00 err=1.00 95 | [2016/03/09 14:50:04] valid (iter=45) J=0.01 err=1.27 96 | [2016/03/09 14:55:42] monitor (iter=45) J=0.01 err=1.00 97 | [2016/03/09 14:55:54] valid (iter=46) J=0.01 err=1.28 98 | [2016/03/09 15:01:49] monitor (iter=46) J=0.00 err=0.00 99 | [2016/03/09 15:01:58] valid (iter=47) J=0.01 err=1.31 100 | [2016/03/09 15:08:42] monitor (iter=47) J=0.00 err=0.00 101 | [2016/03/09 15:08:53] valid (iter=48) J=0.01 err=1.28 102 | [2016/03/09 15:14:27] monitor (iter=48) J=0.00 err=0.00 103 | [2016/03/09 15:14:38] valid (iter=49) J=0.01 err=1.32 104 | [2016/03/09 15:20:29] monitor (iter=49) J=0.01 err=1.00 105 | [2016/03/09 15:20:40] valid (iter=50) J=0.01 err=1.31 106 | [2016/03/09 15:27:00] monitor (iter=50) J=0.01 err=1.00 107 | [2016/03/09 15:27:11] test (iter=51) J=0.01 err=1.36 108 | [2016/03/09 15:27:23] valid (iter=51) J=0.01 err=1.31 109 | [2016/03/09 15:33:01] monitor (iter=51) J=0.01 err=3.00 110 | [2016/03/09 15:33:12] valid (iter=52) J=0.01 err=1.28 111 | [2016/03/09 15:38:56] monitor (iter=52) J=0.01 err=2.00 112 | [2016/03/09 15:39:06] valid (iter=53) J=0.01 err=1.32 113 | [2016/03/09 15:44:29] monitor (iter=53) J=0.01 err=0.00 114 | [2016/03/09 15:44:39] valid (iter=54) J=0.01 err=1.31 115 | [2016/03/09 15:50:18] monitor (iter=54) J=0.00 err=0.00 116 | [2016/03/09 15:50:28] valid (iter=55) J=0.01 err=1.32 117 | [2016/03/09 15:50:39] test (iter=0) J=0.01 err=1.36 118 | [2016/03/09 15:50:48] test (iter=0) J=0.01 err=1.36 119 | -------------------------------------------------------------------------------- /models/highwaylrdropoutbnl2hl-l2reg3e-3-n1024-d256-T5-p02-03-03-05-b20-a3e-3-autoannealing.log: -------------------------------------------------------------------------------- 1 | # deepy version: 0.1.7 2 | [2016/03/09 14:07:16] test (iter=1) J=38152.50 err=89.80 3 | [2016/03/09 14:07:25] valid (iter=1) J=36126.05 err=89.35 * 4 | [2016/03/09 14:13:08] monitor (iter=1) J=0.07 err=12.00 5 | [2016/03/09 14:13:18] valid (iter=2) J=0.04 err=3.84 * 6 | [2016/03/09 14:18:41] monitor (iter=2) J=0.03 err=3.00 7 | [2016/03/09 14:18:50] valid (iter=3) J=0.02 err=2.92 * 8 | [2016/03/09 14:24:25] monitor (iter=3) J=0.02 err=4.00 9 | [2016/03/09 14:24:35] valid (iter=4) J=0.02 err=2.82 * 10 | [2016/03/09 14:30:00] monitor (iter=4) J=0.03 err=3.00 11 | [2016/03/09 14:30:09] valid (iter=5) J=0.02 err=2.25 * 12 | [2016/03/09 14:36:00] monitor (iter=5) J=0.04 err=3.00 13 | [2016/03/09 14:36:11] valid (iter=6) J=0.02 err=2.34 14 | [2016/03/09 14:41:36] monitor (iter=6) J=0.02 err=3.00 15 | [2016/03/09 14:41:47] valid (iter=7) J=0.02 err=2.32 * 16 | [2016/03/09 14:47:53] monitor (iter=7) J=0.03 err=3.00 17 | [2016/03/09 14:48:03] valid (iter=8) J=0.02 err=2.15 * 18 | [2016/03/09 14:53:25] monitor (iter=8) J=0.02 err=4.00 19 | [2016/03/09 14:53:35] valid (iter=9) J=0.02 err=2.12 20 | [2016/03/09 14:59:06] monitor (iter=9) J=0.02 err=4.00 21 | [2016/03/09 14:59:18] valid (iter=10) J=0.02 err=2.04 * 22 | [2016/03/09 15:05:09] monitor (iter=10) J=0.01 err=1.00 23 | [2016/03/09 15:05:18] test (iter=11) J=0.02 err=1.82 24 | [2016/03/09 15:05:28] valid (iter=11) J=0.02 err=1.81 25 | [2016/03/09 15:11:23] monitor (iter=11) J=0.01 err=1.00 26 | [2016/03/09 15:11:34] valid (iter=12) J=0.02 err=1.76 * 27 | 
[2016/03/09 15:17:10] monitor (iter=12) J=0.02 err=2.00 28 | [2016/03/09 15:17:21] valid (iter=13) J=0.02 err=1.99 29 | [2016/03/09 15:23:24] monitor (iter=13) J=0.01 err=0.00 30 | [2016/03/09 15:23:33] valid (iter=14) J=0.02 err=1.70 31 | [2016/03/09 15:29:02] monitor (iter=14) J=0.00 err=0.00 32 | [2016/03/09 15:29:11] valid (iter=15) J=0.02 err=1.71 33 | [2016/03/09 15:34:29] monitor (iter=15) J=0.01 err=0.00 34 | [2016/03/09 15:34:39] valid (iter=16) J=0.02 err=2.01 * 35 | [2016/03/09 15:40:08] monitor (iter=16) J=0.01 err=1.00 36 | [2016/03/09 15:40:17] valid (iter=17) J=0.01 err=1.58 * 37 | [2016/03/09 15:45:52] monitor (iter=17) J=0.02 err=4.00 38 | [2016/03/09 15:46:03] valid (iter=18) J=0.02 err=1.58 39 | [2016/03/09 15:51:03] monitor (iter=18) J=0.01 err=0.00 40 | [2016/03/09 15:51:09] valid (iter=19) J=0.02 err=1.52 41 | [2016/03/09 15:54:16] monitor (iter=19) J=0.01 err=1.00 42 | [2016/03/09 15:54:21] valid (iter=20) J=0.02 err=1.56 43 | [2016/03/09 15:57:20] monitor (iter=20) J=0.00 err=0.00 44 | [2016/03/09 15:57:26] test (iter=21) J=0.01 err=1.57 45 | [2016/03/09 15:57:31] valid (iter=21) J=0.01 err=1.66 46 | [2016/03/09 16:00:31] monitor (iter=21) J=0.00 err=0.00 47 | [2016/03/09 16:00:36] valid (iter=22) J=0.01 err=1.45 * 48 | [2016/03/09 16:03:36] monitor (iter=22) J=0.00 err=1.00 49 | [2016/03/09 16:03:41] valid (iter=23) J=0.01 err=1.30 * 50 | [2016/03/09 16:06:41] monitor (iter=23) J=0.00 err=0.00 51 | [2016/03/09 16:06:46] valid (iter=24) J=0.01 err=1.35 52 | [2016/03/09 16:09:46] monitor (iter=24) J=0.01 err=1.00 53 | [2016/03/09 16:09:51] valid (iter=25) J=0.02 err=1.46 54 | [2016/03/09 16:12:51] monitor (iter=25) J=0.01 err=1.00 55 | [2016/03/09 16:12:56] valid (iter=26) J=0.01 err=1.29 56 | [2016/03/09 16:15:56] monitor (iter=26) J=0.00 err=0.00 57 | [2016/03/09 16:16:01] valid (iter=27) J=0.01 err=1.39 * 58 | [2016/03/09 16:19:01] monitor (iter=27) J=0.00 err=0.00 59 | [2016/03/09 16:19:06] valid (iter=28) J=0.01 err=1.38 60 | [2016/03/09 16:22:06] monitor (iter=28) J=0.00 err=0.00 61 | [2016/03/09 16:22:11] valid (iter=29) J=0.01 err=1.26 62 | [2016/03/09 16:25:11] monitor (iter=29) J=0.00 err=1.00 63 | [2016/03/09 16:25:16] valid (iter=30) J=0.01 err=1.30 64 | [2016/03/09 16:28:15] monitor (iter=30) J=0.01 err=2.00 65 | [2016/03/09 16:28:20] test (iter=31) J=0.01 err=1.43 66 | [2016/03/09 16:28:25] valid (iter=31) J=0.01 err=1.38 * 67 | [2016/03/09 16:31:25] monitor (iter=31) J=0.00 err=1.00 68 | [2016/03/09 16:31:30] valid (iter=32) J=0.01 err=1.31 69 | [2016/03/09 16:34:30] monitor (iter=32) J=0.00 err=0.00 70 | [2016/03/09 16:34:35] valid (iter=33) J=0.01 err=1.31 71 | [2016/03/09 16:37:35] monitor (iter=33) J=0.00 err=0.00 72 | [2016/03/09 16:37:40] valid (iter=34) J=0.01 err=1.32 73 | [2016/03/09 16:40:40] monitor (iter=34) J=0.00 err=0.00 74 | [2016/03/09 16:40:45] valid (iter=35) J=0.01 err=1.33 75 | [2016/03/09 16:43:45] monitor (iter=35) J=0.00 err=0.00 76 | [2016/03/09 16:43:50] valid (iter=36) J=0.01 err=1.28 77 | [2016/03/09 16:46:50] monitor (iter=36) J=0.00 err=0.00 78 | [2016/03/09 16:46:55] valid (iter=37) J=0.01 err=1.32 79 | [2016/03/09 16:49:54] monitor (iter=37) J=0.00 err=0.00 80 | [2016/03/09 16:50:00] valid (iter=38) J=0.01 err=1.34 81 | [2016/03/09 16:53:00] monitor (iter=38) J=0.01 err=1.00 82 | [2016/03/09 16:53:05] valid (iter=39) J=0.01 err=1.30 * 83 | [2016/03/09 16:56:05] monitor (iter=39) J=0.00 err=0.00 84 | [2016/03/09 16:56:10] valid (iter=40) J=0.01 err=1.36 85 | [2016/03/09 16:59:10] monitor (iter=40) J=0.00 err=0.00 86 | 
[2016/03/09 16:59:15] test (iter=41) J=0.01 err=1.23 87 | [2016/03/09 16:59:20] valid (iter=41) J=0.01 err=1.33 88 | [2016/03/09 17:02:20] monitor (iter=41) J=0.00 err=0.00 89 | [2016/03/09 17:02:25] valid (iter=42) J=0.01 err=1.34 90 | [2016/03/09 17:05:25] monitor (iter=42) J=0.00 err=0.00 91 | [2016/03/09 17:05:30] valid (iter=43) J=0.01 err=1.31 92 | [2016/03/09 17:08:30] monitor (iter=43) J=0.00 err=1.00 93 | [2016/03/09 17:08:35] valid (iter=44) J=0.01 err=1.33 94 | [2016/03/09 17:11:35] monitor (iter=44) J=0.01 err=1.00 95 | [2016/03/09 17:11:40] valid (iter=45) J=0.01 err=1.32 96 | [2016/03/09 17:14:40] monitor (iter=45) J=0.01 err=1.00 97 | [2016/03/09 17:14:45] valid (iter=46) J=0.01 err=1.30 98 | [2016/03/09 17:17:45] monitor (iter=46) J=0.00 err=0.00 99 | [2016/03/09 17:17:50] valid (iter=47) J=0.01 err=1.32 100 | [2016/03/09 17:20:47] monitor (iter=47) J=0.00 err=0.00 101 | [2016/03/09 17:20:53] valid (iter=48) J=0.01 err=1.34 * 102 | [2016/03/09 17:23:48] monitor (iter=48) J=0.00 err=0.00 103 | [2016/03/09 17:23:53] valid (iter=49) J=0.01 err=1.33 104 | [2016/03/09 17:26:47] monitor (iter=49) J=0.00 err=0.00 105 | [2016/03/09 17:26:52] valid (iter=50) J=0.01 err=1.34 106 | [2016/03/09 17:29:46] monitor (iter=50) J=0.00 err=0.00 107 | [2016/03/09 17:29:51] test (iter=51) J=0.01 err=1.21 108 | [2016/03/09 17:29:56] valid (iter=51) J=0.01 err=1.30 109 | [2016/03/09 17:32:50] monitor (iter=51) J=0.00 err=0.00 110 | [2016/03/09 17:32:55] valid (iter=52) J=0.01 err=1.33 111 | [2016/03/09 17:35:49] monitor (iter=52) J=0.01 err=1.00 112 | [2016/03/09 17:35:54] valid (iter=53) J=0.01 err=1.31 113 | [2016/03/09 17:38:49] monitor (iter=53) J=0.00 err=0.00 114 | [2016/03/09 17:38:54] valid (iter=54) J=0.01 err=1.34 115 | [2016/03/09 17:41:48] monitor (iter=54) J=0.00 err=0.00 116 | [2016/03/09 17:41:53] valid (iter=55) J=0.01 err=1.31 117 | [2016/03/09 17:44:47] monitor (iter=55) J=0.00 err=1.00 118 | [2016/03/09 17:44:52] valid (iter=56) J=0.01 err=1.32 119 | [2016/03/09 17:47:45] monitor (iter=56) J=0.00 err=0.00 120 | [2016/03/09 17:47:51] valid (iter=57) J=0.01 err=1.32 121 | [2016/03/09 17:50:47] monitor (iter=57) J=0.00 err=0.00 122 | [2016/03/09 17:50:52] valid (iter=58) J=0.01 err=1.32 123 | [2016/03/09 17:53:49] monitor (iter=58) J=0.00 err=0.00 124 | [2016/03/09 17:53:54] valid (iter=59) J=0.01 err=1.34 125 | [2016/03/09 17:56:51] monitor (iter=59) J=0.00 err=0.00 126 | [2016/03/09 17:56:56] valid (iter=60) J=0.01 err=1.33 127 | [2016/03/09 17:59:52] monitor (iter=60) J=0.01 err=1.00 128 | [2016/03/09 17:59:57] test (iter=61) J=0.01 err=1.22 129 | [2016/03/09 18:00:03] valid (iter=61) J=0.01 err=1.31 130 | [2016/03/09 18:02:59] monitor (iter=61) J=0.00 err=0.00 131 | [2016/03/09 18:03:04] valid (iter=62) J=0.01 err=1.32 132 | [2016/03/09 18:06:00] monitor (iter=62) J=0.00 err=0.00 133 | [2016/03/09 18:06:05] valid (iter=63) J=0.01 err=1.35 134 | [2016/03/09 18:09:02] monitor (iter=63) J=0.00 err=0.00 135 | [2016/03/09 18:09:07] valid (iter=64) J=0.01 err=1.33 136 | [2016/03/09 18:12:02] monitor (iter=64) J=0.00 err=0.00 137 | [2016/03/09 18:12:07] valid (iter=65) J=0.01 err=1.33 138 | [2016/03/09 18:15:02] monitor (iter=65) J=0.00 err=0.00 139 | [2016/03/09 18:15:07] valid (iter=66) J=0.01 err=1.34 140 | [2016/03/09 18:18:03] monitor (iter=66) J=0.00 err=1.00 141 | [2016/03/09 18:18:08] valid (iter=67) J=0.01 err=1.35 142 | [2016/03/09 18:21:03] monitor (iter=67) J=0.00 err=0.00 143 | [2016/03/09 18:21:08] valid (iter=68) J=0.01 err=1.33 144 | [2016/03/09 18:21:13] test 
(iter=0) J=0.01 err=1.17 145 | [2016/03/09 18:21:19] test (iter=0) J=0.01 err=1.17 146 | -------------------------------------------------------------------------------- /highwaylrdiagdropoutbn_layer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # by Antonio Valerio Miceli Barone 5 | 6 | import numpy as np 7 | from deepy.layers import NeuralLayer 8 | from deepy.utils import build_activation, global_theano_rand, OrthogonalInitializer, FLOATX 9 | 10 | import theano.tensor as T 11 | 12 | class HighwayLayerLRDiagDropoutBatchNorm(NeuralLayer): 13 | """ 14 | Low-rank plus diagonal Highway network layer with internal dropout and batch normalization. 15 | 16 | Extends http://arxiv.org/abs/1505.00387. 17 | """ 18 | 19 | def __init__(self, activation='relu', init=None, gate_bias=-5, projection_dim=10, d_p_0 = 0.3, d_p_1 = 0.3, epsilon=1e-4, tau=0.1, diag_init_val=1e-2, quasi_ortho_init=False): 20 | super(HighwayLayerLRDiagDropoutBatchNorm, self).__init__("highwayLRDiagDropoutBatchNorm") 21 | self.activation = activation 22 | self.init = init 23 | self.gate_bias = gate_bias 24 | self.projection_dim = projection_dim 25 | self.d_p_0 = d_p_0 26 | self.d_p_1 = d_p_1 27 | self.epsilon = epsilon 28 | self.tau = tau 29 | self.diag_init_val = diag_init_val 30 | self.quasi_ortho_init = quasi_ortho_init 31 | 32 | def setup(self): 33 | self.output_dim = self.input_dim 34 | self._act = build_activation(self.activation) 35 | self.W_hl = self.create_weight(self.input_dim, self.projection_dim, "hl", initializer=self.init) 36 | self.W_tl = self.create_weight(self.input_dim, self.projection_dim, "tl", initializer=self.init) 37 | self.W_hr = self.create_weight(self.projection_dim, self.input_dim, "hr", initializer=self.init) 38 | self.W_tr = self.create_weight(self.projection_dim, self.input_dim, "tr", initializer=self.init) 39 | self.B_h = self.create_bias(self.input_dim, "h") 40 | self.B_t = self.create_bias(self.input_dim, "t", value=self.gate_bias) 41 | self.D_h = self.create_vector(self.input_dim, "D_h") 42 | self.D_t = self.create_vector(self.input_dim, "D_t") 43 | self.D_h.set_value(np.ones(self.input_dim, dtype=FLOATX) * self.diag_init_val) 44 | self.D_t.set_value(np.ones(self.input_dim, dtype=FLOATX) * self.diag_init_val) 45 | 46 | self.S_h = self.create_vector(self.input_dim, "S_h") 47 | self.S_t = self.create_vector(self.input_dim, "S_t") 48 | self.S_h.set_value(np.ones(self.input_dim, dtype=FLOATX)) 49 | self.S_t.set_value(np.ones(self.input_dim, dtype=FLOATX)) 50 | 51 | self.register_parameters(self.W_hl, self.B_h, self.W_tl, self.B_t, self.W_hr, self.W_tr, self.D_h, self.D_t, self.S_h, self.S_t) 52 | 53 | self.Mean_hl = self.create_vector(self.projection_dim, "Mean_hl") 54 | self.Mean_tl = self.create_vector(self.projection_dim, "Mean_tl") 55 | self.Mean_hr = self.create_vector(self.input_dim, "Mean_hr") 56 | self.Mean_tr = self.create_vector(self.input_dim, "Mean_tr") 57 | self.Std_hl = self.create_vector(self.projection_dim, "Std_hl") 58 | self.Std_tl = self.create_vector(self.projection_dim, "Std_tl") 59 | self.Std_hr = self.create_vector(self.input_dim, "Std_hr") 60 | self.Std_tr = self.create_vector(self.input_dim, "Std_tr") 61 | self.Std_hl.set_value(np.ones(self.projection_dim, dtype=FLOATX)) 62 | self.Std_tl.set_value(np.ones(self.projection_dim, dtype=FLOATX)) 63 | self.Std_hr.set_value(np.ones(self.input_dim, dtype=FLOATX)) 64 | self.Std_tr.set_value(np.ones(self.input_dim, dtype=FLOATX)) 65 | 
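# Running batch-norm statistics for the four matrix products: the means start at zero and the standard deviations at one; output() refreshes them as exponential moving averages (rate tau), and test_output() uses them in place of the batch statistics.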
66 | self.register_free_parameters(self.Mean_hl, self.Mean_tl, self.Mean_hr, self.Mean_tr, self.Std_hl, self.Std_tl, self.Std_hr, self.Std_tr) 67 | 68 | if self.quasi_ortho_init: 69 | self.setup_quasi_ortho_init() 70 | 71 | def setup_quasi_ortho_init(self): 72 | ortho_init = OrthogonalInitializer() 73 | ortho_init.rand = self.init.rand 74 | 75 | # Initialize the low-rank factors from the truncated SVD of random orthogonal matrices 76 | w_h = ortho_init.sample((self.input_dim, self.input_dim)) 77 | w_t = ortho_init.sample((self.input_dim, self.input_dim)) 78 | w_h_u, w_h_sv, w_h_v = np.linalg.svd(w_h) 79 | w_t_u, w_t_sv, w_t_v = np.linalg.svd(w_t) 80 | h_sqsv_truncated = np.diag(np.sqrt(w_h_sv[:self.projection_dim])) 81 | t_sqsv_truncated = np.diag(np.sqrt(w_t_sv[:self.projection_dim])) 82 | w_h_u_truncated = w_h_u[:, :self.projection_dim] 83 | w_t_u_truncated = w_t_u[:, :self.projection_dim] 84 | w_h_v_truncated = w_h_v[:self.projection_dim, :] 85 | w_t_v_truncated = w_t_v[:self.projection_dim, :] 86 | w_hl = np.dot(w_h_u_truncated, h_sqsv_truncated) 87 | w_tl = np.dot(w_t_u_truncated, t_sqsv_truncated) 88 | w_hr = np.dot(h_sqsv_truncated, w_h_v_truncated) 89 | w_tr = np.dot(t_sqsv_truncated, w_t_v_truncated) 90 | 91 | # Initialize the diagonal terms to absorb the truncation error, estimated along a random test vector 92 | test_vec = self.init.rand.normal(0.0, 1.0, (self.input_dim,)) 93 | err_h = np.dot(test_vec, w_h) - np.dot(test_vec, w_hl).dot(w_hr) 94 | err_t = np.dot(test_vec, w_t) - np.dot(test_vec, w_tl).dot(w_tr) 95 | d_h = err_h / (test_vec + self.epsilon) 96 | d_t = err_t / (test_vec + self.epsilon) 97 | 98 | # Correct for dropout 99 | w_hl /= (1.0 - self.d_p_0) 100 | w_tl /= (1.0 - self.d_p_0) 101 | d_h /= (1.0 - self.d_p_0) 102 | d_t /= (1.0 - self.d_p_0) 103 | w_hr /= (1.0 - self.d_p_1) 104 | w_tr /= (1.0 - self.d_p_1) 105 | 106 | # Set values 107 | self.W_hl.set_value(w_hl.astype(FLOATX)) 108 | self.W_tl.set_value(w_tl.astype(FLOATX)) 109 | self.W_hr.set_value(w_hr.astype(FLOATX)) 110 | self.W_tr.set_value(w_tr.astype(FLOATX)) 111 | self.D_h.set_value(d_h.astype(FLOATX)) 112 | self.D_t.set_value(d_t.astype(FLOATX)) 113 | 114 | def output(self, x): 115 | d_0 = global_theano_rand.binomial(x.shape, p=1-self.d_p_0, dtype=FLOATX) # dropout mask on the layer input 116 | d_1 = global_theano_rand.binomial((x.shape[0], self.projection_dim), p=1-self.d_p_1, dtype=FLOATX) # dropout mask on the projected representation 117 | 118 | tl_raw = T.dot(x * d_0, self.W_tl) 119 | hl_raw = T.dot(x * d_0, self.W_hl) 120 | tl_mean = T.mean(tl_raw, axis=0) 121 | hl_mean = T.mean(hl_raw, axis=0) 122 | tl_std = T.std(tl_raw, axis=0) 123 | hl_std = T.std(hl_raw, axis=0) 124 | tl = (tl_raw - tl_mean) / (tl_std + self.epsilon) 125 | hl = (hl_raw - hl_mean) / (hl_std + self.epsilon) 126 | new_Mean_tl = self.tau * tl_mean + (1.0 - self.tau) * self.Mean_tl 127 | new_Mean_hl = self.tau * hl_mean + (1.0 - self.tau) * self.Mean_hl 128 | new_Std_tl = self.tau * tl_std + (1.0 - self.tau) * self.Std_tl 129 | new_Std_hl = self.tau * hl_std + (1.0 - self.tau) * self.Std_hl 130 | 131 | tr_raw = (tl * d_1).dot(self.W_tr) + (x * d_0 * self.D_t) # low-rank gate path plus its diagonal passthrough term 132 | hr_raw = (hl * d_1).dot(self.W_hr) + (x * d_0 * self.D_h) # low-rank candidate path plus its diagonal passthrough term 133 | tr_mean = T.mean(tr_raw, axis=0) 134 | hr_mean = T.mean(hr_raw, axis=0) 135 | tr_std = T.std(tr_raw, axis=0) 136 | hr_std = T.std(hr_raw, axis=0) 137 | tr = (tr_raw - tr_mean) / (tr_std + self.epsilon) 138 | hr = (hr_raw - hr_mean) / (hr_std + self.epsilon) 139 | new_Mean_tr = self.tau * tr_mean + (1.0 - self.tau) * self.Mean_tr 140 | new_Mean_hr = self.tau * hr_mean + (1.0 - self.tau) * self.Mean_hr 141 | new_Std_tr = self.tau * tr_std + (1.0 - self.tau) * self.Std_tr 142 | new_Std_hr = self.tau *
hr_std + (1.0 - self.tau) * self.Std_hr 143 | 144 | t = T.nnet.sigmoid(tr * self.S_t + self.B_t) 145 | h = self._act(hr * self.S_h + self.B_h) 146 | rv = h * t + x * (1 - t) 147 | 148 | self.register_training_updates((self.Mean_tl, new_Mean_tl), 149 | (self.Mean_hl, new_Mean_hl), 150 | (self.Mean_tr, new_Mean_tr), 151 | (self.Mean_hr, new_Mean_hr), 152 | (self.Std_tl, new_Std_tl), 153 | (self.Std_hl, new_Std_hl), 154 | (self.Std_tr, new_Std_tr), 155 | (self.Std_hr, new_Std_hr)) 156 | 157 | return rv 158 | 159 | def test_output(self, x): 160 | d_0 = 1.0 - self.d_p_0 # at test time dropout is replaced by its expected scaling 161 | d_1 = 1.0 - self.d_p_1 162 | 163 | tl_raw = T.dot(x * d_0, self.W_tl) 164 | hl_raw = T.dot(x * d_0, self.W_hl) 165 | tl = (tl_raw - self.Mean_tl) / (self.Std_tl + self.epsilon) 166 | hl = (hl_raw - self.Mean_hl) / (self.Std_hl + self.epsilon) 167 | 168 | tr_raw = (tl * d_1).dot(self.W_tr) + (x * d_0 * self.D_t) # diagonal terms as in output(): D_t on the gate path, D_h on the candidate path 169 | hr_raw = (hl * d_1).dot(self.W_hr) + (x * d_0 * self.D_h) 170 | tr = (tr_raw - self.Mean_tr) / (self.Std_tr + self.epsilon) 171 | hr = (hr_raw - self.Mean_hr) / (self.Std_hr + self.epsilon) 172 | 173 | t = T.nnet.sigmoid(tr * self.S_t + self.B_t) 174 | h = self._act(hr * self.S_h + self.B_h) 175 | rv = h * t + x * (1 - t) 176 | 177 | return rv 178 | 179 | --------------------------------------------------------------------------------