├── Readme.md
├── custom.py
└── mnist.py

/Readme.md:
--------------------------------------------------------------------------------
# AdversarialGradient

## Motivations

This code reproduces some of the experimental results reported in
[Improving back-propagation by adding an adversarial gradient](http://arxiv.org/abs/1510.04189).
The paper introduces a very simple variant of
[adversarial training](http://arxiv.org/abs/1412.6572)
which yields very impressive results on MNIST:
**about a 0.80% error rate with a 2 x 400 ReLU MLP**.

## Requirements

* Python 2.7, Numpy, Scipy
* [Theano](http://deeplearning.net/software/theano/install.html)
* [Lasagne](http://lasagne.readthedocs.org/en/latest/user/installation.html)

## How to run it

First, download the MNIST dataset:

    wget http://deeplearning.net/data/mnist/mnist.pkl.gz

Then run the training script (which contains all the relevant hyperparameters):

    python mnist.py

Training takes only about **5 minutes** on a Titan X GPU.
The best validation error rate should be about **0.83%**,
and the associated test error rate about **0.93%**.
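## The adversarial objective in a nutshell

The heart of the method is the training loss built in `mnist.py`. A clean forward
pass is used only to obtain the gradient of the loss with respect to the inputs;
the inputs are then perturbed by epsilon times the sign of that gradient
(epsilon = 0.08 in this code), the perturbation is treated as a constant, and the
network is trained on the perturbed inputs. The sketch below restates those lines
from `mnist.py` (`l`, `X`, `target` and `loss` are the network, input variable,
target variable and cross-entropy defined there; intermediate names are adapted
slightly for readability):

    # forward pass on the clean inputs, only to get d(loss)/d(inputs)
    clean_output = lasagne.layers.get_output(l, inputs=X, deterministic=True)
    clean_loss = loss(target, clean_output)

    # fast-gradient-sign perturbation, wrapped in disconnected_grad so that
    # back-propagation does not differentiate through the perturbation itself
    adversarial_X = theano.gradient.disconnected_grad(
        X + 0.08 * T.sgn(theano.gradient.grad(cost=clean_loss, wrt=X)))

    # the loss that is actually minimized is computed on the perturbed inputs
    train_output = lasagne.layers.get_output(l, inputs=adversarial_X, deterministic=False)
    train_loss = loss(target, train_output)

## Reusing the trained parameters

`mnist.py` saves the best parameters (by validation error) to `mnist_parameters.npz`
with `np.savez`. A minimal sketch for loading them back into an identically built
network `l` (this snippet is not part of the repository; it assumes the usual
`arr_0`, `arr_1`, ... key names produced by `np.savez` with positional arguments):

    import numpy as np
    import lasagne

    # rebuild the same architecture as in mnist.py, then:
    with np.load('mnist_parameters.npz') as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(l, param_values)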
--------------------------------------------------------------------------------
/custom.py:
--------------------------------------------------------------------------------
import time

from collections import OrderedDict

import numpy as np

# specifying the GPU to use
# import theano.sandbox.cuda
# theano.sandbox.cuda.use('gpu1')
import theano
import theano.tensor as T

import lasagne

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# Given a dataset and a model, this function trains the model on the dataset for several epochs
# (there is no default trainer function in Lasagne yet)
def train(train_fn, val_fn,
          model,
          batch_size,
          LR_start, LR_decay,
          num_epochs,
          X_train, y_train,
          X_val, y_val,
          X_test, y_test,
          save_path=None,
          shuffle_parts=1):

    # A function which shuffles a dataset, chunk by chunk
    def shuffle(X, y):

        chunk_size = len(X) // shuffle_parts
        shuffled_range = range(chunk_size)

        X_buffer = np.copy(X[0:chunk_size])
        y_buffer = np.copy(y[0:chunk_size])

        for k in range(shuffle_parts):

            np.random.shuffle(shuffled_range)

            for i in range(chunk_size):

                X_buffer[i] = X[k * chunk_size + shuffled_range[i]]
                y_buffer[i] = y[k * chunk_size + shuffled_range[i]]

            X[k * chunk_size:(k + 1) * chunk_size] = X_buffer
            y[k * chunk_size:(k + 1) * chunk_size] = y_buffer

        return X, y

    # This function trains the model for one full epoch (on the whole dataset)
    def train_epoch(X, y, LR):

        loss = 0
        batches = len(X) // batch_size

        for i in range(batches):
            loss += train_fn(X[i * batch_size:(i + 1) * batch_size], y[i * batch_size:(i + 1) * batch_size], LR)

        loss /= batches

        return loss

    # This function evaluates the model on a whole dataset
    def val_epoch(X, y):

        err = 0
        loss = 0
        batches = len(X) // batch_size

        for i in range(batches):
            new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size], y[i * batch_size:(i + 1) * batch_size])
            err += new_err
            loss += new_loss

        err = err / batches * 100
        loss /= batches

        return err, loss

    # shuffle the train set
    X_train, y_train = shuffle(X_train, y_train)
    best_val_err = 100
    best_epoch = 1
    LR = LR_start

    # We iterate over epochs:
    for epoch in range(num_epochs):

        start_time = time.time()

        train_loss = train_epoch(X_train, y_train, LR)
        X_train, y_train = shuffle(X_train, y_train)

        val_err, val_loss = val_epoch(X_val, y_val)

        # test whether the validation error went down
        if val_err <= best_val_err:

            best_val_err = val_err
            best_epoch = epoch + 1

            test_err, test_loss = val_epoch(X_test, y_test)

            if save_path is not None:
                np.savez(save_path, *lasagne.layers.get_all_param_values(model))

        epoch_duration = time.time() - start_time

        # Then we print the results for this epoch:
        print("Epoch " + str(epoch + 1) + " of " + str(num_epochs) + " took " + str(epoch_duration) + "s")
        print("  LR: " + str(LR))
        print("  training loss: " + str(train_loss))
        print("  validation loss: " + str(val_loss))
        print("  validation error rate: " + str(val_err) + "%")
        print("  best epoch: " + str(best_epoch))
        print("  best validation error rate: " + str(best_val_err) + "%")
        print("  test loss: " + str(test_loss))
        print("  test error rate: " + str(test_err) + "%")

        # decay the LR
        LR *= LR_decay
--------------------------------------------------------------------------------
/mnist.py:
--------------------------------------------------------------------------------
import sys
import os
import time

import numpy as np
np.random.seed(1234)  # for reproducibility

import theano
import theano.tensor as T

import lasagne

import cPickle
import gzip

import custom
from collections import OrderedDict

if __name__ == "__main__":

    batch_size = 32
    print("batch_size = " + str(batch_size))
    num_units = 400
    print("num_units = " + str(num_units))
    num_epochs = 150
    print("num_epochs = " + str(num_epochs))
    activation = T.nnet.relu
    print("activation = T.nnet.relu")

    # Decaying LR
    # (here LR_fin == LR_start, so LR_decay == 1 and the learning rate stays constant)
    LR_start = 0.0001
    print("LR_start = " + str(LR_start))
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))

    save_path = "mnist_parameters.npz"
    print("save_path = " + str(save_path))

    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print('Loading MNIST dataset...')

    # Loading the MNIST dataset (train, validation and test sets)
    # You can get mnist.pkl.gz at http://deeplearning.net/data/mnist/mnist.pkl.gz
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # inputs are flattened 784-dimensional vectors
    train_set_X = train_set[0]
    valid_set_X = valid_set[0]
    test_set_X = test_set[0]

    # flatten targets
    train_set_t = np.hstack(train_set[1])
    valid_set_t = np.hstack(valid_set[1])
    test_set_t = np.hstack(test_set[1])

    # one-hot the targets
    train_set_t = np.float32(np.eye(10)[train_set_t])
    valid_set_t = np.float32(np.eye(10)[valid_set_t])
    test_set_t = np.float32(np.eye(10)[test_set_t])

    print('Specifying the graph...')

    # Prepare Theano variables for inputs and targets
    X = T.matrix('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    # input layer
    l = lasagne.layers.InputLayer(shape=(None, 784), input_var=X)

    # first hidden layer
    l = lasagne.layers.DenseLayer(l, nonlinearity=lasagne.nonlinearities.identity, num_units=num_units)
    l = lasagne.layers.NonlinearityLayer(l, nonlinearity=activation)

    # second hidden layer
    l = lasagne.layers.DenseLayer(l, nonlinearity=lasagne.nonlinearities.identity, num_units=num_units)
    l = lasagne.layers.NonlinearityLayer(l, nonlinearity=activation)

    # output layer
    l = lasagne.layers.DenseLayer(l, nonlinearity=lasagne.nonlinearities.identity, num_units=10)
    l = lasagne.layers.NonlinearityLayer(l, nonlinearity=lasagne.nonlinearities.sigmoid)

    def loss(t, y):
        return T.mean(T.nnet.binary_crossentropy(y, t))
        # return -T.mean(t*T.log(y)+(1-t)*T.log(1-y))

    # THIS IS THE INTERESTING PART
    # adversarial objective
    # as in http://arxiv.org/pdf/1510.04189.pdf
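    # The two forward passes below implement the adversarial gradient:
    # 1) a deterministic pass on the clean inputs X is used only to obtain the
    #    gradient of the loss with respect to the inputs;
    # 2) the inputs are perturbed by 0.08 * sign(gradient) (the fast gradient sign
    #    method of http://arxiv.org/abs/1412.6572), and disconnected_grad treats
    #    the perturbed inputs as a constant, so back-propagation does not
    #    differentiate through the perturbation itself;
    # 3) the loss that is actually minimized is the one computed on the perturbed inputs.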
    train_output = lasagne.layers.get_output(l, inputs=X, deterministic=True)
    train_loss = loss(target, train_output)
    adversarial_X = theano.gradient.disconnected_grad(X + 0.08 * T.sgn(theano.gradient.grad(cost=train_loss, wrt=X)))
    train_output = lasagne.layers.get_output(l, inputs=adversarial_X, deterministic=False)
    train_loss = loss(target, train_output)

    # Parameter updates
    params = lasagne.layers.get_all_params(l, trainable=True)
    updates = lasagne.updates.adam(loss_or_grads=train_loss, params=params, learning_rate=LR)

    # test loss and error rate
    test_output = lasagne.layers.get_output(l, deterministic=True)
    test_loss = loss(target, test_output)
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX)

    print('Compiling the graph...')

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([X, target, LR], train_loss, updates=updates)

    # Compile a second function computing the validation loss and error rate:
    val_fn = theano.function([X, target], [test_loss, test_err])

    print('Training...')

    custom.train(
        train_fn, val_fn,
        l,
        batch_size,
        LR_start, LR_decay,
        num_epochs,
        train_set_X, train_set_t,
        valid_set_X, valid_set_t,
        test_set_X, test_set_t,
        save_path,
        shuffle_parts)
--------------------------------------------------------------------------------