├── Readme.md
├── custom.py
└── mnist.py

/Readme.md:
--------------------------------------------------------------------------------
# AdversarialGradient

## Motivations

This code reproduces some of the experimental results reported in
[Improving back-propagation by adding an adversarial gradient](http://arxiv.org/abs/1510.04189).
The paper introduces a very simple variant of
[adversarial training](http://arxiv.org/abs/1412.6572)
which yields very impressive results on MNIST:
**about a 0.80% error rate with a 2 x 400 ReLU MLP**.

## Requirements

* Python 2.7, Numpy, Scipy
* [Theano](http://deeplearning.net/software/theano/install.html)
* [Lasagne](http://lasagne.readthedocs.org/en/latest/user/installation.html)

## How to run it

First, download the MNIST dataset:

    wget http://deeplearning.net/data/mnist/mnist.pkl.gz

Then run the training script (which contains all the relevant hyperparameters):

    python mnist.py

Training takes only about **5 minutes** on a Titan X GPU.
The best validation error rate should be about **0.83%**,
and the associated test error rate about **0.93%**.
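## The adversarial objective in a nutshell

The heart of the method is the training loss built in `mnist.py`. A clean forward
pass is used only to obtain the gradient of the loss with respect to the inputs;
the inputs are then perturbed by epsilon times the sign of that gradient
(epsilon = 0.08 in this code), the perturbation is treated as a constant, and the
network is trained on the perturbed inputs. The sketch below restates those lines
from `mnist.py` (`l`, `X`, `target` and `loss` are the network, input variable,
target variable and cross-entropy defined there; intermediate names are adapted
slightly for readability):

    # forward pass on the clean inputs, only to get d(loss)/d(inputs)
    clean_output = lasagne.layers.get_output(l, inputs=X, deterministic=True)
    clean_loss = loss(target, clean_output)

    # fast-gradient-sign perturbation, wrapped in disconnected_grad so that
    # back-propagation does not differentiate through the perturbation itself
    adversarial_X = theano.gradient.disconnected_grad(
        X + 0.08 * T.sgn(theano.gradient.grad(cost=clean_loss, wrt=X)))

    # the loss that is actually minimized is computed on the perturbed inputs
    train_output = lasagne.layers.get_output(l, inputs=adversarial_X, deterministic=False)
    train_loss = loss(target, train_output)

## Reusing the trained parameters

`mnist.py` saves the best parameters (by validation error) to `mnist_parameters.npz`
with `np.savez`. A minimal sketch for loading them back into an identically built
network `l` (this snippet is not part of the repository; it assumes the usual
`arr_0`, `arr_1`, ... key names produced by `np.savez` with positional arguments):

    import numpy as np
    import lasagne

    # rebuild the same architecture as in mnist.py, then:
    with np.load('mnist_parameters.npz') as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    lasagne.layers.set_all_param_values(l, param_values)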
--------------------------------------------------------------------------------
/custom.py:
--------------------------------------------------------------------------------
import time

from collections import OrderedDict

import numpy as np

# specifying the GPU to use
# import theano.sandbox.cuda
# theano.sandbox.cuda.use('gpu1')
import theano
import theano.tensor as T

import lasagne

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

# Given a dataset and a model, this function trains the model on the dataset for several epochs
# (there is no default trainer function in Lasagne yet)
def train(train_fn, val_fn,
          model,
          batch_size,
          LR_start, LR_decay,
          num_epochs,
          X_train, y_train,
          X_val, y_val,
          X_test, y_test,
          save_path=None,
          shuffle_parts=1):

    # A function which shuffles a dataset, chunk by chunk
    def shuffle(X, y):

        chunk_size = len(X) // shuffle_parts
        shuffled_range = range(chunk_size)

        X_buffer = np.copy(X[0:chunk_size])
        y_buffer = np.copy(y[0:chunk_size])

        for k in range(shuffle_parts):

            np.random.shuffle(shuffled_range)

            for i in range(chunk_size):

                X_buffer[i] = X[k * chunk_size + shuffled_range[i]]
                y_buffer[i] = y[k * chunk_size + shuffled_range[i]]

            X[k * chunk_size:(k + 1) * chunk_size] = X_buffer
            y[k * chunk_size:(k + 1) * chunk_size] = y_buffer

        return X, y

    # This function trains the model for one full epoch (on the whole dataset)
    def train_epoch(X, y, LR):

        loss = 0
        batches = len(X) // batch_size

        for i in range(batches):
            loss += train_fn(X[i * batch_size:(i + 1) * batch_size], y[i * batch_size:(i + 1) * batch_size], LR)

        loss /= batches

        return loss

    # This function evaluates the model on a whole dataset
    def val_epoch(X, y):

        err = 0
        loss = 0
        batches = len(X) // batch_size

        for i in range(batches):
            new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size], y[i * batch_size:(i + 1) * batch_size])
            err += new_err
            loss += new_loss

        err = err / batches * 100
        loss /= batches

        return err, loss

    # shuffle the train set
    X_train, y_train = shuffle(X_train, y_train)
    best_val_err = 100
    best_epoch = 1
    LR = LR_start

    # We iterate over epochs:
    for epoch in range(num_epochs):

        start_time = time.time()

        train_loss = train_epoch(X_train, y_train, LR)
        X_train, y_train = shuffle(X_train, y_train)

        val_err, val_loss = val_epoch(X_val, y_val)

        # test whether the validation error went down
        if val_err <= best_val_err:

            best_val_err = val_err
            best_epoch = epoch + 1

            test_err, test_loss = val_epoch(X_test, y_test)

            if save_path is not None:
                np.savez(save_path, *lasagne.layers.get_all_param_values(model))

        epoch_duration = time.time() - start_time

        # Then we print the results for this epoch:
        print("Epoch " + str(epoch + 1) + " of " + str(num_epochs) + " took " + str(epoch_duration) + "s")
        print("  LR: " + str(LR))
        print("  training loss: " + str(train_loss))
        print("  validation loss: " + str(val_loss))
        print("  validation error rate: " + str(val_err) + "%")
        print("  best epoch: " + str(best_epoch))
        print("  best validation error rate: " + str(best_val_err) + "%")
        print("  test loss: " + str(test_loss))
        print("  test error rate: " + str(test_err) + "%")

        # decay the LR
        LR *= LR_decay
--------------------------------------------------------------------------------
/mnist.py:
--------------------------------------------------------------------------------
import sys
import os
import time

import numpy as np
np.random.seed(1234)  # for reproducibility

import theano
import theano.tensor as T

import lasagne

import cPickle
import gzip

import custom
from collections import OrderedDict

if __name__ == "__main__":

    batch_size = 32
    print("batch_size = " + str(batch_size))
    num_units = 400
    print("num_units = " + str(num_units))
    num_epochs = 150
    print("num_epochs = " + str(num_epochs))
    activation = T.nnet.relu
    print("activation = T.nnet.relu")

    # Decaying LR
    # (here LR_fin == LR_start, so LR_decay == 1 and the learning rate stays constant)
    LR_start = 0.0001
    print("LR_start = " + str(LR_start))
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))

    save_path = "mnist_parameters.npz"
    print("save_path = " + str(save_path))

    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print('Loading MNIST dataset...')

    # Loading the MNIST dataset (train, validation and test sets)
    # You can get mnist.pkl.gz at http://deeplearning.net/data/mnist/mnist.pkl.gz
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    # inputs are flattened 784-dimensional vectors
    train_set_X = train_set[0]
    valid_set_X = valid_set[0]
    test_set_X = test_set[0]

    # flatten targets
    train_set_t = np.hstack(train_set[1])
    valid_set_t = np.hstack(valid_set[1])
    test_set_t = np.hstack(test_set[1])

    # one-hot the targets
    train_set_t = np.float32(np.eye(10)[train_set_t])
    valid_set_t = np.float32(np.eye(10)[valid_set_t])
    test_set_t = np.float32(np.eye(10)[test_set_t])

    print('Specifying the graph...')

    # Prepare Theano variables for inputs and targets
    X = T.matrix('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    # input layer
    l = lasagne.layers.InputLayer(shape=(None, 784), input_var=X)

    # first hidden layer
    l = lasagne.layers.DenseLayer(l, nonlinearity=lasagne.nonlinearities.identity, num_units=num_units)
    l = lasagne.layers.NonlinearityLayer(l, nonlinearity=activation)

    # second hidden layer
    l = lasagne.layers.DenseLayer(l, nonlinearity=lasagne.nonlinearities.identity, num_units=num_units)
    l = lasagne.layers.NonlinearityLayer(l, nonlinearity=activation)

    # output layer
    l = lasagne.layers.DenseLayer(l, nonlinearity=lasagne.nonlinearities.identity, num_units=10)
    l = lasagne.layers.NonlinearityLayer(l, nonlinearity=lasagne.nonlinearities.sigmoid)

    def loss(t, y):
        return T.mean(T.nnet.binary_crossentropy(y, t))
        # return -T.mean(t*T.log(y)+(1-t)*T.log(1-y))

    # THIS IS THE INTERESTING PART
    # adversarial objective
    # as in http://arxiv.org/pdf/1510.04189.pdf
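    # The two forward passes below implement the adversarial gradient:
    # 1) a deterministic pass on the clean inputs X is used only to obtain the
    #    gradient of the loss with respect to the inputs;
    # 2) the inputs are perturbed by 0.08 * sign(gradient) (the fast gradient sign
    #    method of http://arxiv.org/abs/1412.6572), and disconnected_grad treats
    #    the perturbed inputs as a constant, so back-propagation does not
    #    differentiate through the perturbation itself;
    # 3) the loss that is actually minimized is the one computed on the perturbed inputs.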
    train_output = lasagne.layers.get_output(l, inputs=X, deterministic=True)
    train_loss = loss(target, train_output)
    adversarial_X = theano.gradient.disconnected_grad(X + 0.08 * T.sgn(theano.gradient.grad(cost=train_loss, wrt=X)))
    train_output = lasagne.layers.get_output(l, inputs=adversarial_X, deterministic=False)
    train_loss = loss(target, train_output)

    # Parameter updates
    params = lasagne.layers.get_all_params(l, trainable=True)
    updates = lasagne.updates.adam(loss_or_grads=train_loss, params=params, learning_rate=LR)

    # test loss and error rate
    test_output = lasagne.layers.get_output(l, deterministic=True)
    test_loss = loss(target, test_output)
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX)

    print('Compiling the graph...')

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([X, target, LR], train_loss, updates=updates)

    # Compile a second function computing the validation loss and error rate:
    val_fn = theano.function([X, target], [test_loss, test_err])

    print('Training...')

    custom.train(
        train_fn, val_fn,
        l,
        batch_size,
        LR_start, LR_decay,
        num_epochs,
        train_set_X, train_set_t,
        valid_set_X, valid_set_t,
        test_set_X, test_set_t,
        save_path,
        shuffle_parts)
--------------------------------------------------------------------------------