├── Readme.md
├── experiments
│   └── mnist
│       ├── cnn_mnist_init.m
│       ├── cnn_mnist_tt.m
│       └── mnist.py
└── src
    ├── matlab
    │   ├── num_params.m
    │   ├── vl_nntt_backward.m
    │   ├── vl_nntt_forward.m
    │   └── vl_test_ttlayers.m
    └── python
        └── ttlayer.py

/Readme.md:
--------------------------------------------------------------------------------
1 | # TensorNet
2 | 
3 | 
4 | This is a MATLAB and Theano+Lasagne implementation of the _Tensor Train layer_ (_TT-layer_) of a neural network. For a [TensorFlow implementation](https://github.com/timgaripov/TensorNet-TF) see a separate repository.
5 | 
6 | In short, the TT-layer acts as a fully-connected layer but is much more compact, which makes it possible to use a large number of hidden units without slowing down learning and inference.
7 | For additional information, see the following paper:
8 | 
9 | Tensorizing Neural Networks
10 | Alexander Novikov, Dmitry Podoprikhin, Anton Osokin, Dmitry Vetrov; In _Advances in Neural Information Processing Systems 28_ (NIPS-2015) [[arXiv](http://arxiv.org/abs/1509.06569)].
11 | 
12 | Please cite this paper if you use the code in a scientific publication.
13 | In BibTeX format:
14 | ```latex
15 | @incollection{novikov15tensornet,
16 |   author = {Novikov, Alexander and Podoprikhin, Dmitry and Osokin, Anton and Vetrov, Dmitry},
17 |   title = {Tensorizing Neural Networks},
18 |   booktitle = {Advances in Neural Information Processing Systems 28 (NIPS)},
19 |   year = {2015},
20 | }
21 | ```
22 | 
23 | # Installation
24 | 
25 | ### MATLAB version
26 | 
27 | Install the [TT-Toolbox](https://github.com/oseledets/TT-Toolbox) (just download it and run `setup.m` to add everything important to the MATLAB path).
28 | 
29 | Install the [MatConvNet framework](http://www.vlfeat.org/matconvnet/) (preferably with GPU support). TensorNet works with MatConvNet 1.0-beta11 (April 2015) and higher (the latest tested version is 1.0-beta14).
30 | Add the `matconvnet_path/examples` folder to the MATLAB path to be able to use the `cnn_train` function.
31 | 
32 | Copy this repository and add the `src/matlab` folder to the MATLAB path.
33 | 
34 | Now you can test TensorNet using the command
35 | ``` matlab
36 | vl_test_ttlayers
37 | ```
38 | 
39 | To test GPU support (if you have compiled MatConvNet in GPU mode) use:
40 | ``` matlab
41 | vl_test_ttlayers(1)
42 | ```
43 | 
44 | ### Theano+Lasagne version
45 | Install fresh versions of [Theano](http://deeplearning.net/software/theano/) and [Lasagne](https://lasagne.readthedocs.org/en/latest/).
46 | 
47 | Copy this repository and add the `src/python` folder to the Python path.
48 | 
49 | # Pretrained models
50 | ### MNIST shapes
51 | In this experiment we compared how the shapes and the ranks of the TT-layer influence its performance on the MNIST dataset (see figure 1 and section 6.1 of the original paper for details). Download the [models in the MatConvNet format](https://www.dropbox.com/s/zk3fqnj2pyxek5c/mnist_shapes.mat?dl=1) (.mat file, 2.9 MB) and the [preprocessed MNIST dataset](https://www.dropbox.com/s/annpg39hbmdxrig/imdb.mat?dl=1) (.mat file, 132 MB).
52 | 
53 | The file contains a cell array of models with metadata; for each model, the network state after the first and the last training epoch is included. Example of usage (computing the validation accuracy):
54 | ``` matlab
55 | imdb = load('imdb.mat');
56 | load('mnist_shapes.mat');
57 | % Choose (for example) the 5-th model, whose shape equals 4 x 8 x 8 x 4.
58 | net = models{5}.lastEpoch.net;
59 | % Remove the softmax layer (unnecessary during validation).
60 | net.layers(end) = [];
61 | valIdx = find(imdb.images.set == 3);
62 | res = vl_simplenn(net, imdb.images.data(:, :, :, valIdx));
63 | scores = squeeze(res(end).x);
64 | [bestScore, best] = max(scores);
65 | acc = mean(best == imdb.images.labels(valIdx));
66 | fprintf('Accuracy is %f\n', acc);
67 | ```
68 | 
69 | # Reproducing experiments
70 | Currently only one basic example on the MNIST dataset is available (more experiments from the paper are coming soon). To try it out, navigate to the `experiments/mnist` folder and type the following command at the MATLAB prompt:
71 | ``` matlab
72 | [net_tt, info_tt] = cnn_mnist_tt('expDir', 'data/mnist-tt');
73 | ```
74 | 
--------------------------------------------------------------------------------
/experiments/mnist/cnn_mnist_init.m:
--------------------------------------------------------------------------------
1 | function net = cnn_mnist_init(varargin)
2 | % Initialize a simple TensorNet for MNIST.
3 | 
4 | net.layers = {} ;
5 | inputModeSize = [4, 8, 8, 4] ;
6 | secondModeSize = [5, 5, 5, 5] ;
7 | ranks = [1, 2, 2, 2, 1] ;
8 | W = tt_rand(secondModeSize.*inputModeSize, length(secondModeSize), ranks, []) ;
9 | W = tt_matrix(W, secondModeSize, inputModeSize) ;
10 | W.core = single(W.core) ;
11 | net.layers{end+1} = struct('type', 'custom', ...
12 |                            'forward', @vl_nntt_forward, ...
13 |                            'backward', @vl_nntt_backward, ...
14 |                            'W', W, ...
15 |                            'weights', {{W.core, zeros(1,1,prod(secondModeSize),'single')}}, ...
16 |                            'learningRate', [1, 2], ...
17 |                            'weightDecay', [1, 0], ...
18 |                            'outHeight', 1, ...
19 |                            'outWidth', 1, ...
20 |                            'outChannels', prod(secondModeSize)) ;
21 | net.layers{end+1} = struct('type', 'relu') ;
22 | net.layers{end+1} = struct('type', 'conv', ...
23 |                            'weights', {{0.1*randn(1,1,prod(secondModeSize),10, 'single'), zeros(1, 10, 'single')}}, ...
24 |                            'learningRate', [1, 2], ...
25 |                            'weightDecay', [1, 0], ...
26 |                            'stride', 1, ...
27 |                            'pad', 0) ;
28 | net.layers{end+1} = struct('type', 'softmaxloss') ;
29 | 
--------------------------------------------------------------------------------
/experiments/mnist/cnn_mnist_tt.m:
--------------------------------------------------------------------------------
1 | function [net, info] = cnn_mnist_tt(varargin)
2 | % CNN_MNIST_TT Demonstrates TensorNet on MNIST.
3 | 
4 | % Fix the random seed.
5 | rng(0);
6 | 
7 | opts.expDir = fullfile('data','mnist-baseline') ;
8 | [opts, varargin] = vl_argparse(opts, varargin) ;
9 | 
10 | opts.dataDir = fullfile('data','mnist') ;
11 | opts.imdbPath = fullfile(opts.expDir, 'imdb.mat');
12 | opts.train.batchSize = 100 ;
13 | opts.train.numEpochs = 100 ;
14 | opts.train.continue = true ;
15 | opts.train.gpus = [] ;
16 | opts.train.learningRate = logspace(-2, -5, 45) ;
17 | opts.train.expDir = opts.expDir ;
18 | opts = vl_argparse(opts, varargin) ;
19 | 
20 | % --------------------------------------------------------------------
21 | % Prepare data
22 | % --------------------------------------------------------------------
23 | 
24 | if exist(opts.imdbPath, 'file')
25 |   imdb = load(opts.imdbPath) ;
26 | else
27 |   imdb = getMnistImdb(opts) ;
28 |   mkdir(opts.expDir) ;
29 |   save(opts.imdbPath, '-struct', 'imdb') ;
30 | end
31 | 
32 | net = cnn_mnist_init() ;
33 | 
34 | % --------------------------------------------------------------------
35 | % Train
36 | % --------------------------------------------------------------------
37 | 
38 | [net, info] = cnn_train(net, imdb, @getBatch, ...
39 |                         opts.train, ...
40 | 'val', find(imdb.images.set == 3)) ; 41 | 42 | % -------------------------------------------------------------------- 43 | function [im, labels] = getBatch(imdb, batch) 44 | % -------------------------------------------------------------------- 45 | im = imdb.images.data(:,:,:,batch) ; 46 | labels = imdb.images.labels(1,batch) ; 47 | 48 | % -------------------------------------------------------------------- 49 | function imdb = getMnistImdb(opts) 50 | % -------------------------------------------------------------------- 51 | % Prepare the imdb structure, returns image data with images of size 32 x 32 52 | % and the mean image subtracted. 53 | files = {'train-images-idx3-ubyte', ... 54 | 'train-labels-idx1-ubyte', ... 55 | 't10k-images-idx3-ubyte', ... 56 | 't10k-labels-idx1-ubyte'} ; 57 | 58 | if ~exist(opts.dataDir, 'dir') 59 | mkdir(opts.dataDir) ; 60 | end 61 | 62 | for i=1:4 63 | if ~exist(fullfile(opts.dataDir, files{i}), 'file') 64 | url = sprintf('http://yann.lecun.com/exdb/mnist/%s.gz',files{i}) ; 65 | fprintf('downloading %s\n', url) ; 66 | gunzip(url, opts.dataDir) ; 67 | end 68 | end 69 | 70 | f=fopen(fullfile(opts.dataDir, 'train-images-idx3-ubyte'),'r') ; 71 | x1=fread(f,inf,'uint8'); 72 | fclose(f) ; 73 | x1=permute(reshape(x1(17:end),28,28,60e3),[2 1 3]) ; 74 | 75 | f=fopen(fullfile(opts.dataDir, 't10k-images-idx3-ubyte'),'r') ; 76 | x2=fread(f,inf,'uint8'); 77 | fclose(f) ; 78 | x2=permute(reshape(x2(17:end),28,28,10e3),[2 1 3]) ; 79 | 80 | f=fopen(fullfile(opts.dataDir, 'train-labels-idx1-ubyte'),'r') ; 81 | y1=fread(f,inf,'uint8'); 82 | fclose(f) ; 83 | y1=double(y1(9:end)')+1 ; 84 | 85 | f=fopen(fullfile(opts.dataDir, 't10k-labels-idx1-ubyte'),'r') ; 86 | y2=fread(f,inf,'uint8'); 87 | fclose(f) ; 88 | y2=double(y2(9:end)')+1 ; 89 | 90 | set = [ones(1,numel(y1)) 3*ones(1,numel(y2))] ; 91 | dataSmall = single(reshape(cat(3, x1, x2),28,28,1,[])) ; 92 | % Fill the image with zeros on the border to resize it to 32 x 32. 93 | data = zeros(32, 32, 1, size(dataSmall, 4)) ; 94 | data(3:30, 3:30, :, :) = dataSmall ; 95 | dataMean = mean(data(:,:,:,set == 1), 4) ; 96 | data = bsxfun(@minus, data, dataMean) ; 97 | 98 | imdb.images.data = data ; 99 | imdb.images.data_mean = dataMean ; 100 | imdb.images.labels = cat(2, y1, y2) ; 101 | imdb.images.set = set ; 102 | imdb.meta.sets = {'train', 'val', 'test'} ; 103 | imdb.meta.classes = arrayfun(@(x)sprintf('%d',x),0:9,'uniformoutput',false) ; 104 | -------------------------------------------------------------------------------- /experiments/mnist/mnist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Usage example employing Lasagne and TT-layer on the MNIST dataset. 5 | 6 | This is a simplified version of 7 | https://github.com/Lasagne/Lasagne/blob/master/examples/mnist.py 8 | with using the TT-layer. 9 | """ 10 | 11 | from __future__ import print_function 12 | 13 | import sys 14 | import os 15 | import time 16 | 17 | import numpy as np 18 | import theano 19 | import theano.tensor as T 20 | 21 | import lasagne 22 | 23 | from ttlayer import TTLayer 24 | 25 | 26 | # ################## Download and prepare the MNIST dataset ################## 27 | # This is just some way of getting the MNIST dataset from an online location 28 | # and loading it into numpy arrays. It doesn't involve Lasagne at all. 29 | 30 | def load_dataset(): 31 | # We first define a download function, supporting both Python 2 and 3. 
32 | if sys.version_info[0] == 2: 33 | from urllib import urlretrieve 34 | else: 35 | from urllib.request import urlretrieve 36 | 37 | def download(filename, source='http://yann.lecun.com/exdb/mnist/'): 38 | print("Downloading %s" % filename) 39 | urlretrieve(source + filename, filename) 40 | 41 | # We then define functions for loading MNIST images and labels. 42 | # For convenience, they also download the requested files if needed. 43 | import gzip 44 | 45 | def load_mnist_images(filename): 46 | if not os.path.exists(filename): 47 | download(filename) 48 | # Read the inputs in Yann LeCun's binary format. 49 | with gzip.open(filename, 'rb') as f: 50 | data = np.frombuffer(f.read(), np.uint8, offset=16) 51 | # The inputs are vectors now, we reshape them to monochrome 2D images, 52 | # following the shape convention: (examples, channels, rows, columns) 53 | data = data.reshape(-1, 1, 28, 28) 54 | # The inputs come as bytes, we convert them to float32 in range [0,1]. 55 | # (Actually to range [0, 255/256], for compatibility to the version 56 | # provided at http://deeplearning.net/data/mnist/mnist.pkl.gz.) 57 | return data / np.float32(256) 58 | 59 | def load_mnist_labels(filename): 60 | if not os.path.exists(filename): 61 | download(filename) 62 | # Read the labels in Yann LeCun's binary format. 63 | with gzip.open(filename, 'rb') as f: 64 | data = np.frombuffer(f.read(), np.uint8, offset=8) 65 | # The labels are vectors of integers now, that's exactly what we want. 66 | return data 67 | 68 | # We can now download and read the training and test set images and labels. 69 | X_train = load_mnist_images('train-images-idx3-ubyte.gz') 70 | y_train = load_mnist_labels('train-labels-idx1-ubyte.gz') 71 | X_test = load_mnist_images('t10k-images-idx3-ubyte.gz') 72 | y_test = load_mnist_labels('t10k-labels-idx1-ubyte.gz') 73 | 74 | # We reserve the last 10000 training examples for validation. 75 | X_train, X_val = X_train[:-10000], X_train[-10000:] 76 | y_train, y_val = y_train[:-10000], y_train[-10000:] 77 | 78 | # We just return all the arrays in order, as expected in main(). 79 | # (It doesn't matter how we do this as long as we can read them again.) 80 | return X_train, y_train, X_val, y_val, X_test, y_test 81 | 82 | 83 | # ##################### Build the neural network model ####################### 84 | # A function that takes a Theano variable representing the input and returns 85 | # the output layer of a neural network model built in Lasagne. 86 | 87 | def build_mlp(input_var=None): 88 | # Input layer, specifying the expected input shape of the network 89 | # (unspecified batchsize, 1 channel, 28 rows and 28 columns) and 90 | # linking it to the given Theano variable `input_var`, if any: 91 | l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), 92 | input_var=input_var) 93 | 94 | # Build a TT-layer with 800 output units and all the ranks equal 3. 
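    # (Clarifying note, not in the original script.) The factorizations used
    # below are chosen so that 4 * 7 * 4 * 7 = 784 = 28 * 28 matches the number
    # of input pixels and 5 * 5 * 8 * 4 = 800 gives the number of hidden units.
    # With tt_ranks = [1, 3, 3, 3, 1] the TT weight is stored in
    # 1*4*5*3 + 3*7*5*3 + 3*4*8*3 + 3*7*4*1 = 747 core parameters (plus 800
    # biases), versus 784 * 800 = 627,200 weights for a dense layer of the
    # same size.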
95 | l_hid1 = TTLayer( 96 | l_in, tt_input_shape=[4, 7, 4, 7], tt_output_shape=[5, 5, 8, 4], 97 | tt_ranks=[1, 3, 3, 3, 1], 98 | nonlinearity=lasagne.nonlinearities.rectify) 99 | 100 | # Another 800-unit layer: 101 | l_hid2 = lasagne.layers.DenseLayer( 102 | l_hid1, num_units=800, 103 | nonlinearity=lasagne.nonlinearities.rectify) 104 | 105 | # Finally, we'll add the fully-connected output layer, of 10 softmax units: 106 | l_out = lasagne.layers.DenseLayer( 107 | l_hid2, num_units=10, 108 | nonlinearity=lasagne.nonlinearities.softmax) 109 | 110 | # Each layer is linked to its incoming layer(s), so we only need to pass 111 | # the output layer to give access to a network in Lasagne: 112 | return l_out 113 | 114 | # ############################# Batch iterator ############################### 115 | # This is just a simple helper function iterating over training data in 116 | # mini-batches of a particular size, optionally in random order. It assumes 117 | # data is available as numpy arrays. For big datasets, you could load numpy 118 | # arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your 119 | # own custom data iteration function. For small datasets, you can also copy 120 | # them to GPU at once for slightly improved performance. This would involve 121 | # several changes in the main program, though, and is not demonstrated here. 122 | 123 | def iterate_minibatches(inputs, targets, batchsize, shuffle=False): 124 | assert len(inputs) == len(targets) 125 | if shuffle: 126 | indices = np.arange(len(inputs)) 127 | np.random.shuffle(indices) 128 | for start_idx in range(0, len(inputs) - batchsize + 1, batchsize): 129 | if shuffle: 130 | excerpt = indices[start_idx:start_idx + batchsize] 131 | else: 132 | excerpt = slice(start_idx, start_idx + batchsize) 133 | yield inputs[excerpt], targets[excerpt] 134 | 135 | 136 | # ############################## Main program ################################ 137 | # Everything else will be handled in our main program now. We could pull out 138 | # more functions to better separate the code, but it wouldn't make it any 139 | # easier to read. 140 | 141 | def main(num_epochs=500): 142 | # Load the dataset 143 | print("Loading data...") 144 | X_train, y_train, X_val, y_val, X_test, y_test = load_dataset() 145 | 146 | # Prepare Theano variables for inputs and targets 147 | input_var = T.tensor4('inputs') 148 | target_var = T.ivector('targets') 149 | 150 | # Create neural network model. 151 | print("Building model and compiling functions...") 152 | network = build_mlp(input_var) 153 | 154 | # Create a loss expression for training, i.e., a scalar objective we want 155 | # to minimize (for our multi-class problem, it is the cross-entropy loss): 156 | prediction = lasagne.layers.get_output(network) 157 | loss = lasagne.objectives.categorical_crossentropy(prediction, target_var) 158 | loss = loss.mean() 159 | # We could add some weight decay as well here, see lasagne.regularization. 160 | 161 | # Create update expressions for training, i.e., how to modify the 162 | # parameters at each training step. Here, we'll use Stochastic Gradient 163 | # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more. 164 | params = lasagne.layers.get_all_params(network, trainable=True) 165 | updates = lasagne.updates.nesterov_momentum( 166 | loss, params, learning_rate=0.01, momentum=0.9) 167 | 168 | # Create a loss expression for validation/testing. 
The crucial difference 169 | # here is that we do a deterministic forward pass through the network, 170 | # disabling dropout layers. 171 | test_prediction = lasagne.layers.get_output(network, deterministic=True) 172 | test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, 173 | target_var) 174 | test_loss = test_loss.mean() 175 | # As a bonus, also create an expression for the classification accuracy: 176 | test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), 177 | dtype=theano.config.floatX) 178 | 179 | # Compile a function performing a training step on a mini-batch (by giving 180 | # the updates dictionary) and returning the corresponding training loss: 181 | train_fn = theano.function([input_var, target_var], loss, updates=updates) 182 | 183 | # Compile a second function computing the validation loss and accuracy: 184 | val_fn = theano.function([input_var, target_var], [test_loss, test_acc]) 185 | 186 | # Finally, launch the training loop. 187 | print("Starting training...") 188 | # We iterate over epochs: 189 | for epoch in range(num_epochs): 190 | # In each epoch, we do a full pass over the training data: 191 | train_err = 0 192 | train_batches = 0 193 | start_time = time.time() 194 | for batch in iterate_minibatches(X_train, y_train, 500, shuffle=True): 195 | inputs, targets = batch 196 | train_err += train_fn(inputs, targets) 197 | train_batches += 1 198 | 199 | # And a full pass over the validation data: 200 | val_err = 0 201 | val_acc = 0 202 | val_batches = 0 203 | for batch in iterate_minibatches(X_val, y_val, 500, shuffle=False): 204 | inputs, targets = batch 205 | err, acc = val_fn(inputs, targets) 206 | val_err += err 207 | val_acc += acc 208 | val_batches += 1 209 | 210 | # Then we print the results for this epoch: 211 | print("Epoch {} of {} took {:.3f}s".format( 212 | epoch + 1, num_epochs, time.time() - start_time)) 213 | print(" training loss:\t\t{:.6f}".format(train_err / train_batches)) 214 | print(" validation loss:\t\t{:.6f}".format(val_err / val_batches)) 215 | print(" validation accuracy:\t\t{:.2f} %".format( 216 | val_acc / val_batches * 100)) 217 | 218 | # After training, we compute and print the test error: 219 | test_err = 0 220 | test_acc = 0 221 | test_batches = 0 222 | for batch in iterate_minibatches(X_test, y_test, 500, shuffle=False): 223 | inputs, targets = batch 224 | err, acc = val_fn(inputs, targets) 225 | test_err += err 226 | test_acc += acc 227 | test_batches += 1 228 | print("Final results:") 229 | print(" test loss:\t\t\t{:.6f}".format(test_err / test_batches)) 230 | print(" test accuracy:\t\t{:.2f} %".format( 231 | test_acc / test_batches * 100)) 232 | 233 | # Optionally, you could now dump the network weights to a file like this: 234 | # np.savez('model.npz', *lasagne.layers.get_all_param_values(network)) 235 | # 236 | # And load them again later on like this: 237 | # with np.load('model.npz') as f: 238 | # param_values = [f['arr_%d' % i] for i in range(len(f.files))] 239 | # lasagne.layers.set_all_param_values(network, param_values) 240 | 241 | 242 | if __name__ == '__main__': 243 | if ('--help' in sys.argv) or ('-h' in sys.argv): 244 | print("Trains a neural network on MNIST using Lasagne.") 245 | print("Usage: %s [EPOCHS]" % sys.argv[0]) 246 | print() 247 | print("EPOCHS: number of training epochs to perform (default: 500)") 248 | else: 249 | kwargs = {} 250 | if len(sys.argv) > 1: 251 | kwargs['num_epochs'] = int(sys.argv[1]) 252 | main(**kwargs) 253 | 
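The script above already prints training, validation and test metrics; to also see the parameter saving that the TT-layer provides, one can count trainable parameters with Lasagne's `count_params` helper. The following sketch is illustrative only (it is not part of the repository) and assumes it is run from `experiments/mnist` with `src/python` on the Python path:

```python
import lasagne

from mnist import build_mlp  # the model-building function defined above

# TT-based network from this example.
tt_net = build_mlp()
print('TT network parameters:   ',
      lasagne.layers.count_params(tt_net, trainable=True))

# A reference network where the TT-layer is replaced by a dense 800-unit layer.
l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28))
l_h1 = lasagne.layers.DenseLayer(l_in, num_units=800,
                                 nonlinearity=lasagne.nonlinearities.rectify)
l_h2 = lasagne.layers.DenseLayer(l_h1, num_units=800,
                                 nonlinearity=lasagne.nonlinearities.rectify)
dense_net = lasagne.layers.DenseLayer(l_h2, num_units=10,
                                      nonlinearity=lasagne.nonlinearities.softmax)
print('Dense network parameters:',
      lasagne.layers.count_params(dense_net, trainable=True))
```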
--------------------------------------------------------------------------------
/src/matlab/num_params.m:
--------------------------------------------------------------------------------
1 | function num = num_params(layers)
2 | % Compute the number of parameters in a neural network defined by a cell
3 | % array of layers.
4 | num = 0;
5 | for iLayer = 1:numel(layers)
6 |     if isfield(layers{iLayer}, 'weights')
7 |         for iW = 1:numel(layers{iLayer}.weights)
8 |             num = num + numel(layers{iLayer}.weights{iW});
9 |         end
10 |     end
11 |     % Deprecated.
12 |     if isfield(layers{iLayer}, 'filters')
13 |         num = num + numel(layers{iLayer}.filters);
14 |     end
15 |     if isfield(layers{iLayer}, 'biases')
16 |         num = num + numel(layers{iLayer}.biases);
17 |     end
18 | end
19 | end
20 | 
--------------------------------------------------------------------------------
/src/matlab/vl_nntt_backward.m:
--------------------------------------------------------------------------------
1 | function in = vl_nntt_backward(layer, in, out)
2 | % VL_NNTT_BACKWARD Tensor Train layer backward pass
3 | % in = VL_NNTT_BACKWARD(layer, in, out) computes all the necessary
4 | % derivatives for the back-propagation algorithm.
5 | %
6 | % The transformation of the layer (the forward pass) is defined as:
7 | % Y = out.x = layer.W * in.x + biases,
8 | % where the biases are stored in layer.weights{2}.
9 | %
10 | % in.dzdx is the derivative of the neural network's output Z with respect to
11 | % the input in.x;
12 | % in.dzdw{1} is the derivative of Z w.r.t. the cores of the
13 | % TT-decomposition of the matrix W;
14 | % in.dzdw{2} is the derivative of Z w.r.t. the biases.
15 | %
16 | % in.x is of size inHeight x inWidth x inChannels x batchSize.
17 | %
18 | % The complexity of the backward pass is
19 | % O(ttRank^4 * modeSize * numTTCores^2 * inHeight * inWidth * inChannels * batchSize),
20 | % where
21 | % inHeight * inWidth * inChannels == modeSize^numTTCores.
22 | % TODO: make sure that we do not move the weights out of the GPU here
23 | layer.W.core = layer.weights{1};
24 | W = layer.W;
25 | [inHeight, inWidth, inChannels, batchSize] = size(in.x);
26 | 
27 | in.dzdx = full(W' * reshape(out.dzdx, [], batchSize));
28 | in.dzdx = reshape(in.dzdx, inHeight, inWidth, inChannels, batchSize);
29 | 
30 | if numel(layer.weights{2}) > 0
31 |     in.dzdw{2} = sum(out.dzdx, 4);
32 | else
33 |     in.dzdw{2} = [];
34 | end
35 | DZDWCore = zeros(size(W.core), 'single');
36 | if isa(in.x, 'gpuArray')
37 |     DZDWCore = gpuArray(DZDWCore);
38 | end
39 | rankArr = rank(W);
40 | corePos = W.ps;
41 | % We have a TT-matrix W(i_1, ..., i_n; j_1, ..., j_n).
42 | % Y(i, imageIdx) = sum_j W(i, j) * in.x(j, imageIdx) + b(i).
43 | numDims = length(W.n);
44 | coreArr = core2cell(W);
45 | % At the beginning of the derDim iteration rightSum depends on:
46 | % rightSum(alpha_derDim+2, i_derDim+2, ..., i_n, imageIdx, j_1, j_2, ..., j_derDim+1).
47 | rightSum = reshape(in.x, [prod(W.m), batchSize]);
48 | rightSum = rightSum';
49 | for derDim = numDims:-1:1
50 |     % Compute the derivative of Y w.r.t. the core G_{derDim}.
51 |     if (derDim < numDims)
52 |         rightDim = derDim + 1;
53 |         sumSize = W.m(rightDim) * rankArr(rightDim+1);
54 |         core = reshape(coreArr{rightDim}, [], sumSize);
55 |         rightSum = reshape(rightSum, [], W.m(rightDim));
56 |         rightSum = core * reshape(rightSum', sumSize, []);
57 |     end
58 | 
59 |     if derDim >= 2
60 |         % Permute core dimensions from
61 |         % alpha_derDim-1, i_derDim-1, j_derDim-1, alpha_derDim
62 |         % to
63 |         % alpha_derDim-1, i_derDim-1, alpha_derDim, j_derDim-1.
64 | core = permute(coreArr{derDim-1}, [1, 2, 4, 3]); 65 | core = reshape(core, [], W.m(derDim-1)); 66 | % Permute (shift) dimensions from 67 | % alpha_derDim+1, i_derDim+1, ..., i_n, imageIdx, j_1, j_2, ..., j_derDim 68 | % to 69 | % j_derDim-1, j_derDim, alpha_derDim+1, i_derDim+1, ..., i_n, imageIdx, j_1, j_2, ..., j_derDim-2. 70 | leftSum = reshape(rightSum, [rankArr(derDim+1)*prod(W.n(derDim+1:end))*batchSize*prod(W.m(1:derDim-2)), prod(W.m(derDim-1:derDim))]); 71 | leftSum = core * reshape(leftSum.', W.m(derDim-1), []); 72 | % Permute dimensions from 73 | % alpha_derDim-1, i_derDim-1, alpha_derDim, j_derDim, alpha_derDim+1, 74 | % j_1, j_2, ..., j_derDim-2, i_derDim+1, ..., i_n, imageIdx 75 | % to 76 | % alpha_derDim-1, i_derDim-1, i_derDim+1, ..., i_n, 77 | % imageIdx, alpha_derDim, j_derDim, alpha_derDim+1, 78 | % j_1, ..., j_derDim-2. 79 | leftSumDims = [rankArr(derDim-1)*W.n(derDim-1), rankArr(derDim)*W.m(derDim)*rankArr(derDim+1), ... 80 | prod(W.n(derDim+1:end))*batchSize, prod(W.m(1:derDim-2))]; 81 | leftSum = reshape(leftSum, leftSumDims); 82 | leftSum = permute(leftSum, [1, 3, 2, 4]); 83 | % 84 | % On the beginning of the leftDim iteration leftSum depends on: 85 | % leftSum(alpha_leftDim+1, 86 | % i_leftDim+1, ..., i_derDim-1, i_derDim+1, ..., i_n, 87 | % imageIdx, 88 | % alpha_derDim, j_derDim, alpha_derDim+1) 89 | for leftDim = derDim-2:-1:1 90 | sumSize = W.m(leftDim) * rankArr(leftDim+1); 91 | core = reshape(coreArr{leftDim}, [], sumSize); 92 | leftSum = reshape(leftSum, [], W.m(leftDim)); 93 | leftSum = core * reshape(leftSum', sumSize, []); 94 | end 95 | elseif (derDim == 1) 96 | % Permute (shift) dimensions from 97 | % alpha_2, i_2, ..., i_n, imageIdx, j_1 98 | % to 99 | % i_2, ..., i_n, imageIdx, j_1, alpha_2 100 | leftSum = reshape(rightSum, rankArr(derDim+1), [], batchSize, W.m(derDim)); 101 | leftSum = permute(leftSum, [2, 3, 4, 1]); 102 | else 103 | error('Something bad happened('); 104 | end 105 | coreSize = rankArr(derDim) * W.n(derDim) * W.m(derDim) * rankArr(derDim+1); 106 | leftISize = prod(W.n(1:derDim-1)); 107 | rightISize = prod(W.n(derDim+1:end)); 108 | % Permute dimensions from 109 | % i_1, ..., i_n, imageIdx 110 | % to 111 | % i_derDim, i_1, ..., i_derDim-1, i_derDim+1, ..., i_n, imageIdx 112 | currout.dzdx = reshape(out.dzdx, leftISize, W.n(derDim), rightISize*batchSize); 113 | currout.dzdx = permute(currout.dzdx, [2, 1, 3]); 114 | sumSize = leftISize * rightISize * batchSize; 115 | der = reshape(currout.dzdx, [], sumSize) * reshape(leftSum, sumSize, []); 116 | 117 | % Permute derivative dimensions from 118 | % i_derDim, alpha_derDim, j_derDim, alpha_derDim+1 119 | % to 120 | % alpha_derDim, i_derDim, j_derDim, alpha_derDim+1. 121 | der = reshape(der, W.n(derDim), rankArr(derDim), W.m(derDim)*rankArr(derDim+1)); 122 | der = permute(der, [2, 1, 3]); 123 | DZDWCore(corePos(derDim):corePos(derDim+1)-1) = der; 124 | end 125 | in.dzdw{1} = DZDWCore; 126 | end 127 | -------------------------------------------------------------------------------- /src/matlab/vl_nntt_forward.m: -------------------------------------------------------------------------------- 1 | function out = vl_nntt_forward(layer, in, out) 2 | % VL_NNTT_FORWARD Tensor Train layer forward pass 3 | % out = VL_NNTT_FORWARD(layer, in, out) applies a linear operator layer.W 4 | % which is represented in the TT-format to the data in.x: 5 | % out.x = layer.W * in.x + biases, 6 | % where biases are stored in layer.weights{2}. 
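% (Illustrative note, based on experiments/mnist/cnn_mnist_init.m.) In that
% configuration in.x is 32 x 32 x 1 x batchSize, layer.W is a TT-matrix with
% input mode sizes [4 8 8 4] (4*8*8*4 = 1024 = 32*32*1) and output mode sizes
% [5 5 5 5], and the result out.x has size 1 x 1 x 625 x batchSize because
% layer.outChannels = prod([5 5 5 5]) = 625.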
7 | % 8 | % The parameters of the model are the values of TT cores (layer.weights{1}) 9 | % and the biases (layer.weights{2}). 10 | % 11 | % in.x is of size inHeight x inWidth x inChannels x batchSize. 12 | % 13 | % The complexity of the forward pass is 14 | % O(ttRank^2 * modeSize * numTTCores * inHeight * inWidth * inChannels * batchSize), 15 | % where 16 | % inHeight * inWidth * inChannels == modeSize^numTTCores. 17 | 18 | layer.W.core = layer.weights{1}; 19 | [inHeight, inWidth, inChannels, batchSize] = size(in.x); 20 | 21 | 22 | out.x = full(layer.W * reshape(in.x, [], batchSize)); 23 | if numel(layer.weights{2}) > 0 24 | out.x = bsxfun(@plus, out.x, layer.weights{2}(:)); 25 | end 26 | out.x = reshape(out.x, layer.outHeight, layer.outWidth, layer.outChannels, batchSize); 27 | end 28 | -------------------------------------------------------------------------------- /src/matlab/vl_test_ttlayers.m: -------------------------------------------------------------------------------- 1 | function vl_test_ttlayers(gpu, tests) 2 | % VL_TEST_TTLAYERS Test the TT-layer with numeric differentiation 3 | % VL_TEST_TTLAYERS(0) Test the CPU implementation. 4 | % VL_TEST_TTLAYERS(1) Test the GPU implementation. 5 | 6 | range = 100; 7 | 8 | if nargin < 1, gpu = false ; end 9 | if gpu 10 | grandn = @(varargin) range * gpuArray.randn(varargin{:}); 11 | grand = @(varargin) range * gpuArray.rand(varargin{:}); 12 | else 13 | grandn = @(varargin) range * randn(varargin{:}); 14 | grand = @(varargin) range * rand(varargin{:}); 15 | end 16 | 17 | switch gpu 18 | case 0, 19 | fprintf('testing the CPU code\n'); 20 | case 1 21 | fprintf('testing the GPU code\n'); 22 | end 23 | 24 | rng(1); 25 | 26 | if nargin < 2 27 | tests = 1:3; 28 | end 29 | 30 | function y = vl_nntt_forward_weights(layer, in, out, iGroup, values) 31 | layer.weights{iGroup} = values; 32 | outIn = vl_nntt_forward(layer, in, out); 33 | y = outIn.x; 34 | end 35 | 36 | function y = vl_nntt_forward_x(layer, in, out, x) 37 | in.x = x; 38 | outIn = vl_nntt_forward(layer, in, out); 39 | y = outIn.x; 40 | end 41 | 42 | for l = tests 43 | fprintf('test number %d\n', l) 44 | % resets random number generator to obtain reproducible results 45 | if gpu 46 | parallel.gpu.rng(0, 'combRecursive'); 47 | else 48 | rng(0, 'combRecursive'); 49 | end 50 | switch l 51 | case 1 52 | disp('Testing vl_nntt_* with the identity TT-matrix.'); 53 | 54 | in.x = grandn(8, 32, 3, 4, 'single'); 55 | W = tt_ones([4, 4, 4, 4, 3]); 56 | W.core = single(W.core); 57 | W = diag(W); 58 | layer.W = W; 59 | layer.weights{1} = W.core; 60 | if gpu 61 | layer.weights{1} = gpuArray(layer.weights{1}); 62 | end 63 | layer.outHeight = 8; 64 | layer.outWidth = 32; 65 | layer.outChannels = 3; 66 | layer.weights{2} = grandn(8 * 32 * 3, 1, 'single'); 67 | out = []; 68 | out = vl_nntt_forward(layer, in, out); 69 | y = out.x; 70 | out.dzdx = grandn(size(y), 'single'); 71 | in = vl_nntt_backward(layer, in, out); 72 | for iGroup = 1:numel(layer.weights) 73 | vl_testder(@(w) vl_nntt_forward_weights(layer, in, out, iGroup, w), layer.weights{iGroup}, out.dzdx, in.dzdw{iGroup}, range * 1e-2); 74 | end 75 | vl_testder(@(x) vl_nntt_forward_x(layer, in, out, x), in.x, out.dzdx, in.dzdx, range * 1e-2); 76 | 77 | case 2 78 | disp('Testing vl_nntt_* with a random square TT-matrix.'); 79 | % Shape for the input and output tensors. 
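% (Clarifying note.) The TT-matrix in this test is square: each mode has
% tensorShape(k) rows and tensorShape(k) columns, which is why tt_rand below
% is called with mode sizes tensorShape.^2. The matrix therefore maps the
% prod(tensorShape) = 4*4*4*4*3 = 768 = 8*32*3 entries of each input image
% in in.x to an output of the same size.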
80 | tensorShape = [4, 4, 4, 4, 3]; 81 | batchSize = 10; 82 | ranks = [1, 4, 6, 10, 5, 1]; 83 | W = tt_rand(tensorShape.^2, 5, ranks); 84 | W.core = single(W.core); 85 | W = tt_matrix(W, tensorShape, tensorShape); 86 | layer.W = W; 87 | layer.weights{1} = W.core; 88 | if gpu 89 | layer.weights{1} = gpuArray(layer.weights{1}); 90 | end 91 | layer.outHeight = 8; 92 | layer.outWidth = 32; 93 | layer.outChannels = 3; 94 | in.x = grandn(8, 32, 3, batchSize, 'single'); 95 | layer.weights{2} = grandn(8 * 32 * 3, 1, 'single'); 96 | out = []; 97 | out = vl_nntt_forward(layer, in, out); 98 | y = out.x; 99 | exactY = full(W) * reshape(in.x, [], batchSize); 100 | exactY = bsxfun(@plus, exactY, layer.weights{2}); 101 | vl_testsim(y, reshape(exactY, 8, 32, 3, batchSize)); 102 | out.dzdx = grandn(size(y), 'single'); 103 | in = vl_nntt_backward(layer, in, out); 104 | for iGroup = 1:numel(layer.weights) 105 | vl_testder(@(w) vl_nntt_forward_weights(layer, in, out, iGroup, w), layer.weights{iGroup}, out.dzdx, in.dzdw{iGroup}, range * 1e-2); 106 | end 107 | vl_testder(@(x) vl_nntt_forward_x(layer, in, out, x), in.x, out.dzdx, in.dzdx, range * 1e-2); 108 | 109 | case 3 110 | disp('Testing vl_nntt_* with random rectangular TT-matrices.'); 111 | for bias = [false true] 112 | for batchSize = [1 3] 113 | inputTensorShape = [3, 6, 4, 5]; 114 | outputTensorShape = [4, 11, 7, 13]; 115 | layer.outHeight = 2 * 11; 116 | layer.outWidth = 7 * 13; 117 | layer.outChannels = 2; 118 | ranks = [1, 5, 9, 5, 1]; 119 | W = tt_rand(outputTensorShape .* inputTensorShape, length(inputTensorShape), ranks, []); 120 | W.core = single(W.core); 121 | W = tt_matrix(W, outputTensorShape, inputTensorShape); 122 | layer.W = W; 123 | layer.weights{1} = W.core; 124 | if gpu 125 | layer.weights{1} = gpuArray(layer.weights{1}); 126 | end 127 | if bias 128 | layer.weights{2} = grandn(prod(outputTensorShape), 1, 'single'); 129 | else 130 | layer.weights{2} = []; 131 | end 132 | in.x = grandn(9, 8, 5, batchSize, 'single'); 133 | out = []; 134 | out = vl_nntt_forward(layer, in, out); 135 | y = out.x; 136 | exactY = full(W) * reshape(in.x, [], batchSize); 137 | if bias 138 | exactY = bsxfun(@plus, exactY, layer.weights{2}); 139 | end 140 | vl_testsim(y, reshape(exactY, layer.outHeight, layer.outWidth, layer.outChannels, batchSize)); 141 | out.dzdx = grandn(size(y), 'single'); 142 | in = vl_nntt_backward(layer, in, out); 143 | for iGroup = 1:numel(layer.weights) 144 | vl_testder(@(w) vl_nntt_forward_weights(layer, in, out, iGroup, w), layer.weights{iGroup}, out.dzdx, in.dzdw{iGroup}, range * 1e-2); 145 | end 146 | vl_testder(@(x) vl_nntt_forward_x(layer, in, out, x), in.x, out.dzdx, in.dzdx, range * 1e-2); 147 | end 148 | end 149 | end 150 | end 151 | end 152 | -------------------------------------------------------------------------------- /src/python/ttlayer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | class TTLayer(lasagne.layers.Layer): 7 | """ 8 | Parameters 9 | ---------- 10 | References 11 | ---------- 12 | .. 
[1] Tensorizing Neural Networks
13 |        Alexander Novikov, Dmitry Podoprikhin, Anton Osokin, Dmitry Vetrov
14 |        In Advances in Neural Information Processing Systems 28 (NIPS-2015)
15 |     Notes
16 |     -----
17 |     Examples
18 |     --------
19 |     """
20 |     def __init__(self, incoming, tt_input_shape, tt_output_shape, tt_ranks,
21 |                  cores=lasagne.init.Normal(0.01), b=lasagne.init.Constant(0.),
22 |                  nonlinearity=lasagne.nonlinearities.rectify, **kwargs):
23 |         super(TTLayer, self).__init__(incoming, **kwargs)
24 |         self.nonlinearity = (lasagne.nonlinearities.identity if nonlinearity is None
25 |                              else nonlinearity)
26 |         num_inputs = int(np.prod(self.input_shape[1:]))
27 |         tt_input_shape = np.array(tt_input_shape)
28 |         tt_output_shape = np.array(tt_output_shape)
29 |         tt_ranks = np.array(tt_ranks)
30 |         if np.prod(tt_input_shape) != num_inputs:
31 |             raise ValueError("The size of the input tensor (i.e. the product "
32 |                              "of the elements in tt_input_shape) should "
33 |                              "equal the number of input neurons %d." %
34 |                              (num_inputs))
35 |         if tt_input_shape.shape[0] != tt_output_shape.shape[0]:
36 |             raise ValueError("The number of input and output dimensions "
37 |                              "should be the same.")
38 |         if tt_ranks.shape[0] != tt_output_shape.shape[0] + 1:
39 |             raise ValueError("The number of TT-ranks should be "
40 |                              "1 + the number of dimensions.")
41 |         self.tt_input_shape = tt_input_shape
42 |         self.tt_output_shape = tt_output_shape
43 |         self.tt_ranks = tt_ranks
44 | 
45 |         self.num_dim = tt_input_shape.shape[0]
46 |         local_cores_arr = _generate_orthogonal_tt_cores(tt_input_shape,
47 |                                                         tt_output_shape,
48 |                                                         tt_ranks)
49 |         self.cores_arr = self.add_param(local_cores_arr, local_cores_arr.shape,
50 |                                         name='cores_arr')
51 |         if b is None:
52 |             self.b = None
53 |         else:
54 |             num_units = np.prod(tt_output_shape)
55 |             self.b = self.add_param(b, (num_units,), name="b",
56 |                                     regularizable=False)
57 | 
58 |     def get_output_for(self, input, **kwargs):
59 |         # theano.scan doesn't work when intermediate results' shape changes over
60 |         # iterations (see https://github.com/Theano/Theano/issues/2127),
61 |         # so we use a plain `for` loop instead.
62 |         res = input
63 |         # TODO: it may be faster to precompute the indices in advance.
64 |         core_arr_idx = 0
65 |         for k in range(self.num_dim - 1, -1, -1):
66 |             # res is of size o_k+1 x ... x o_d x batch_size x i_1 x ... x i_k-1 x i_k x r_k+1
67 |             curr_shape = (self.tt_input_shape[k] * self.tt_ranks[k + 1], self.tt_ranks[k] * self.tt_output_shape[k])
68 |             curr_core = self.cores_arr[core_arr_idx:core_arr_idx+T.prod(curr_shape)].reshape(curr_shape)
69 |             res = T.dot(res.reshape((-1, curr_shape[0])), curr_core)
70 |             # res is of size o_k+1 x ... x o_d x batch_size x i_1 x ... x i_k-1 x r_k x o_k
71 |             res = T.transpose(res.reshape((-1, self.tt_output_shape[k])))
72 |             # res is of size o_k x o_k+1 x ... x o_d x batch_size x i_1 x ... x i_k-1 x r_k
73 |             core_arr_idx += T.prod(curr_shape)
74 |         # res is of size o_1 x ... x o_d x batch_size
75 |         res = T.transpose(res.reshape((-1, input.shape[0])))
76 |         # res is of size batch_size x o_1 x ... x o_d
77 |         if self.b is not None:
78 |             res = res + self.b.dimshuffle('x', 0)
79 |         return self.nonlinearity(res)
80 | 
81 |     def get_output_shape_for(self, input_shape):
82 |         return (input_shape[0], np.prod(self.tt_output_shape))
83 | 
84 | 
85 | def _generate_orthogonal_tt_cores(input_shape, output_shape, ranks):
86 |     # Generate a random orthogonalized TT-tensor.
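    # (Descriptive note.) Each core is drawn from a standard normal, reshaped
    # into a tall (r_k * in_k * out_k) x r_{k+1} matrix and orthogonalized with
    # a QR decomposition; the R factor is folded into the next core, so the
    # stacked cores are orthogonalized before the final rescaling. All cores
    # are stored flattened in a single 1-D array, the layout that
    # TTLayer.get_output_for indexes into.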
87 |     input_shape = np.array(input_shape)
88 |     output_shape = np.array(output_shape)
89 |     ranks = np.array(ranks)
90 |     cores_arr_len = np.sum(input_shape * output_shape *
91 |                            ranks[1:] * ranks[:-1])
92 |     cores_arr = lasagne.utils.floatX(np.zeros(cores_arr_len))
93 |     cores_arr_idx = 0
94 |     core_list = []
95 |     rv = 1
96 |     for k in range(input_shape.shape[0]):
97 |         shape = [ranks[k], input_shape[k], output_shape[k], ranks[k+1]]
98 |         tall_shape = (np.prod(shape[:3]), shape[3])
99 |         curr_core = np.dot(rv, lasagne.random.get_rng().normal(0, 1, size=(shape[0], np.prod(shape[1:]))))
100 |         curr_core = curr_core.reshape(tall_shape)
101 |         if k < input_shape.shape[0]-1:
102 |             curr_core, rv = np.linalg.qr(curr_core)
103 |         cores_arr[cores_arr_idx:cores_arr_idx+curr_core.size] = curr_core.flatten()
104 |         cores_arr_idx += curr_core.size
105 |     # TODO: use something reasonable instead of this dirty hack.
106 |     glorot_style = (np.prod(input_shape) * np.prod(ranks))**(1.0 / input_shape.shape[0])
107 |     return (0.1 / glorot_style) * lasagne.utils.floatX(cores_arr)
108 | 
--------------------------------------------------------------------------------
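The `Examples` section of the `TTLayer` docstring above is left empty; the following minimal sketch (not part of the repository) shows one way to instantiate the layer and run a forward pass, assuming `src/python` is on the Python path:

```python
import numpy as np
import theano
import theano.tensor as T
import lasagne

from ttlayer import TTLayer

# Factorize the 1*28*28 = 784 input entries as 4*7*4*7 and request
# 5*5*8*4 = 800 outputs, with all intermediate TT-ranks equal to 3.
l_in = lasagne.layers.InputLayer(shape=(None, 1, 28, 28))
l_tt = TTLayer(l_in,
               tt_input_shape=[4, 7, 4, 7],
               tt_output_shape=[5, 5, 8, 4],
               tt_ranks=[1, 3, 3, 3, 1],
               nonlinearity=lasagne.nonlinearities.rectify)

x = T.tensor4('x')
forward = theano.function([x], lasagne.layers.get_output(l_tt, x))
out = forward(np.random.rand(2, 1, 28, 28).astype(theano.config.floatX))
print(out.shape)  # expected: (2, 800)
```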