├── custom
│   ├── __init__.py
│   ├── nonlinearities.py
│   ├── objectives.py
│   └── updates.py
├── utils
│   ├── __init__.py
│   ├── regularization.py
│   ├── io.py
│   ├── data_structures.py
│   ├── signal.py
│   ├── lcn.py
│   ├── draw_net.py
│   └── ffmpeg.py
├── modelzoo
│   ├── __init__.py
│   ├── pretrained_encoder.py
│   ├── lstm_classifier_majority_vote.py
│   ├── deltanet_v1.py
│   ├── autoencoder.py
│   ├── lstm_classifier_baseline.py
│   ├── deltanet.py
│   ├── avletters_convae.py
│   ├── avletters_convae_bn.py
│   ├── avletters_convae_drop.py
│   ├── adenet_v2.py
│   ├── avletters_convae_bndrop.py
│   ├── adenet_v1.py
│   ├── adenet_v1_1.py
│   ├── baseline_end2end.py
│   ├── avnet.py
│   ├── adenet_3stream_dct.py
│   ├── adenet_v2_4.py
│   ├── adenet_v2_nodelta.py
│   ├── adenet_v2_2.py
│   ├── adenet_v2_3.py
│   └── adenet_v4.py
├── dbn
│   ├── displayImage.m
│   ├── computeDCTfeatAndDeltas.m
│   ├── computeDCTfeat.m
│   ├── resizeImages.m
│   ├── extractNN.m
│   ├── deltas.m
│   ├── DCT_Features.m
│   ├── computeStates.m
│   ├── normaliseData.m
│   ├── RBMup.m
│   ├── RBMdown.m
│   ├── unfoldDBNtoNN.m
│   ├── computeActivations.m
│   ├── visualiseHiddenLayerWeights.m
│   ├── unfoldDBNToClsf.m
│   ├── dbnParamsInit.m
│   ├── trainDBN.m
│   ├── exampleDBN_AE.m
│   ├── unfoldDBNtoAE.m
│   └── zigzag.m
├── .gitignore
├── oulu
│   ├── playvid.py
│   ├── preprocess_images.py
│   ├── landmarking.py
│   └── ae_finetuner.py
├── test
│   ├── test_model_io.py
│   ├── test_preprocessing.py
│   └── test_gen_batch_from_file.py
├── avletters
│   ├── preprocess_images.py
│   └── ae_finetuner.py
├── runners
│   ├── extract_encoder_from_model.py
│   └── extract_lstm_from_model.py
├── avletters2
│   └── prepare_data.py
├── README.md
├── landmarking
│   └── landmarker.py
└── cuave
    └── prepare_data.py

/custom/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/modelzoo/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dbn/displayImage.m:
--------------------------------------------------------------------------------
1 | function [] = displayImage( image1D, h, w )
2 | %DISPLAYIMAGE Display a row-vectorised image
3 | %   Reshapes image1D into an h-by-w image, rescales it to [0, 1] with mat2gray and shows it.
4 | image = mat2gray(reshape(image1D, h, w));
5 | imshow(image);
6 | 
7 | end
8 | 
--------------------------------------------------------------------------------
/dbn/computeDCTfeatAndDeltas.m:
--------------------------------------------------------------------------------
1 | function dctFeatures = computeDCTfeatAndDeltas(dataMatrix, w, h, noCoeff)
2 | dctFeatures = computeDCTfeat(dataMatrix, w, h, noCoeff);
3 | d1 = deltas(dctFeatures, 9);
4 | d2 = deltas(d1, 9);
5 | dctFeatures = horzcat(dctFeatures, d1, d2);
6 | end
7 | 
--------------------------------------------------------------------------------
/dbn/computeDCTfeat.m:
--------------------------------------------------------------------------------
 1 | function dctFeatures = computeDCTfeat(dataMatrix, w, h, noCoeff)
 2 | % Reshape each row of dataMatrix into an h-by-w image and extract
 3 | % noCoeff zig-zag DCT coefficients per image.
 4 | 
 5 | [noIm, dim] = size(dataMatrix);
 6 | imMatrix = zeros(h, w, noIm);
 7 | 
 8 | for i = 1:noIm
 9 |     imMatrix(:,:,i) = reshape(dataMatrix(i,:), h, w);
10 | end
11 | 
12 | dctFeatures = DCT_Features(imMatrix,noCoeff,[]);
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # DS_Store
 2 | .DS_Store
 3 | **/.DS_Store
 4 | 
 5 | # .idea
 6 | .idea/
 7 | 
 8 | # ipython checkpoints
 9 | **/.ipynb_checkpoints/
10 | 
11 | # data and model files
12 | */data
13 | */models
14 | */results
15 | config/
16 | 
17 | examples/
18 | 
19 | # .pyc files
20 | *.pyc
21 | 
22 | # generated images, scripts and result files
23 | *.png
24 | *.sh
25 | *.csv
26 | runners/experiments/
27 | 
--------------------------------------------------------------------------------
/dbn/resizeImages.m:
--------------------------------------------------------------------------------
 1 | function [ imMatrix ] = resizeImages( dataMatrix, oldHt, oldWt, newHt, newWt )
 2 | %RESIZEIMAGES Resize a matrix of row-vectorised images
 3 | %   Each row of dataMatrix holds one oldHt-by-oldWt image; each output row holds the same image resized to newHt-by-newWt.
 4 | [noIm, ~] = size(dataMatrix);
 5 | imMatrix = zeros(noIm, newWt * newHt);
 6 | 
 7 | for i = 1:noIm
 8 |     img = reshape(dataMatrix(i,:), oldHt, oldWt);
 9 |     img = imresize(img, [newHt, newWt]);
10 |     imMatrix(i,:) = reshape(img, 1, newHt * newWt);
11 | end
12 | 
13 | end
--------------------------------------------------------------------------------
/dbn/extractNN.m:
--------------------------------------------------------------------------------
 1 | function [w1,w2,w3,w4,w5,w6,w7,w8,b1,b2,b3,b4,b5,b6,b7,b8] = extractNN( nn )
 2 | %EXTRACTNN Unpack the weights and biases of an 8-layer network structure
 3 | %   Returns each layer's weight matrix and bias vector as separate outputs, e.g. for saving to a .mat file.
 4 | w1 = nn.W{1,1};
 5 | w2 = nn.W{1,2};
 6 | w3 = nn.W{1,3};
 7 | w4 = nn.W{1,4};
 8 | w5 = nn.W{1,5};
 9 | w6 = nn.W{1,6};
10 | w7 = nn.W{1,7};
11 | w8 = nn.W{1,8};
12 | b1 = nn.biases{1,1};
13 | b2 = nn.biases{1,2};
14 | b3 = nn.biases{1,3};
15 | b4 = nn.biases{1,4};
16 | b5 = nn.biases{1,5};
17 | b6 = nn.biases{1,6};
18 | b7 = nn.biases{1,7};
19 | b8 = nn.biases{1,8};
20 | end
21 | 
22 | 
--------------------------------------------------------------------------------
/custom/nonlinearities.py:
--------------------------------------------------------------------------------
 1 | from lasagne.nonlinearities import *
 2 | 
 3 | 
 4 | def select_nonlinearity(string):
 5 |     nonlinearities = {'rectify': rectify,
 6 |                       'sigmoid': sigmoid,
 7 |                       'leaky_rectify': leaky_rectify,
 8 |                       'very_leaky_rectify': very_leaky_rectify,
 9 |                       'tanh': tanh,
10 |                       'linear': linear,
11 |                       'softmax': softmax,
12 |                       'softplus': softplus,
13 |                       'elu': elu,
14 |                       'scaled_tanh': ScaledTanh,
15 |                       'identity': identity}
16 |     return nonlinearities[string]
17 | 
--------------------------------------------------------------------------------
/utils/regularization.py:
--------------------------------------------------------------------------------
 1 | def early_stop(cost_window):
 2 |     """Stop when the validation cost has strictly increased across the whole window."""
 3 |     if len(cost_window) < 2:
 4 |         return False
 5 |     curr = cost_window[0]
 6 |     for idx, cost in enumerate(cost_window):
 7 |         if curr < cost or idx == 0:
 8 |             curr = cost
 9 |         else:
10 |             return False
11 |     return True
12 | 
13 | 
14 | def early_stop2(cost_window, min_val_cost, threshold):
15 |     """Stop when at least `threshold` costs in the window exceed the best validation cost seen."""
16 |     if len(cost_window) < 2:
17 |         return False
18 |     count = 0
19 |     for cost in cost_window:
20 |         if cost > min_val_cost:
21 |             count += 1
22 |     return count >= threshold
--------------------------------------------------------------------------------
/oulu/playvid.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import numpy as np
 3 | import cv2
 4 | 
 5 | for idx in range(31, 61):
 6 |     videofile = '../examples/data/s30_v1_u{}.mp4'.format(idx)
 7 |     print('video file: {}'.format(videofile))
 8 |     cap = cv2.VideoCapture(videofile)
 9 | 
10 |     while cap.isOpened():
11 |         ret, frame = cap.read()
12 |         if ret:
13 |             frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_LINEAR)
14 |             gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
15 |             cv2.imshow('frame', gray)
16 |         else:
17 |             break
18 |         if cv2.waitKey(1) & 0xFF == ord('q'):
19 |             break
20 |     cap.release()
21 | cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/dbn/deltas.m:
--------------------------------------------------------------------------------
 1 | function d = deltas(x, w)
 2 | % D = deltas(X,W)  Calculate the deltas (derivatives) of a sequence
 3 | %    Use a W-point window (W odd, default 9) to calculate deltas using a
 4 | %    simple linear slope.  This mirrors the delta calculation performed
 5 | %    in feacalc etc.  Each row of X is filtered separately.
 6 | % 2003-06-30 dpwe@ee.columbia.edu
 7 | 
 8 | if nargin < 2
 9 |     w = 9;
10 | end
11 | 
12 | [nr,nc] = size(x);
13 | 
14 | % Define window shape
15 | hlen = floor(w/2);
16 | w = 2*hlen + 1;
17 | win = hlen:-1:-hlen;
18 | 
19 | % pad data by repeating first and last columns
20 | xx = [repmat(x(:,1),1,hlen),x,repmat(x(:,end),1,hlen)];
21 | 
22 | % Apply the delta filter
23 | d = filter(win, 1, xx, [], 2);  % filter along dim 2 (rows)
24 | 
25 | % Trim edges
26 | d = d(:,2*hlen + [1:nc]);
27 | 
28 | 
--------------------------------------------------------------------------------
/modelzoo/pretrained_encoder.py:
--------------------------------------------------------------------------------
 1 | from lasagne.layers import DenseLayer
 2 | 
 3 | 
 4 | def create_pretrained_encoder(incoming, weights, biases, shapes, nonlinearities, names):
 5 |     encoder = DenseLayer(incoming, shapes[0], W=weights[0], b=biases[0], nonlinearity=nonlinearities[0], name=names[0])
 6 |     for i, num_units in enumerate(shapes[1:], 1):
 7 |         encoder = DenseLayer(encoder, shapes[i], W=weights[i], b=biases[i],
 8 |                              nonlinearity=nonlinearities[i], name=names[i])
 9 |     return encoder
10 | 
11 | 
12 | def create_encoder(incoming, shapes, nonlinearities, names):
13 |     encoder = DenseLayer(incoming, shapes[0], nonlinearity=nonlinearities[0], name=names[0])
14 |     for i, num_units in enumerate(shapes[1:], 1):
15 |         encoder = DenseLayer(encoder, shapes[i], nonlinearity=nonlinearities[i], name=names[i])
16 |     return encoder
17 | 
--------------------------------------------------------------------------------
/dbn/DCT_Features.m:
--------------------------------------------------------------------------------
 1 | function features = DCT_Features(ROIs,NumberOfCoefs2Keep,visualize)
 2 | % Extract plain DCT features for given ROIs.
 3 | % Coefs are computed over the whole ROIs (non-block approach).
 4 | % Keep 2:NumberOfCoefs2Keep+1 zig-zag arranged coefs.
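%
% A minimal usage sketch (sizes and variable names are illustrative, not
% taken from this repo):
%   rois  = rand(30, 40, 5);            % stack of five 30x40 mouth ROIs
%   feats = DCT_Features(rois, 30, 0);  % -> 5x30 matrix of zig-zag DCT coefs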
5 | 6 | 7 | if nargin<3 || isempty(visualize) 8 | visualize = 0; 9 | end 10 | nFrames = size(ROIs,3); 11 | % Initialization of zigzag vectors 12 | features = zeros(nFrames,NumberOfCoefs2Keep); 13 | 14 | if visualize == 1 15 | figure; 16 | end 17 | 18 | for i=1:nFrames 19 | CurrentFrame = ROIs(:,:,i); 20 | DCTImage = dct2(CurrentFrame); 21 | DCTzigzagVector = zigzag(DCTImage); 22 | features(i,:) = DCTzigzagVector(2:NumberOfCoefs2Keep+1); 23 | if visualize == 1 24 | imshow(DCTImage,[]), colormap(jet(64)) 25 | drawnow 26 | pause(0.04) 27 | end 28 | clear DCTImage DCTzigzagvector; 29 | end 30 | 31 | 32 | end 33 | 34 | -------------------------------------------------------------------------------- /dbn/computeStates.m: -------------------------------------------------------------------------------- 1 | function states = computeStates(layerType, probs, data) 2 | % computeStates - Computes states of hidden/visible layer of an RBM 3 | 4 | % INPUTS 5 | % layerType: activation function of given layer, e.g. 'sigm', 'linear', 6 | % 'ReLu' 7 | 8 | % probs: activation matrix, noExamples x noNeurons 9 | 10 | % data: data matrix, it's the input to the neurons, noExamples x noNeurons 11 | 12 | % OUTPUTS 13 | % states: states matrix, noExamples x noNeurons 14 | 15 | 16 | [numExamples,numHid] = size(probs); 17 | 18 | if strcmpi(layerType,'sigm') 19 | 20 | states = probs > rand(numExamples,numHid); 21 | 22 | elseif strcmpi(layerType,'linear') 23 | 24 | states = probs + randn(numExamples,numHid); 25 | 26 | elseif strcmpi(layerType,'ReLu') 27 | 28 | 29 | sigma = 1./(1 + exp(-data)); 30 | noise = sigma .* randn(numExamples, numHid); 31 | states = max(0,data + noise); 32 | 33 | end 34 | 35 | 36 | -------------------------------------------------------------------------------- /dbn/normaliseData.m: -------------------------------------------------------------------------------- 1 | function [data,PS] = normaliseData(trFcn, data, PS) 2 | 3 | % in case of linear visible layer it is recommended by Hinton in "A practical guide 4 | %to training RBMs" to make each dimension of the feature vector to have 5 | %zero mean and unit standard deviation. 6 | if strcmpi(trFcn, 'linear') 7 | 8 | if isempty(PS) 9 | ymean = 0; 10 | ystd = 1; 11 | [data,PS] = mapstd(data,ymean,ystd); 12 | else 13 | data = mapstd('apply',data,PS); 14 | 15 | end 16 | % [data,PS] = mapstd(data',ymean,ystd); 17 | % data = data'; 18 | 19 | % each image is zero normalised and divided by the std over all pixers over 20 | % all images 21 | % s = std(data(:)); 22 | % 23 | % [dataTemp,PS] = mapstd(data,ymean,ystd); 24 | % PS.xstd = repmat(s,size(data, 1),1); 25 | % [data,PS] = mapstd('apply',data,PS); 26 | 27 | 28 | 29 | % in case the activation function of the visible layer is "sigm" i.e. data 30 | % are binary, then simply divide by the max value so the data are in the 31 | % range [0, 1]. 
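% For example (illustrative values, not from this codebase), 'sigm' mode
% simply divides by the global maximum:
%   normaliseData('sigm', [0 128 255], [])   % -> [0 0.502 1]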
32 | elseif strcmpi(trFcn, 'sigm') 33 | data = data/max(data(:)); %255; 34 | end -------------------------------------------------------------------------------- /dbn/RBMup.m: -------------------------------------------------------------------------------- 1 | function [activations, states] = RBMup(data, weights, hidbiases, hL_type) 2 | % RBMup - Computes activations and states of RBM's hidden layer 3 | 4 | % INPUTS 5 | % data: data matrix, noExamples x noDimensions 6 | 7 | % weights: matrix containing the RBM weights, noVisibleUnits x 8 | % noHiddenUNits 9 | 10 | %hidbiases: biases of hidden layer, 1 x NoVisibleNeurons 11 | 12 | % hL_type: activation function of hidden layer, e.g. 'sigm', 'linear', 13 | % 'ReLu' 14 | 15 | % OUTPUTS 16 | % activations: activation matrix, noExamples x noNeurons (hidden neurons) 17 | 18 | % states: states of hidden neurons, noExamples x noNeurons (hidden neurons) 19 | 20 | [numExamples numDims] = size(data); 21 | 22 | % input to hidden neurons - batchSize x noHidden neurons, each row 23 | % contains the input to the hidden units 24 | hidInp = data * weights; 25 | 26 | % create biases matrix 27 | hidBiasesMatrx = repmat(hidbiases,numExamples,1); 28 | 29 | finalHidInp = hidInp + hidBiasesMatrx; 30 | 31 | % contains activations of hidden units, batchSize x noHidden neurons 32 | activations = computeActivations(hL_type, finalHidInp); 33 | 34 | % compute hidden states 35 | states = computeStates(hL_type, activations, finalHidInp); -------------------------------------------------------------------------------- /utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import scipy.io as sio 3 | import lasagne as las 4 | sys.path.insert(0, '../') 5 | try: 6 | import cPickle as pickle 7 | except: 8 | import pickle 9 | 10 | 11 | def read_data_split_file(path, sep=','): 12 | with open(path) as f: 13 | subjects = f.readline().split(sep) 14 | subjects = [int(s) for s in subjects] 15 | return subjects 16 | 17 | 18 | def load_mat_file(path): 19 | """ 20 | Loads .mat file 21 | :param path: path to .mat file 22 | :return: dictionary containing .mat data 23 | """ 24 | return sio.loadmat(path) 25 | 26 | 27 | def save_mat(dict, path): 28 | print('save matlab file...') 29 | sio.savemat(path, dict) 30 | 31 | 32 | def save_model(model, path): 33 | pickle.dump(model, open(path, 'wb')) 34 | 35 | 36 | def load_model(path): 37 | return pickle.load(open(path, 'rb')) 38 | 39 | 40 | def save_model_params(network, path): 41 | all_param_values = las.layers.get_all_param_values(network) 42 | pickle.dump(all_param_values, open(path, 'wb')) 43 | 44 | 45 | def load_model_params(network, path): 46 | all_param_values = pickle.load(open(path, 'rb')) 47 | las.layers.set_all_param_values(network, all_param_values) 48 | return network 49 | -------------------------------------------------------------------------------- /dbn/RBMdown.m: -------------------------------------------------------------------------------- 1 | function [activations, states] = RBMdown(data, weights, visbiases, vL_type) 2 | % RBMdown - Computes activations and states of RBM's hidden layer 3 | 4 | % INPUTS 5 | % data: data matrix, noExamples x noDimensions 6 | 7 | % weights: matrix containing the RBM weights, noVisibleUnits x 8 | % noHiddenUNits 9 | 10 | % visbiases: biases of visible layer, 1 x NoVisibleNeurons 11 | 12 | % vL_type: activation function of visible layer, e.g. 
'sigm', 'linear', 13 | % 'ReLu' 14 | 15 | % OUTPUTS 16 | % activations: activation matrix, noExamples x noNeurons (visible neurons) 17 | 18 | % states: states of visible neurons, noExamples x noNeurons (visible neurons) 19 | 20 | 21 | % batchSize x noDims, each row contains one example generated from the 22 | % hidden states through backpopagating their states multiplied by the 23 | % weights 24 | numExamples = size(data, 1); 25 | 26 | inpFromHidden = data * weights'; 27 | 28 | visBiasesMatrix = repmat(visbiases,numExamples,1); 29 | 30 | finalVisInput = inpFromHidden + visBiasesMatrix; 31 | 32 | %activations of visible units 33 | activations = computeActivations(vL_type, finalVisInput); 34 | 35 | % compute visible states 36 | states = computeStates(vL_type, activations, finalVisInput); 37 | -------------------------------------------------------------------------------- /dbn/unfoldDBNtoNN.m: -------------------------------------------------------------------------------- 1 | function nn = unfoldDBNtoNN(dbnParams, dbn, outputSize) 2 | % unfoldDBNtoNN - Unfolds DBN to NN 3 | 4 | % INPUTS 5 | % dbnParams: structure containing the DBN params, see manual for more 6 | % details 7 | 8 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 9 | % the visible biases (visbiases) for each RBM layer 10 | 11 | % outputSize: size of output layer 12 | 13 | % OUTPUTS 14 | % nn: neural network structure, see manual for details 15 | 16 | 17 | if dbnParams.type == 1 % AE 18 | 19 | 20 | disp('Unfolding DBN to AE') 21 | 22 | [weightsAE, biasesAE, newActivationFunctions, newLayers] = unfoldDBNtoAE(dbnParams, dbn, outputSize); 23 | % nn = paramsNNinit(newLayers, newActivationFunctions); 24 | nn.activationFunctions = newActivationFunctions; 25 | nn.layers = newLayers; 26 | nn.W = weightsAE; 27 | nn.biases = biasesAE; 28 | 29 | 30 | elseif dbnParams.type == 2 % classification 31 | 32 | disp('Unfolding DBN to Classifier') 33 | 34 | [weightsClsf, biasesClsf, newActivationFunctions, newLayers] = unfoldDBNToClsf(dbnParams, dbn, outputSize); 35 | nn = paramsNNinit(newLayers, newActivationFunctions); 36 | nn.W = weightsClsf; 37 | nn.biases = biasesClsf; 38 | 39 | end 40 | 41 | 42 | nn.pretraining = 1; 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /dbn/computeActivations.m: -------------------------------------------------------------------------------- 1 | function activations = computeActivations(layerType, data) 2 | % computeActivations - Computes activations of a hidden or output layer 3 | 4 | % INPUTS 5 | % layerType: activation function of given layer, e.g. 
'sigm', 'linear', 6 | % 'ReLu' 7 | 8 | % data: data matrix, it's the input to the neurons, noExamples x noNeurons 9 | 10 | % OUTPUTS 11 | % activations: activation matrix, noExamples x noNeurons 12 | 13 | outputSize = size(data, 2); 14 | 15 | if strcmpi(layerType,'sigm') 16 | 17 | activations = 1./(1 + exp(-data)); 18 | 19 | elseif strcmpi(layerType,'tanh') 20 | 21 | activations = 2 * (1./(1 + exp(-2*data))) - 1; % tanh(z) = 2*sigm(2z) - 1 22 | 23 | elseif strcmpi(layerType,'linear') 24 | 25 | activations = data; 26 | 27 | elseif strcmpi(layerType,'ReLu') 28 | 29 | activations = max(0,data); 30 | 31 | elseif strcmpi(layerType, 'leakyReLu') 32 | 33 | activations = max(0.01 * data, data); 34 | 35 | elseif strcmpi(layerType, 'softplus') 36 | 37 | activations = log(1 + exp(data)); 38 | 39 | elseif strcmpi(layerType, 'softsign') 40 | 41 | activations = data ./ (1 + abs(data)); 42 | 43 | elseif strcmpi(layerType, 'softmax') 44 | 45 | activNominator = exp(data); 46 | sumActiv = sum(activNominator, 2); 47 | activations = activNominator ./ repmat(sumActiv, 1, outputSize); 48 | 49 | end -------------------------------------------------------------------------------- /dbn/visualiseHiddenLayerWeights.m: -------------------------------------------------------------------------------- 1 | function visualiseHiddenLayerWeights(weights,col,row,noImageRows) 2 | % visualiseHiddenLayerWeights - Visualises as an image the given weights 3 | 4 | % INPUTS 5 | % weights: weightMatrix, noInputs x noHiddenNeurons (first hidden layer, 6 | % since we usually visualise weights of the first hidden layer only) 7 | 8 | % col: number of image columns 9 | 10 | % row: number of image rows 11 | % The product of col and row must be equal to the number of inputs, i.e., 12 | % the number of rows of the weights matrix 13 | 14 | % noImageRows: number of image rows, i.e., if 10 then there will be 10 rows 15 | % of images where each row will contain floor(noHiddenNeurons / noImageRows) 16 | 17 | [inpSize, N] = size(weights); 18 | 19 | % find minimum/maximum weight value 20 | minValue = min(weights(:)); 21 | maxValue = max(weights(:)); 22 | 23 | % no images per Row 24 | noExPerRow = floor(N / noImageRows); 25 | 26 | img2Disp = cell(noImageRows, noExPerRow); 27 | 28 | 29 | for i = 1:noImageRows 30 | 31 | baseInd = (i - 1) * noExPerRow; 32 | 33 | for j = 1:noExPerRow 34 | 35 | selInd = baseInd + j; 36 | 37 | img = reshape(weights(:,selInd),row,col); 38 | 39 | img(:,end+1:end+3) = minValue; 40 | img(end+1:end+3,:) = minValue; 41 | 42 | img2Disp{i,j} = img; 43 | 44 | end 45 | 46 | end 47 | 48 | img2DispFinal = cell2mat(img2Disp); 49 | imagesc(img2DispFinal,[minValue,maxValue]); colormap gray; axis equal; axis off; 50 | 51 | 52 | drawnow; 53 | 54 | 55 | -------------------------------------------------------------------------------- /test/test_model_io.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import theano.tensor as T 4 | from lasagne.nonlinearities import rectify, linear 5 | from modelzoo import deltanet_majority_vote 6 | from utils.io import save_mat 7 | 8 | 9 | class TestModelIO(unittest.TestCase): 10 | def test_load_params(self): 11 | window = T.iscalar('theta') 12 | inputs1 = T.tensor3('inputs1', dtype='float32') 13 | mask = T.matrix('mask', dtype='uint8') 14 | network = deltanet_majority_vote.load_saved_model('../oulu/results/best_models/1stream_mfcc_w3s3.6.pkl', 15 | ([500, 200, 100, 50], [rectify, rectify, rectify, linear]), 16 | (None, 
None, 91), inputs1, (None, None), mask, 17 | 250, window, 10) 18 | d = deltanet_majority_vote.extract_encoder_weights(network, ['fc1', 'fc2', 'fc3', 'bottleneck'], 19 | [('w1', 'b1'), ('w2', 'b2'), ('w3', 'b3'), ('w4', 'b4')]) 20 | b = deltanet_majority_vote.extract_lstm_weights(network, ['f_blstm1', 'b_blstm1'], 21 | ['flstm', 'blstm']) 22 | expected_keys = ['w1', 'w2', 'w3', 'w4', 'b1', 'b2', 'b3', 'b4'] 23 | keys = d.keys() 24 | for k in keys: 25 | assert k in expected_keys 26 | assert type(d[k]) == np.ndarray 27 | save_mat(d, '../oulu/models/oulu_1stream_mfcc_w3s3.mat') 28 | 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /dbn/unfoldDBNToClsf.m: -------------------------------------------------------------------------------- 1 | function [weightsClsf, biasesClsf, newActivationFunctions newLayers] = unfoldDBNToClsf(dbnParams,dbn,outputSize) 2 | % unfoldDBNToClsf - Unfolds DBN to NN for classification purposes 3 | 4 | % INPUTS 5 | % dbnParams: structure containing the DBN params, see manual for more 6 | % details 7 | 8 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 9 | % the visible biases (visbiases) for each RBM layer 10 | 11 | % outputSize: size of output layer 12 | 13 | % OUTPUTS 14 | % weightsClsf: 1xN cell array, where N is the number of layers (hidden + output 15 | % layer), each cell contains the weights of the corresponding layer 16 | 17 | % biasesClsf: 1xN cell array, where N is the number of layers (hidden + output 18 | % layer), each cell contains the biases of the corresponding layer 19 | 20 | % newActivationFunctions: 1xN cell array, where N is the number of layers (hidden + output 21 | % layer), each cell contains the activation function of the corresponding layer 22 | 23 | % newLayers: 1xN vector, where N is the number of layers (hidden + output 24 | % layer), each entry contains the size of the corresponding layer 25 | 26 | % if classification then last layer is softmax 27 | newActivationFunctions = [dbnParams.hiddenActivationFunctions 'softmax']; 28 | 29 | newLayers = [dbnParams.hiddenLayers outputSize]; 30 | 31 | % initialise weights/biases of new layer 32 | % hinton in his code initialises the last layer like this 33 | % http://www.cs.toronto.edu/~hinton/MatlabForSciencePaper.html 34 | lastLayerW = 0.1*randn(newLayers(end - 1), outputSize); 35 | lastLayerBiases = 0.1*randn(1, outputSize); 36 | 37 | weightsClsf = [dbn.W lastLayerW]; 38 | biasesClsf = [dbn.hidbiases lastLayerBiases]; 39 | 40 | 41 | -------------------------------------------------------------------------------- /custom/objectives.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as tt 2 | 3 | 4 | def temporal_softmax_loss(x, y, mask): 5 | """ 6 | A temporal version of softmax loss for use in RNNs. We assume that we are 7 | making predictions over a vocabulary of size V for each timestep of a 8 | timeseries of length T, over a minibatch of size N. The input x gives scores 9 | for all vocabulary elements at all timesteps, and y gives the indices of the 10 | ground-truth element at each timestep. We use a cross-entropy loss at each 11 | timestep, summing the loss over all timesteps and averaging across the 12 | minibatch. 
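    For instance (illustrative sizes): with N=2 sequences of T=3 timesteps
    over a V=4 vocabulary, x has shape (2, 3, 4), y has shape (2, 3), and the
    returned loss is the cross-entropy averaged over the frames the mask keeps.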
13 | As an additional complication, we may want to ignore the model output at some 14 | timesteps, since sequences of different length may have been combined into a 15 | minibatch and padded with NULL tokens. The optional mask argument tells us 16 | which elements should contribute to the loss. 17 | Inputs: 18 | - x: Input scores, of shape (N, T, V) 19 | - y: Ground-truth indices, of shape (N, T) where each element is in the range 20 | 0 <= y[i, t] < V 21 | - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not 22 | the scores at x[i, t] should contribute to the loss. 23 | Returns a tuple of: 24 | - loss: Scalar giving loss 25 | """ 26 | 27 | N, T, V = x.shape 28 | 29 | x_flat = x.reshape((N * T, V)) 30 | y_flat = y.reshape((N * T,)) 31 | mask_flat = mask.reshape((N * T,)) 32 | total_frames = tt.sum(mask_flat) 33 | 34 | probs = tt.exp(x_flat - tt.max(x_flat, axis=1, keepdims=True)) 35 | probs /= tt.sum(probs, axis=1, keepdims=True) 36 | # loss = -tt.sum(mask_flat * tt.log(probs[tt.arange(N * T), y_flat])) / N 37 | loss = -tt.sum(mask_flat * tt.log(probs[tt.arange(N * T), y_flat])) / total_frames 38 | 39 | return loss 40 | -------------------------------------------------------------------------------- /dbn/dbnParamsInit.m: -------------------------------------------------------------------------------- 1 | function dbnParams = dbnParamsInit(type,hiddenActivationFunctions, hiddenLayers) 2 | % dbnParamsInit - Create Parameters for DBN 3 | 4 | % INPUTS 5 | % type: type of DBN to be trained, 1 is AE, 2 is classifier 6 | 7 | % hiddenActivationFunctions: 1xN cell array, where N is the number of 8 | % hidden layers, each cell contains the activation function ('sigm', 'linear', 'ReLu') of the 9 | % corresponding layer, e.g., {'sigm' 'sigm' 'sigm' 'sigm'} 10 | 11 | % hiddenLayers: 1xN vector, where N is the number of 12 | % hidden layers, each entry contains the size of the 13 | % corresponding hidden layer, e.g., [500 500 500 200] 14 | 15 | % OUTPUTS 16 | % dbnParams: structure which contains the dbnParams, see the manual for 17 | % more details 18 | 19 | rbmParams.epochs = 10; 20 | rbmParams.batchsize = 100; 21 | rbmParams.lrW = 0.1; % learningRate for weights 22 | rbmParams.lrVb = 0.1; % learningRate for visible biases 23 | rbmParams.lrHb = 0.1; % learningRate for hidden biases 24 | 25 | rbmParams.lrW_linear = 0.001; % learning for weights when one layer is linear 26 | rbmParams.lrVb_linear = 0.001; % learning for visible biases when one layer is linear 27 | rbmParams.lrHb_linear = 0.001; % learning for hidden biases when one layer is linear 28 | 29 | rbmParams.weightPenaltyL2 = 0.0002;% L2 regularisation 30 | 31 | rbmParams.initMomentum = 0.5; % initial momentum 32 | rbmParams.finalMomentum = 0.9; % final momentum 33 | 34 | rbmParams.momentumEpochThres = 5; %threshold after which the final momentum is used 35 | 36 | rbmParams.type = 1; %1 is what Hinton suggests in "A practical guide to training RBMs", 2 is consistent with theory 37 | %check myRBMtrain 38 | 39 | dbnParams.rbmParams = rbmParams; 40 | 41 | dbnParams.type = type; %1 is AE, 2 is classifier 42 | dbnParams.inputActivationFunction = 'sigm'; 43 | 44 | dbnParams.hiddenActivationFunctions = hiddenActivationFunctions; 45 | dbnParams.hiddenLayers = hiddenLayers; 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /utils/data_structures.py: -------------------------------------------------------------------------------- 1 | class circular_list(object): 
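    """Fixed-capacity FIFO buffer backed by a plain list.

    Usage sketch (consistent with test_circular_list below): pushing past the
    capacity drops the oldest element, so

        buf = circular_list(3)
        for x in (1, 2, 3, 4):
            buf.push(x)
        # buf[0], buf[1], buf[2] are now 2, 3, 4
    """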
2 | def __init__(self, size, init=None): 3 | self._data = [] 4 | self.MAX_SIZE = size 5 | if init is not None: 6 | for i in range(size): 7 | self._data.append(init) 8 | 9 | def push(self, item): 10 | """ 11 | push item to the tail 12 | :param item: item to insert 13 | :return: 14 | """ 15 | if len(self._data) == self.MAX_SIZE: 16 | # full we have to pop the oldest item (head) 17 | self._data.pop(0) 18 | self._data.append(item) 19 | 20 | def pop(self): 21 | """ 22 | pops the first item in the queue 23 | :return: head of queue 24 | """ 25 | if len(self._data) == 0: 26 | return None 27 | else: 28 | return self._data.pop(0) 29 | 30 | def __iter__(self): 31 | self.index = 0 32 | return self 33 | 34 | def next(self): 35 | if self.index == len(self._data): 36 | raise StopIteration 37 | else: 38 | self.index += 1 39 | return self._data[self.index - 1] 40 | 41 | def __getitem__(self, index): 42 | return self._data[index] 43 | 44 | def __setitem__(self, index, value): 45 | self._data[index] = value 46 | 47 | def __len__(self): 48 | return len(self._data) 49 | 50 | 51 | def test_circular_list(): 52 | clist = circular_list(5) 53 | clist.push(1) 54 | clist.push(2) 55 | clist.push(3) 56 | clist.push(4) 57 | clist.push(5) 58 | clist.push(6) 59 | clist.push(7) 60 | 61 | clist[1] = 8 62 | 63 | assert clist[0] == 3 64 | assert clist[1] == 8 65 | assert clist[2] == 5 66 | assert clist[3] == 6 67 | assert clist[4] == 7 68 | assert len(clist) == 5 69 | 70 | clist2 = circular_list(7, 'hello') 71 | for item in clist2: 72 | assert item == 'hello' 73 | 74 | 75 | if __name__ == '__main__': 76 | test_circular_list() 77 | -------------------------------------------------------------------------------- /modelzoo/lstm_classifier_majority_vote.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate 6 | from lasagne.nonlinearities import tanh 7 | from custom.layers import create_blstm, create_lstm 8 | 9 | 10 | def create_model(input_shape, input_var, mask_shape, mask_var, lstm_size=250, output_classes=26, 11 | w_init=las.init.GlorotUniform(), use_peepholes=False, use_blstm=True): 12 | gate_parameters = Gate( 13 | W_in=w_init, W_hid=w_init, 14 | b=las.init.Constant(0.)) 15 | cell_parameters = Gate( 16 | W_in=w_init, W_hid=w_init, 17 | # Setting W_cell to None denotes that no cell connection will be used. 18 | W_cell=None, b=las.init.Constant(0.), 19 | # By convention, the cell nonlinearity is tanh in an LSTM. 20 | nonlinearity=tanh) 21 | 22 | l_in = InputLayer(input_shape, input_var, 'input') 23 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 24 | 25 | symbolic_seqlen = l_in.input_var.shape[1] 26 | if use_blstm: 27 | f_lstm, b_lstm = create_blstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 28 | l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum') 29 | 30 | # reshape to (num_examples * seq_len, lstm_size) 31 | l_reshape = ReshapeLayer(l_sum, (-1, lstm_size), name='reshape') 32 | else: 33 | l_lstm = create_lstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 34 | l_reshape = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape') 35 | 36 | # Now, we can apply feed-forward layers as usual. 37 | # We want the network to predict a classification for the sequence, 38 | # so we'll use a the number of classes. 
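    # Shape sketch (names illustrative): with batch size N, sequence length T
    # and lstm_size H, l_reshape is (N*T, H); the dense softmax below maps it
    # to (N*T, output_classes) and l_out restores (N, T, output_classes),
    # i.e. one class distribution per frame for the majority vote.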
39 | l_softmax = DenseLayer( 40 | l_reshape, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 41 | 42 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 43 | return l_out 44 | -------------------------------------------------------------------------------- /dbn/trainDBN.m: -------------------------------------------------------------------------------- 1 | function [dbn, errorPerBatch errorPerSample] = trainDBN(dataMatrix, dbnParams) 2 | % trainDBN - Trains a DBN 3 | 4 | % INPUTS 5 | % dataMatrix: matrix containing the training examples, size: noExamples x 6 | % Dimensionality 7 | % dbnParams: structure containing the DBN params, see manual for more 8 | % details 9 | 10 | % OUTPUTS 11 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 12 | % the visible biases (visbiases) for each RBM layer 13 | 14 | % errorPerBatch: 1xN cell array where N is the number of hidden layers (= 15 | % the number of RBMs to train and stack). Each cell contains the average 16 | % minibatch error per epoch. If number of epochs is 100 then each cell will 17 | % be 1 x 100 18 | 19 | % errorPerSample: same as above but contains the average error per training 20 | % sample 21 | 22 | activationFunctionsAllLayers = [dbnParams.inputActivationFunction, dbnParams.hiddenActivationFunctions]; 23 | 24 | hiddenLayers = dbnParams.hiddenLayers; 25 | nHidLayers = length(hiddenLayers); 26 | 27 | for i = 1:nHidLayers 28 | 29 | noHidNeurons = hiddenLayers(i); 30 | [numExamples, numDims] = size(dataMatrix); 31 | 32 | fprintf(1,'Pretraining Layer %d with RBM: %d-%d \n',i, numDims,noHidNeurons); 33 | 34 | hLayer = activationFunctionsAllLayers(i + 1); % activation function of hidden layer 35 | vLayer = activationFunctionsAllLayers(i); % activation function of visible layer 36 | 37 | trFctnLayers = [vLayer hLayer]; 38 | 39 | % train RBM 40 | [rbm, errorPerBatch{i}, errorPerSample{i}] = trainRBM(dataMatrix, dbnParams, noHidNeurons, trFctnLayers); 41 | 42 | % save RBM weights to corresponding DBN layer 43 | dbn.W{i} = rbm.W; 44 | dbn.hidbiases{i} = rbm.hidbiases; 45 | dbn.visbiases{i} = rbm.visbiases; 46 | 47 | % compute RBMs hidden activations 48 | [posHidProbs, posHidStates] = RBMup(dataMatrix, rbm.W, rbm.hidbiases, hLayer); 49 | 50 | % and use them as new inputs for the following RBM 51 | dataMatrix = posHidProbs; 52 | 53 | end 54 | 55 | disp('DBN training done') -------------------------------------------------------------------------------- /dbn/exampleDBN_AE.m: -------------------------------------------------------------------------------- 1 | 2 | type = 1; % 1 is AE, 2 is classifier, 3 | 4 | 5 | 6 | % train_x = double(train_x(1:50000,:)); 7 | % train_y = double(train_y(1:50000,:)); 8 | 9 | train_x = dataMatrix; 10 | %train_x = trData; %vertcat(trData, valData, testData); 11 | % train_x = cat(1, testDataResized, trainDataResized); 12 | 13 | 14 | inputSize = size(train_x,2); 15 | 16 | if type == 1 % AE 17 | outputSize = inputSize; % in case of AE it should be equal to the number of inputs 18 | 19 | %if type = 1, i.e., AE then the last layer should be linear and usually a 20 | % series of decreasing layers are used 21 | hiddenActivationFunctions = {'ReLu','ReLu','ReLu','linear'};%{'sigm','sigm','sigm','linear'}; 22 | hiddenLayers = [200 100 50 20]; 23 | 24 | elseif type == 2 % classifier 25 | outputSize = size(train_y,2); % in case of classification it should be equal to the number of classes 26 | 27 | hiddenActivationFunctions = 
{'sigm','sigm','sigm'};%{'ReLu','ReLu','ReLu','ReLu'};% 28 | hiddenLayers = [500 500 1000 ]; % hidden layers sizes, does not include input or output layers 29 | 30 | end 31 | 32 | % parameters used for visualisation of first layer weights 33 | visParams.noExamplesPerSubplot = 50; % number of images to show per row 34 | visParams.noSubplots = floor(hiddenLayers(1) / visParams.noExamplesPerSubplot); 35 | visParams.col = 45; %44;% number columns of image 36 | visParams.row = 30; %26 number rows of image 37 | 38 | 39 | 40 | dbnParams = dbnParamsInit(type, hiddenActivationFunctions, hiddenLayers); 41 | dbnParams.inputActivationFunction = 'linear'; %sigm for binary inputs, linear for continuous input 42 | dbnParams.rbmParams.epochs = 20; 43 | 44 | % normalise data 45 | train_x = normaliseData(dbnParams.inputActivationFunction, train_x,[]); 46 | 47 | % train Deep Belief Network 48 | [dbn, errorPerBatch, errorPerSample] = trainDBN(train_x, dbnParams); 49 | 50 | % visualise weights of first layer 51 | % visualiseHiddenLayerWeights(dbn.W{1},visParams.col,visParams.row,visParams.noSubplots); 52 | 53 | nn = unfoldDBNtoNN(dbnParams, dbn, outputSize); 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /modelzoo/deltanet_v1.py: -------------------------------------------------------------------------------- 1 | import lasagne as las 2 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ReshapeLayer, ElemwiseSumLayer 3 | from lasagne.layers import Gate 4 | from lasagne.nonlinearities import tanh 5 | from custom.layers import create_blstm, DeltaLayer, create_lstm 6 | 7 | 8 | def create_model(input_shape, input_var, mask_shape, mask_var, window, lstm_size=250, output_classes=26, 9 | w_init=las.init.GlorotUniform(), use_peepholes=False, use_blstm=True): 10 | gate_parameters = Gate( 11 | W_in=w_init, W_hid=w_init, 12 | b=las.init.Constant(0.)) 13 | cell_parameters = Gate( 14 | W_in=w_init, W_hid=w_init, 15 | # Setting W_cell to None denotes that no cell connection will be used. 16 | W_cell=None, b=las.init.Constant(0.), 17 | # By convention, the cell nonlinearity is tanh in an LSTM. 18 | nonlinearity=tanh) 19 | 20 | l_in = InputLayer(input_shape, input_var, 'input') 21 | l_mask = InputLayer(mask_shape, mask_var, name='mask') 22 | 23 | symbolic_seqlen = l_in.input_var.shape[1] 24 | l_delta = DeltaLayer(l_in, window, name='delta') 25 | 26 | if use_blstm: 27 | f_lstm, b_lstm = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 28 | l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum') 29 | # reshape to (num_examples * seq_len, lstm_size) 30 | l_reshape = ReshapeLayer(l_sum, (-1, lstm_size), name='reshape') 31 | else: 32 | l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 33 | l_reshape = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape') 34 | 35 | # Now, we can apply feed-forward layers as usual. 36 | # We want the network to predict a classification for the sequence, 37 | # so we'll use a the number of classes. 
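    # Usage sketch (values illustrative): for 30x40 mouth ROIs flattened to
    # 1200-d frame vectors,
    #   net = create_model((None, None, 1200), input_var, (None, None),
    #                      mask_var, window, lstm_size=250, output_classes=26)
    # where `window` is a Theano iscalar giving the delta-window size; the
    # output holds per-frame class scores of shape (batch, seq_len, output_classes).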
38 | l_softmax = DenseLayer( 39 | l_reshape, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 40 | 41 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 42 | return l_out 43 | -------------------------------------------------------------------------------- /dbn/unfoldDBNtoAE.m: -------------------------------------------------------------------------------- 1 | function [weightsAE, biasesAE, newActivationFunctions, newLayers] = unfoldDBNtoAE(dbnParams, dbn, outputSize) 2 | % unfoldDBNtoAE - Unfolds DBN to an autoencoder NN 3 | 4 | % INPUTS 5 | % dbnParams: structure containing the DBN params, see manual for more 6 | % details 7 | 8 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 9 | % the visible biases (visbiases) for each RBM layer 10 | 11 | % outputSize: size of output layer 12 | 13 | % OUTPUTS 14 | % weightsAE: 1xN cell array, where N is the number of layers (hidden + output 15 | % layer), each cell contains the weights of the corresponding layer 16 | 17 | % biasesAE: 1xN cell array, where N is the number of layers (hidden + output 18 | % layer), each cell contains the biases of the corresponding layer 19 | 20 | % newActivationFunctions: 1xN cell array, where N is the number of layers (hidden + output 21 | % layer), each cell contains the activation function of the corresponding layer 22 | 23 | % newLayers: 1xN vector, where N is the number of layers (hidden + output 24 | % layer), each entry contains the size of the corresponding layer 25 | 26 | noLayers = length(dbnParams.hiddenLayers); 27 | 28 | % create encoding layers 29 | weightsAE = dbn.W; 30 | biasesAE = dbn.hidbiases; 31 | inputSize = size(dbn.W{1},1); 32 | 33 | if inputSize ~= outputSize 34 | error('Input size is different that output size. 
In an AE they should have the same size') 35 | end 36 | 37 | ind = 1; 38 | % create decoding layers, where weights/biases are mirrored from the 39 | % encoding layer 40 | for i = noLayers + 1:2*noLayers 41 | 42 | index = i - ind; 43 | weightsAE{i} = dbn.W{index}'; 44 | biasesAE{i} = dbn.visbiases{index}; 45 | 46 | ind = ind + 2; 47 | 48 | end 49 | 50 | % create new activation functions (activFcn from encoding layer + same 51 | % activFcn flipped for decoding layer + outputActivFcn same as inputActivFcn 52 | newActivationFunctions = [dbnParams.hiddenActivationFunctions fliplr(dbnParams.hiddenActivationFunctions(1:end-1)) dbnParams.inputActivationFunction]; 53 | % same as above for hidden layers 54 | newLayers = [dbnParams.hiddenLayers fliplr(dbnParams.hiddenLayers(1:end-1)) outputSize]; 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /modelzoo/autoencoder.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | import scipy.io as sio 9 | 10 | 11 | def load_dbn(path='models/avletters_ae.mat'): 12 | """ 13 | load a pretrained dbn from path 14 | :param path: path to the .mat dbn 15 | :return: pretrained deep belief network 16 | """ 17 | # create the network using weights from pretrain_nn.mat 18 | nn = sio.loadmat(path) 19 | w = [] 20 | b = [] 21 | w.append(nn['w1']) 22 | w.append(nn['w2']) 23 | w.append(nn['w3']) 24 | w.append(nn['w4']) 25 | w.append(nn['w5']) 26 | w.append(nn['w6']) 27 | w.append(nn['w7']) 28 | w.append(nn['w8']) 29 | b.append(nn['b1'][0]) 30 | b.append(nn['b2'][0]) 31 | b.append(nn['b3'][0]) 32 | b.append(nn['b4'][0]) 33 | b.append(nn['b5'][0]) 34 | b.append(nn['b6'][0]) 35 | b.append(nn['b7'][0]) 36 | b.append(nn['b8'][0]) 37 | return w, b 38 | 39 | 40 | def create_model(incoming, weights, biases, activations, layersizes): 41 | """ 42 | Create an autoencoder given pretrained weights and activations 43 | :param: incoming: incoming layer (input layer) 44 | :param weights: layer weights 45 | :param biases: layer biases 46 | :param activations: activation functions for each layer 47 | :param layersizes: num hidden units for each layer 48 | :return: autoencoder model 49 | """ 50 | for i, w in enumerate(weights): 51 | incoming = DenseLayer(incoming, layersizes[i], w, biases[i], activations[i], name='fc{}'.format(i + 1)) 52 | return incoming 53 | 54 | 55 | def create_pretrained_encoder(incoming, weights, biases, activations, layersizes): 56 | l_1 = DenseLayer(incoming, layersizes[0], W=weights[0], b=biases[0], nonlinearity=activations[0], name='fc1') 57 | l_2 = DenseLayer(l_1, layersizes[1], W=weights[1], b=biases[1], nonlinearity=activations[1], name='fc2') 58 | l_3 = DenseLayer(l_2, layersizes[2], W=weights[2], b=biases[2], nonlinearity=activations[2], name='fc3') 59 | l_4 = DenseLayer(l_3, layersizes[3], W=weights[3], b=biases[3], nonlinearity=activations[3], name='bottleneck') 60 | return l_4 -------------------------------------------------------------------------------- /dbn/zigzag.m: -------------------------------------------------------------------------------- 1 | function out=zigzag(in) 2 | % Zig-zag scanning 3 | % This function is used to rearrange a matrix of any size into a 1-D array 4 | % by 
implementing the ZIG-ZAG SCANNING procedure. 5 | % IN specifies the input matrix of any size 6 | % OUT is the resulting zig-zag scanned (1-D) vector 7 | % having length equal to the total number of elements in the 2-D input matrix 8 | % 9 | % As an example, 10 | % IN = [1 2 6 7 11 | % 3 5 8 11 12 | % 4 9 10 12]; 13 | % OUT = ZIGZAG(IN) 14 | % OUT= 15 | % 1 2 3 4 5 6 7 8 9 10 11 12 16 | 17 | % 18 | % 19 | % Oluwadamilola (Damie) Martins Ogunbiyi 20 | % University of Maryland, College Park 21 | % Department of Electrical and Computer Engineering 22 | % Communications and Signal Processing 23 | % 22-March-2010 24 | % Copyright 2009-2010 Black Ace of Diamonds. 25 | 26 | [num_rows num_cols]=size(in); 27 | 28 | % Initialise the output vector 29 | out=zeros(1,num_rows*num_cols); 30 | 31 | cur_row=1; cur_col=1; cur_index=1; 32 | 33 | % First element 34 | %out(1)=in(1,1); 35 | 36 | while cur_row<=num_rows & cur_col<=num_cols 37 | if cur_row==1 & mod(cur_row+cur_col,2)==0 & cur_col~=num_cols 38 | out(cur_index)=in(cur_row,cur_col); 39 | cur_col=cur_col+1; %move right at the top 40 | cur_index=cur_index+1; 41 | 42 | elseif cur_row==num_rows & mod(cur_row+cur_col,2)~=0 & cur_col~=num_cols 43 | out(cur_index)=in(cur_row,cur_col); 44 | cur_col=cur_col+1; %move right at the bottom 45 | cur_index=cur_index+1; 46 | 47 | elseif cur_col==1 & mod(cur_row+cur_col,2)~=0 & cur_row~=num_rows 48 | out(cur_index)=in(cur_row,cur_col); 49 | cur_row=cur_row+1; %move down at the left 50 | cur_index=cur_index+1; 51 | 52 | elseif cur_col==num_cols & mod(cur_row+cur_col,2)==0 & cur_row~=num_rows 53 | out(cur_index)=in(cur_row,cur_col); 54 | cur_row=cur_row+1; %move down at the right 55 | cur_index=cur_index+1; 56 | 57 | elseif cur_col~=1 & cur_row~=num_rows & mod(cur_row+cur_col,2)~=0 58 | out(cur_index)=in(cur_row,cur_col); 59 | cur_row=cur_row+1; cur_col=cur_col-1; %move diagonally left down 60 | cur_index=cur_index+1; 61 | 62 | elseif cur_row~=1 & cur_col~=num_cols & mod(cur_row+cur_col,2)==0 63 | out(cur_index)=in(cur_row,cur_col); 64 | cur_row=cur_row-1; cur_col=cur_col+1; %move diagonally right up 65 | cur_index=cur_index+1; 66 | 67 | elseif cur_row==num_rows & cur_col==num_cols %obtain the bottom right element 68 | out(end)=in(end); %end of the operation 69 | break %terminate the operation 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /oulu/preprocess_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | preprocess the images 3 | """ 4 | import sys 5 | sys.path.append('../') 6 | import argparse 7 | from utils.io import load_mat_file, save_mat 8 | from utils.preprocessing import normalize_input 9 | from utils.preprocessing import sequencewise_mean_image_subtraction, compute_diff_images 10 | from utils.plotting_utils import reshape_images_order 11 | 12 | 13 | def reorder_images(data, shape): 14 | data = reshape_images_order(data, shape) 15 | return data 16 | 17 | 18 | def samplewise_normalize(data): 19 | data = normalize_input(data) 20 | return data 21 | 22 | 23 | def remove_mean(data, vidlens): 24 | data = sequencewise_mean_image_subtraction(data, vidlens) 25 | return data 26 | 27 | 28 | def diff_image(data, vidlens): 29 | data = compute_diff_images(data, vidlens) 30 | return data 31 | 32 | 33 | def parse_options(): 34 | options = dict() 35 | options['remove_mean'] = False 36 | options['diff_image'] = False 37 | options['samplewise_norm'] = False 38 | options['no_reorder'] = False 39 | options['output'] = None 
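    # Example invocation (file names illustrative):
    #   python preprocess_images.py --samplewise_norm --remove_mean \
    #       --output data/prepped.mat data/allData_mouthROIs.mat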
40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--remove_mean', action='store_true', help='remove mean image') 42 | parser.add_argument('--diff_image', action='store_true', help='compute difference of image') 43 | parser.add_argument('--samplewise_norm', action='store_true', help='samplewise normalize') 44 | parser.add_argument('--no_reorder', action='store_true', help='disable data reordering from f to c') 45 | parser.add_argument('--output', help='write output to .mat file') 46 | parser.add_argument('input', nargs='+', help='input ouluvs2 .mat file to preprocess') 47 | args = parser.parse_args() 48 | if args.remove_mean: 49 | options['remove_mean'] = args.remove_mean 50 | if args.diff_image: 51 | options['diff_image'] = args.diff_image 52 | if args.samplewise_norm: 53 | options['samplewise_norm'] = args.samplewise_norm 54 | if args.no_reorder: 55 | options['no_reorder'] = args.no_reorder 56 | if args.output: 57 | options['output'] = args.output 58 | if args.input: 59 | options['input'] = args.input[0] 60 | return options 61 | 62 | 63 | def main(): 64 | options = parse_options() 65 | data = load_mat_file(options['input']) 66 | dataMatrix = data['dataMatrix'].astype('float32') 67 | vidlens = data['videoLengthVec'].reshape((-1,)) 68 | 69 | if not options['no_reorder']: 70 | dataMatrix = reorder_images(dataMatrix, (26, 44)) 71 | if options['samplewise_norm']: 72 | dataMatrix = samplewise_normalize(dataMatrix) 73 | if options['remove_mean']: 74 | dataMatrix = remove_mean(dataMatrix, vidlens) 75 | if options['diff_image']: 76 | dataMatrix = diff_image(dataMatrix, vidlens) 77 | 78 | data['dataMatrix'] = dataMatrix 79 | if options['output']: 80 | save_mat(data, options['output']) 81 | print('data prepared!') 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /avletters/preprocess_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | realign images to c format from f format 3 | """ 4 | import sys 5 | sys.path.append('../') 6 | import argparse 7 | from utils.io import load_mat_file, save_mat 8 | from utils.preprocessing import resize_images, normalize_input, sequencewise_mean_image_subtraction, reorder_data 9 | from utils.preprocessing import compute_dct_features, concat_first_second_deltas 10 | from utils.preprocessing import compute_diff_images, apply_zca_whitening 11 | from utils.plotting_utils import visualize_images 12 | 13 | 14 | def resize(data): 15 | X = data['dataMatrix'] 16 | vidlens = data['videoLengthVec'].reshape((-1,)) 17 | X = resize_images(X) 18 | # X = apply_zca_whitening(X) 19 | visualize_images(X[800:864]) 20 | dct_feats = compute_dct_features(X, (30, 40), 30, method='zigzag') 21 | dct_feats = concat_first_second_deltas(dct_feats, vidlens) 22 | X = normalize_input(X) 23 | data['dataMatrix'] = X 24 | save_mat(data, 'data/resized.mat') 25 | d = dict() 26 | d['dctFeatures'] = dct_feats 27 | save_mat(d, 'data/dctFeat_AVLetters.mat') 28 | 29 | 30 | def remove_mean(data): 31 | X = data['dataMatrix'].astype('float32') 32 | vidlens = data['videoLengthVec'].reshape((-1,)) 33 | X = resize_images(X) 34 | X = sequencewise_mean_image_subtraction(X, vidlens) 35 | # X = apply_zca_whitening(X) 36 | X_fortran = reorder_data(X, (30, 40), 'c', 'f') 37 | dct_feats = compute_dct_features(X, (30, 40), 30, method='zigzag') 38 | dct_feats = concat_first_second_deltas(dct_feats, vidlens) 39 | d = dict() 40 | d['dctFeatures'] = dct_feats 41 | 
save_mat(d, 'data/dctFeat_mean_removed_AVLetters.mat') 42 | visualize_images(X[800:864]) 43 | # samplewise normalize 44 | X = normalize_input(X, centralize=True) 45 | data['dataMatrix'] = X 46 | data['dataMatrixF'] = X_fortran 47 | save_mat(data, 'data/resized_mean_removed.mat') 48 | 49 | 50 | def diff_image(data): 51 | X = data['dataMatrix'].astype('float32') 52 | vidlens = data['videoLengthVec'].reshape((-1,)) 53 | X = resize_images(X) 54 | X = apply_zca_whitening(X) 55 | # X = normalize_input(X) 56 | visualize_images(X[2000:2081]) 57 | X = compute_diff_images(X, vidlens) 58 | X = apply_zca_whitening(X) 59 | X = normalize_input(X) 60 | visualize_images(X[2000:2081]) 61 | data['dataMatrix'] = X 62 | save_mat(data, 'data/resized_diff_image_AVLetters.mat') 63 | 64 | 65 | def parse_options(): 66 | options = dict() 67 | options['operation'] = None 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument('--operation', help='remove_mean, diff_image, resize') 70 | args = parser.parse_args() 71 | if args.operation: 72 | options['operation'] = args.operation 73 | return options 74 | 75 | 76 | def main(): 77 | options = parse_options() 78 | data = load_mat_file('data/allData_mouthROIs.mat') 79 | if options['operation'] == 'remove_mean': 80 | remove_mean(data) 81 | elif options['operation'] == 'diff_image': 82 | diff_image(data) 83 | elif options['operation'] == 'resize': 84 | resize(data) 85 | else: 86 | print('unknown operation') 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /test/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from utils.io import * 3 | from utils.preprocessing import * 4 | 5 | 6 | class TestPreprocessingMethods(unittest.TestCase): 7 | def test_forcealign(self): 8 | stream1 = load_mat_file('../oulu/data/allMouthROIsResized_frontal.mat') 9 | stream2 = load_mat_file('../oulu/data/mfcc_w3s3.mat') 10 | 11 | s1_data_matrix = stream1['dataMatrix'].astype('float32') 12 | s1_targets = stream1['targetsVec'].reshape((-1,)) 13 | s1_vidlens = stream1['videoLengthVec'].reshape((-1,)) 14 | s1_subjects = stream1['subjectsVec'].reshape((-1,)) 15 | 16 | s2_data_matrix = stream2['dataMatrix'].astype('float32') 17 | s2_targets = stream2['targetsVec'].reshape((-1,)) 18 | s2_vidlens = stream2['videoLengthVec'].reshape((-1,)) 19 | s2_subjects = stream2['subjectsVec'].reshape((-1,)) 20 | 21 | s1, s2 = force_align((s1_data_matrix, s1_targets, s1_vidlens), 22 | (s2_data_matrix, s2_targets, s2_vidlens)) 23 | 24 | s1_data_matrix, s1_targets, s1_vidlens = s1 25 | s2_data_matrix, s2_targets, s2_vidlens = s2 26 | 27 | assert len(s1_data_matrix) == len(s2_data_matrix) 28 | assert len(s1_targets) == len(s2_targets) 29 | assert np.sum(s1_vidlens) == np.sum(s2_vidlens) 30 | 31 | def test_multistream_forcealign(self): 32 | 33 | stream1 = load_mat_file('../oulu/data/allMouthROIsResized_frontal.mat') 34 | stream2 = load_mat_file('../oulu/data/allMouthROIsResized_frontal.mat') 35 | stream3 = load_mat_file('../oulu/data/dctFeats_meanrm_w2s1.mat') 36 | stream4 = load_mat_file('../oulu/data/mfcc_w3s3.mat') 37 | 38 | s1_data_matrix = stream1['dataMatrix'].astype('float32') 39 | s1_targets = stream1['targetsVec'].reshape((-1,)) 40 | s1_vidlens = stream1['videoLengthVec'].reshape((-1,)) 41 | s1_subjects = stream1['subjectsVec'].reshape((-1,)) 42 | 43 | s2_data_matrix = stream2['dataMatrix'].astype('float32') 44 | s2_targets = 
stream2['targetsVec'].reshape((-1,)) 45 | s2_vidlens = stream2['videoLengthVec'].reshape((-1,)) 46 | s2_subjects = stream2['subjectsVec'].reshape((-1,)) 47 | 48 | s3_data_matrix = stream3['dataMatrix'].astype('float32') 49 | s3_targets = stream3['targetsVec'].reshape((-1,)) 50 | s3_vidlens = stream3['videoLengthVec'].reshape((-1,)) 51 | s3_subjects = stream3['subjectsVec'].reshape((-1,)) 52 | 53 | s4_data_matrix = stream4['dataMatrix'].astype('float32') 54 | s4_targets = stream4['targetsVec'].reshape((-1,)) 55 | s4_vidlens = stream4['videoLengthVec'].reshape((-1,)) 56 | s4_subjects = stream4['subjectsVec'].reshape((-1,)) 57 | 58 | orig_streams = [ 59 | (s1_data_matrix, s1_targets, s1_vidlens), 60 | (s2_data_matrix, s2_targets, s2_vidlens), 61 | (s3_data_matrix, s3_targets, s3_vidlens), 62 | (s4_data_matrix, s4_targets, s4_vidlens) 63 | ] 64 | 65 | a = multistream_force_align(orig_streams) 66 | assert len(a[0][0]) == len(a[1][0]) == len(a[2][0]) == len(a[3][0]) 67 | assert len(a[0][1]) == len(a[1][1]) == len(a[2][1]) == len(a[3][1]) 68 | assert len(a[0][2]) == len(a[1][2]) == len(a[2][2]) == len(a[3][2]) 69 | 70 | if __name__ == '__main__': 71 | unittest.main() 72 | -------------------------------------------------------------------------------- /modelzoo/lstm_classifier_baseline.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | 9 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 10 | if cell_parameters is None: 11 | cell_parameters = Gate() 12 | if gate_parameters is None: 13 | gate_parameters = Gate() 14 | 15 | l_lstm = LSTMLayer( 16 | l_incoming, hidden_units, 17 | # We need to specify a separate input for masks 18 | mask_input=l_mask, 19 | # Here, we supply the gate parameters for each gate 20 | ingate=gate_parameters, forgetgate=gate_parameters, 21 | cell=cell_parameters, outgate=gate_parameters, 22 | # We'll learn the initialization and use gradient clipping 23 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 24 | 25 | return l_lstm 26 | 27 | 28 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 29 | 30 | if cell_parameters is None: 31 | cell_parameters = Gate() 32 | if gate_parameters is None: 33 | gate_parameters = Gate() 34 | 35 | l_lstm = LSTMLayer( 36 | l_incoming, hidden_units, 37 | # We need to specify a separate input for masks 38 | mask_input=l_mask, 39 | # Here, we supply the gate parameters for each gate 40 | ingate=gate_parameters, forgetgate=gate_parameters, 41 | cell=cell_parameters, outgate=gate_parameters, 42 | # We'll learn the initialization and use gradient clipping 43 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 44 | 45 | # The "backwards" layer is the same as the first, 46 | # except that the backwards argument is set to True. 
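    # The backward layer receives the same mask, so padded frames are ignored
    # in both directions; downstream (see create_model below) the two
    # directions are fused by element-wise summation rather than
    # concatenation, keeping the fused feature size at hidden_units.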
47 |     l_lstm_back = LSTMLayer(
48 |         l_incoming, hidden_units, ingate=gate_parameters,
49 |         mask_input=l_mask, forgetgate=gate_parameters,
50 |         cell=cell_parameters, outgate=gate_parameters,
51 |         learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name))
52 | 
53 |     return l_lstm, l_lstm_back
54 | 
55 | 
56 | def create_model(input_shape, input_var, mask_shape, mask_var, lstm_size=250, output_classes=26,
57 |                  w_init=las.init.Orthogonal()):
58 |     gate_parameters = Gate(
59 |         W_in=w_init, W_hid=w_init,
60 |         b=las.init.Constant(0.))
61 |     cell_parameters = Gate(
62 |         W_in=w_init, W_hid=w_init,
63 |         # Setting W_cell to None denotes that no cell connection will be used.
64 |         W_cell=None, b=las.init.Constant(0.),
65 |         # By convention, the cell nonlinearity is tanh in an LSTM.
66 |         nonlinearity=tanh)
67 | 
68 |     l_in = InputLayer(input_shape, input_var, 'input')
69 |     l_mask = InputLayer(mask_shape, mask_var, 'mask')
70 | 
71 |     f_lstm, b_lstm = create_blstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm')
72 | 
73 |     l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum')
74 |     l_forward_slice1 = SliceLayer(l_sum, -1, 1, name='slice1')
75 | 
76 |     # Now, we can apply feed-forward layers as usual.
77 |     # We want the network to predict a classification for the sequence,
78 |     # so we'll use the number of classes as the output size.
79 |     l_out = DenseLayer(
80 |         l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output')
81 | 
82 |     return l_out
83 | 
--------------------------------------------------------------------------------
/runners/extract_encoder_from_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../')
4 | import numpy as np
5 | import theano.tensor as T
6 | import argparse
7 | from modelzoo import deltanet_majority_vote
8 | from utils.io import save_mat
9 | from custom.nonlinearities import select_nonlinearity
10 | 
11 | 
12 | def parse_options():
13 |     options = dict()
14 |     options['config'] = '../cuave/config/1stream.ini'
15 |     options['shape'] = '2000,1000,500,50'
16 |     options['nonlinearities'] = 'rectify,rectify,rectify,linear'
17 |     options['input_dim'] = 1200
18 |     options['lstm_size'] = 250
19 |     options['output_classes'] = 26
20 |     options['use_blstm'] = False
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument('--shape', help='shape of encoder. Default: 2000,1000,500,50')
23 |     parser.add_argument('--input_dim', help='input dimension. Default: 1200')
24 |     parser.add_argument('--nonlinearities', help='nonlinearities used by encoder. '
25 |                         'Default: rectify,rectify,rectify,linear')
26 |     parser.add_argument('--output', help='output file to write results')
27 |     parser.add_argument('--lstm_size', help='lstm layer size. Default: 250')
28 |     parser.add_argument('--output_classes', help='number of output classes. Default: 26')
29 |     parser.add_argument('--use_blstm', help='use blstm')
30 |     parser.add_argument('input', help='input model.pkl file')
31 | 
32 |     args = parser.parse_args()
33 |     options['input'] = args.input
34 |     if args.shape:
35 |         options['shape'] = args.shape
36 |     if args.input_dim:
37 |         options['input_dim'] = int(args.input_dim)
38 |     if args.nonlinearities:
39 |         options['nonlinearities'] = args.nonlinearities
40 |     if args.lstm_size:
41 |         options['lstm_size'] = int(args.lstm_size)
42 |     if args.output_classes:
43 |         options['output_classes'] = int(args.output_classes)
44 |     if args.output:
45 |         options['output'] = args.output
46 |     if args.use_blstm:
47 |         options['use_blstm'] = True
48 |     return options
49 | 
50 | 
51 | def main():
52 |     options = parse_options()
53 |     print(options)
54 |     window = T.iscalar('theta')
55 |     inputs1 = T.tensor3('inputs1', dtype='float32')
56 |     mask = T.matrix('mask', dtype='uint8')
57 |     shape = [int(i) for i in options['shape'].split(',')]
58 |     nonlinearities = [select_nonlinearity(s) for s in options['nonlinearities'].split(',')]
59 |     network = deltanet_majority_vote.load_saved_model(options['input'],
60 |                                                       (shape, nonlinearities),
61 |                                                       (None, None, options['input_dim']), inputs1, (None, None), mask,
62 |                                                       options['lstm_size'], window, options['output_classes'],
63 |                                                       use_blstm=options['use_blstm'])
64 |     d = deltanet_majority_vote.extract_encoder_weights(network, ['fc1', 'fc2', 'fc3', 'bottleneck'],
65 |                                                        [('w1', 'b1'), ('w2', 'b2'), ('w3', 'b3'), ('w4', 'b4')])
66 |     expected_keys = ['w1', 'w2', 'w3', 'w4', 'b1', 'b2', 'b3', 'b4']
67 |     keys = d.keys()
68 |     for k in keys:
69 |         assert k in expected_keys
70 |         assert type(d[k]) == np.ndarray
71 |     if 'output' in options:
72 |         print('save extracted weights to {}'.format(options['output']))
73 |         save_mat(d, options['output'])
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     main()
78 | 
--------------------------------------------------------------------------------
/avletters2/prepare_data.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../')
4 | import argparse
5 | from utils.preprocessing import *
6 | from utils.io import *
7 | from utils.plotting_utils import *
8 | 
9 | 
10 | def parse_options():
11 |     options = dict()
12 |     options['remove_mean'] = False
13 |     options['diff_image'] = False
14 |     options['samplewise_norm'] = False
15 |     options['merge_samples'] = False
16 |     options['output'] = None
17 |     options['mergesize'] = 3
18 |     parser = argparse.ArgumentParser()
19 |     parser.add_argument('--remove_mean', action='store_true', help='remove mean image')
20 |     parser.add_argument('--diff_image', action='store_true', help='compute difference of image')
21 |     parser.add_argument('--samplewise_norm', action='store_true', help='samplewise normalize')
22 |     parser.add_argument('--reorder_data', help='reorder data from F to C convention. eg: 30,50')
23 |     parser.add_argument('--concat_deltas', help='concat 1st and 2nd deltas, default delta window: 2')
24 |     parser.add_argument('--embed_temporal_info', help='embed temporal info to features [window],[step]. 
ie: 3,1') 25 | parser.add_argument('--output', help='write output to .mat file') 26 | parser.add_argument('input', nargs='+', help='input data .mat file to preprocess') 27 | args = parser.parse_args() 28 | if args.remove_mean: 29 | options['remove_mean'] = args.remove_mean 30 | if args.diff_image: 31 | options['diff_image'] = args.diff_image 32 | if args.samplewise_norm: 33 | options['samplewise_norm'] = args.samplewise_norm 34 | if args.embed_temporal_info: 35 | options['embed_temporal_info'] = args.embed_temporal_info 36 | if args.reorder_data: 37 | options['reorder_data'] = args.reorder_data 38 | if args.output: 39 | options['output'] = args.output 40 | if args.input: 41 | options['input'] = args.input[0] 42 | if args.concat_deltas: 43 | options['concat_deltas'] = int(args.concat_deltas) 44 | return options 45 | 46 | 47 | def main(): 48 | options = parse_options() 49 | data = load_mat_file(options['input']) 50 | data_matrix = data['dataMatrix'].astype('float32') 51 | vid_len_vec = data['videoLengthVec'].astype('int').reshape((-1,)) 52 | targets_vec = data['targetsVec'].reshape((-1,)) 53 | 54 | if 'reorder_data' in options: 55 | imagesize = tuple([int(d) for d in options['reorder_data'].split(',')]) 56 | data_matrix = reorder_data(data_matrix, imagesize) 57 | if options['samplewise_norm']: 58 | data_matrix = normalize_input(data_matrix) 59 | if options['remove_mean']: 60 | data_matrix = sequencewise_mean_image_subtraction(data_matrix, vid_len_vec) 61 | if options['diff_image']: 62 | data_matrix = compute_diff_images(data_matrix, vid_len_vec) 63 | if 'embed_temporal_info' in options: 64 | window, step = tuple([int(d) for d in options['embed_temporal_info'].split(',')]) 65 | data_matrix, targets_vec, vid_len_vec = factorize(data_matrix, targets_vec, vid_len_vec, step, 0) 66 | data_matrix, targets_vec, vid_len_vec = embed_temporal_info(data_matrix, targets_vec, vid_len_vec, window, step) 67 | if 'concat_deltas' in options: 68 | data_matrix = concat_first_second_deltas(data_matrix, vid_len_vec, options['concat_deltas']) 69 | 70 | data['dataMatrix'] = data_matrix 71 | 72 | if 'embed_temporal_info' in options: 73 | data['videoLengthVec'] = vid_len_vec 74 | data['targetsVec'] = targets_vec 75 | 76 | if 'output' in options: 77 | save_mat(data, options['output']) 78 | # print(data.keys()) 79 | print('data prepared!') 80 | 81 | 82 | if __name__ == '__main__': 83 | main() -------------------------------------------------------------------------------- /modelzoo/deltanet.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, DenseLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate 6 | from lasagne.nonlinearities import tanh, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, create_blstm 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_model_using_pretrained_encoder(weights, biases, input_shape, input_var, mask_shape, mask_var, 13 | lstm_size=250, win=T.iscalar('theta'), output_classes=26, 14 | w_init_fn=las.init.Orthogonal(), 15 | use_peepholes=False, nonlinearities=rectify): 16 | gate_parameters = Gate( 17 | W_in=w_init_fn, W_hid=w_init_fn, 18 | b=las.init.Constant(0.)) 19 | cell_parameters = Gate( 20 | W_in=w_init_fn, W_hid=w_init_fn, 21 | # Setting W_cell to None denotes that no cell connection will be used. 
22 |         W_cell=None, b=las.init.Constant(0.),
23 |         # By convention, the cell nonlinearity is tanh in an LSTM.
24 |         nonlinearity=tanh)
25 | 
26 |     l_in = InputLayer(input_shape, input_var, 'input')
27 |     l_mask = InputLayer(mask_shape, mask_var, 'mask')
28 | 
29 |     symbolic_batchsize = l_in.input_var.shape[0]
30 |     symbolic_seqlen = l_in.input_var.shape[1]
31 | 
32 |     l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
33 |     l_encoder = create_pretrained_encoder(l_reshape1, weights, biases,
34 |                                           [2000, 1000, 500, 50],
35 |                                           [nonlinearities, nonlinearities, nonlinearities, linear],
36 |                                           ['fc1', 'fc2', 'fc3', 'bottleneck'])
37 |     encoder_len = las.layers.get_output_shape(l_encoder)[-1]
38 |     l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
39 |     l_delta = DeltaLayer(l_reshape2, win, name='delta')
40 | 
41 |     l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'bstm1',
42 |                                        use_peepholes)
43 | 
44 |     # We'll combine the forward and backward layer output by summing.
45 |     # Merge layers take in lists of layers to merge as input.
46 |     l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')
47 | 
48 |     l_forward_slice1 = SliceLayer(l_sum1, -1, 1, name='slice1')
49 | 
50 |     # Now, we can apply feed-forward layers as usual.
51 |     # We want the network to predict a classification for the sequence,
52 |     # so we'll use the number of classes as the output size.
53 |     l_out = DenseLayer(
54 |         l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output')
55 | 
56 |     return l_out
57 | 
58 | 
59 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
60 |                  lstm_size=250, win=T.iscalar('theta'),
61 |                  output_classes=26):
62 | 
63 |     dbn_layers = dbn.get_all_layers()
64 |     weights = []
65 |     biases = []
66 |     weights.append(dbn_layers[1].W.astype('float32'))
67 |     weights.append(dbn_layers[2].W.astype('float32'))
68 |     weights.append(dbn_layers[3].W.astype('float32'))
69 |     weights.append(dbn_layers[4].W.astype('float32'))
70 |     biases.append(dbn_layers[1].b.astype('float32'))
71 |     biases.append(dbn_layers[2].b.astype('float32'))
72 |     biases.append(dbn_layers[3].b.astype('float32'))
73 |     biases.append(dbn_layers[4].b.astype('float32'))
74 | 
75 |     return create_model_using_pretrained_encoder(weights, biases, input_shape, input_var, mask_shape, mask_var,
76 |                                                  lstm_size, win, output_classes)
77 | 
--------------------------------------------------------------------------------
/custom/updates.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | 
3 | import numpy as np
4 | import theano
5 | import theano.tensor as T
6 | import lasagne
7 | from lasagne import utils
8 | 
9 | 
10 | def generate_lr_map(params, lr_config, default):
11 |     """
12 |     generate a layerwise learning rate map.
13 |     To change the value of the learning rate at different epochs (e.g. for learning rate decay),
14 |     use a theano.shared variable: call shared.set_value() to set the value of the variable and
15 |     shared.get_value() to read it back.
16 |     Ensure the dtype of the shared learning rate variables is the same as the dtype of the
17 |     model weights.
18 |     Typically you can call lasagne.utils.floatX(0.001) to ensure this.
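    A minimal usage sketch (illustrative only; 'fc1' is a hypothetical layer name):

        lr_fc1 = theano.shared(utils.floatX(0.01))
        lr_map = generate_lr_map(params, {'fc1': lr_fc1}, utils.floatX(0.001))
        updates = adam_vlr(loss, params, lr_map)
        # ... later, decay that layer's learning rate in place:
        lr_fc1.set_value(utils.floatX(0.005))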
19 | 20 | :param params: model parameters 21 | :param lr_config: learning rate configuration map 22 | :param default: default value of learning rate if key not found for layer 23 | :return: learning rate map 24 | """ 25 | lr_map = {} 26 | for param in params: 27 | layer_name = param.name[:param.name.rfind('.')] 28 | if layer_name in lr_config: 29 | lr_map[param] = lr_config[layer_name] 30 | else: 31 | lr_map[param] = default 32 | return lr_map 33 | 34 | 35 | def adam_vlr(loss_or_grads, params, lr_map, beta1=0.9, 36 | beta2=0.999, epsilon=1e-8): 37 | """Adam updates with Variable Learning Rates 38 | 39 | Adam updates implemented as in [1]_. 40 | 41 | Parameters 42 | ---------- 43 | loss_or_grads : symbolic expression or list of expressions 44 | A scalar loss expression, or a list of gradient expressions 45 | params : list of shared variables 46 | The variables to generate update expressions for 47 | lr_map : dictionary of floats 48 | Learning rate map containing layer name and associated learning rate 49 | beta1 : float 50 | Exponential decay rate for the first moment estimates. 51 | beta2 : float 52 | Exponential decay rate for the second moment estimates. 53 | epsilon : float 54 | Constant for numerical stability. 55 | 56 | Returns 57 | ------- 58 | OrderedDict 59 | A dictionary mapping each parameter to its update expression 60 | 61 | Notes 62 | ----- 63 | The paper [1]_ includes an additional hyperparameter lambda. This is only 64 | needed to prove convergence of the algorithm and has no practical use 65 | (personal communication with the authors), it is therefore omitted here. 66 | 67 | References 68 | ---------- 69 | .. [1] Kingma, Diederik, and Jimmy Ba (2014): 70 | Adam: A Method for Stochastic Optimization. 71 | arXiv preprint arXiv:1412.6980. 72 | """ 73 | all_grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params) 74 | t_prev = theano.shared(utils.floatX(0.)) 75 | updates = OrderedDict() 76 | 77 | # Using theano constant to prevent upcasting of float32 78 | one = T.constant(1) 79 | 80 | t = t_prev + 1 81 | 82 | for param, g_t in zip(params, all_grads): 83 | a_t = lr_map[param]*T.sqrt(one-beta2**t)/(one-beta1**t) 84 | value = param.get_value(borrow=True) 85 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 86 | broadcastable=param.broadcastable) 87 | v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 88 | broadcastable=param.broadcastable) 89 | 90 | m_t = beta1*m_prev + (one-beta1)*g_t 91 | v_t = beta2*v_prev + (one-beta2)*g_t**2 92 | step = a_t*m_t/(T.sqrt(v_t) + epsilon) 93 | 94 | updates[m_prev] = m_t 95 | updates[v_prev] = v_t 96 | updates[param] = param - step 97 | 98 | updates[t_prev] = t 99 | return updates 100 | -------------------------------------------------------------------------------- /modelzoo/avletters_convae.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, Deconv2DLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | 6 | 7 | def create_scaled_tanh(scale_in=0.5, scale_out=2.4): 8 | """ 9 | create a scaled hyperbolic tangent to avoid saturation given input range 10 | of {-1, 1}. Refer to 11 | :param scale_in: 12 | :param scale_out: 13 | :return: scaled hyperbolic tangent callable 14 | 15 | References 16 | ---------- 17 | .. 
[1] LeCun, Yann A., et al. (1998): 18 | Efficient BackProp, 19 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 20 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 21 | .. [2] Masci, Jonathan, et al. (2011): 22 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 23 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 24 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 25 | """ 26 | return ScaledTanh(scale_in, scale_out) 27 | 28 | 29 | def extract_encoder(network): 30 | pass 31 | 32 | 33 | def create_model(incoming, options): 34 | conv_num_filters1 = 100 35 | conv_num_filters2 = 150 36 | conv_num_filters3 = 200 37 | filter_size1 = 5 38 | filter_size2 = 5 39 | filter_size3 = 3 40 | pool_size = 2 41 | encode_size = options['BOTTLENECK'] 42 | dense_mid_size = options['DENSE'] 43 | pad_in = 'valid' 44 | pad_out = 'full' 45 | scaled_tanh = create_scaled_tanh() 46 | 47 | conv2d1 = Conv2DLayer(incoming, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 48 | maxpool2d2 = MaxPool2DLayer(conv2d1, pool_size=pool_size, name='maxpool2d2') 49 | conv2d3 = Conv2DLayer(maxpool2d2, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d3', nonlinearity=scaled_tanh) 50 | maxpool2d4 = MaxPool2DLayer(conv2d3, pool_size=pool_size, name='maxpool2d4', pad=(1,0)) 51 | conv2d5 = Conv2DLayer(maxpool2d4, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d5', nonlinearity=scaled_tanh) 52 | reshape6 = ReshapeLayer(conv2d5, shape=([0], -1), name='reshape6') # 3000 53 | reshape6_output = reshape6.output_shape[1] 54 | dense7 = DenseLayer(reshape6, num_units=dense_mid_size, name='dense7', nonlinearity=scaled_tanh) 55 | bottleneck = DenseLayer(dense7, num_units=encode_size, name='bottleneck', nonlinearity=linear) 56 | # print_network(bottleneck) 57 | dense8 = DenseLayer(bottleneck, num_units=dense_mid_size, W=bottleneck.W.T, name='dense8', nonlinearity=linear) 58 | dense9 = DenseLayer(dense8, num_units=reshape6_output, W=dense7.W.T, nonlinearity=scaled_tanh, name='dense9') 59 | reshape10 = ReshapeLayer(dense9, shape=([0], conv_num_filters3, 3, 5), name='reshape10') # 32 x 4 x 7 60 | deconv2d11 = Deconv2DLayer(reshape10, conv2d5.input_shape[1], conv2d5.filter_size, stride=conv2d5.stride, 61 | W=conv2d5.W, flip_filters=not conv2d5.flip_filters, name='deconv2d11', nonlinearity=scaled_tanh) 62 | upscale2d12 = Upscale2DLayer(deconv2d11, scale_factor=pool_size, name='upscale2d12') 63 | deconv2d13 = Deconv2DLayer(upscale2d12, conv2d3.input_shape[1], conv2d3.filter_size, stride=conv2d3.stride, 64 | W=conv2d3.W, flip_filters=not conv2d3.flip_filters, name='deconv2d13', nonlinearity=scaled_tanh) 65 | upscale2d14 = Upscale2DLayer(deconv2d13, scale_factor=pool_size, name='upscale2d14') 66 | deconv2d15 = Deconv2DLayer(upscale2d14, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride, 67 | crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh) 68 | reshape16 = ReshapeLayer(deconv2d15, ([0], -1), name='reshape16') 69 | return reshape16, bottleneck 70 | -------------------------------------------------------------------------------- /modelzoo/avletters_convae_bn.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import 
rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, Deconv2DLayer, DropoutLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | from utils.plotting_utils import print_network 6 | 7 | 8 | def create_scaled_tanh(scale_in=0.5, scale_out=2.4): 9 | """ 10 | create a scaled hyperbolic tangent to avoid saturation given input range 11 | of {-1, 1}. Refer to 12 | :param scale_in: 13 | :param scale_out: 14 | :return: scaled hyperbolic tangent callable 15 | 16 | References 17 | ---------- 18 | .. [1] LeCun, Yann A., et al. (1998): 19 | Efficient BackProp, 20 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 21 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 22 | .. [2] Masci, Jonathan, et al. (2011): 23 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 24 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 25 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 26 | """ 27 | return ScaledTanh(scale_in, scale_out) 28 | 29 | 30 | def extract_encoder(network): 31 | pass 32 | 33 | 34 | def create_model(incoming, options): 35 | conv_num_filters1 = 100 36 | conv_num_filters2 = 150 37 | conv_num_filters3 = 200 38 | filter_size1 = 5 39 | filter_size2 = 5 40 | filter_size3 = 3 41 | pool_size = 2 42 | encode_size = options['BOTTLENECK'] 43 | dense_mid_size = options['DENSE'] 44 | pad_in = 'valid' 45 | pad_out = 'full' 46 | scaled_tanh = create_scaled_tanh() 47 | 48 | conv2d1 = Conv2DLayer(incoming, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 49 | maxpool2d3 = MaxPool2DLayer(conv2d1, pool_size=pool_size, name='maxpool2d3') 50 | bn2 = BatchNormLayer(maxpool2d3, name='batchnorm2') 51 | conv2d4 = Conv2DLayer(bn2, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d4', nonlinearity=scaled_tanh) 52 | maxpool2d6 = MaxPool2DLayer(conv2d4, pool_size=pool_size, name='maxpool2d6', pad=(1,0)) 53 | bn3 = BatchNormLayer(maxpool2d6, name='batchnorm3') 54 | conv2d7 = Conv2DLayer(bn3, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d7', nonlinearity=scaled_tanh) 55 | reshape9 = ReshapeLayer(conv2d7, shape=([0], -1), name='reshape9') # 3000 56 | reshape9_output = reshape9.output_shape[1] 57 | bn8 = BatchNormLayer(reshape9, name='batchnorm8') 58 | dense10 = DenseLayer(bn8, num_units=dense_mid_size, name='dense10', nonlinearity=scaled_tanh) 59 | bn11 = BatchNormLayer(dense10, name='batchnorm11') 60 | bottleneck = DenseLayer(bn11, num_units=encode_size, name='bottleneck', nonlinearity=linear) 61 | # print_network(bottleneck) 62 | dense12 = DenseLayer(bottleneck, num_units=dense_mid_size, W=bottleneck.W.T, name='dense12', nonlinearity=linear) 63 | dense13 = DenseLayer(dense12, num_units=reshape9_output, W=dense10.W.T, nonlinearity=scaled_tanh, name='dense13') 64 | reshape14 = ReshapeLayer(dense13, shape=([0], conv_num_filters3, 3, 5), name='reshape14') # 32 x 4 x 7 65 | deconv2d19 = Deconv2DLayer(reshape14, conv2d7.input_shape[1], conv2d7.filter_size, stride=conv2d7.stride, 66 | W=conv2d7.W, flip_filters=not conv2d7.flip_filters, name='deconv2d19', nonlinearity=scaled_tanh) 67 | upscale2d16 = Upscale2DLayer(deconv2d19, scale_factor=pool_size, name='upscale2d16') 68 | deconv2d17 = Deconv2DLayer(upscale2d16, conv2d4.input_shape[1], conv2d4.filter_size, stride=conv2d4.stride, 69 | W=conv2d4.W, flip_filters=not conv2d4.flip_filters, name='deconv2d17', nonlinearity=scaled_tanh) 70 | upscale2d18 = 
Upscale2DLayer(deconv2d17, scale_factor=pool_size, name='upscale2d18')
71 |     deconv2d19 = Deconv2DLayer(upscale2d18, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride,
72 |                                crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh)
73 |     reshape20 = ReshapeLayer(deconv2d19, ([0], -1), name='reshape20')
74 |     return reshape20, bottleneck
75 | 
--------------------------------------------------------------------------------
/runners/extract_lstm_from_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../')
4 | import numpy as np
5 | import theano.tensor as T
6 | import argparse
7 | from modelzoo import deltanet_majority_vote
8 | from utils.io import save_mat
9 | from custom.nonlinearities import select_nonlinearity
10 | 
11 | 
12 | def parse_options():
13 |     options = dict()
14 |     options['shape'] = '2000,1000,500,50'
15 |     options['nonlinearities'] = 'rectify,rectify,rectify,linear'
16 |     options['input_dim'] = 1200
17 |     options['lstm_size'] = 250
18 |     options['output_classes'] = 26
19 |     options['layer_names'] = 'f_lstm,b_lstm'
20 |     options['use_blstm'] = False
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument('--shape', help='shape of encoder. Default: 2000,1000,500,50')
23 |     parser.add_argument('--input_dim', help='input dimension. Default: 1200')
24 |     parser.add_argument('--nonlinearities', help='nonlinearities used by encoder. '
25 |                         'Default: rectify,rectify,rectify,linear')
26 |     parser.add_argument('--output', help='output file to write results')
27 |     parser.add_argument('--lstm_size', help='lstm layer size. Default: 250')
28 |     parser.add_argument('--output_classes', help='number of output classes. Default: 26')
29 |     parser.add_argument('--layer_names', help='names of lstm layers to extract')
30 |     parser.add_argument('--use_blstm', help='use blstm')
31 |     parser.add_argument('input', help='input model.pkl file')
32 | 
33 |     args = parser.parse_args()
34 |     options['input'] = args.input
35 |     if args.shape:
36 |         options['shape'] = args.shape
37 |     if args.input_dim:
38 |         options['input_dim'] = int(args.input_dim)
39 |     if args.nonlinearities:
40 |         options['nonlinearities'] = args.nonlinearities
41 |     if args.lstm_size:
42 |         options['lstm_size'] = int(args.lstm_size)
43 |     if args.output_classes:
44 |         options['output_classes'] = int(args.output_classes)
45 |     if args.output:
46 |         options['output'] = args.output
47 |     if args.layer_names:
48 |         options['layer_names'] = args.layer_names
49 |     if args.use_blstm:
50 |         options['use_blstm'] = True
51 |     return options
52 | 
53 | 
54 | def main():
55 |     options = parse_options()
56 |     print(options)
57 |     window = T.iscalar('theta')
58 |     inputs1 = T.tensor3('inputs1', dtype='float32')
59 |     mask = T.matrix('mask', dtype='uint8')
60 |     shape = [int(i) for i in options['shape'].split(',')]
61 |     nonlinearities = [select_nonlinearity(s) for s in options['nonlinearities'].split(',')]
62 |     layer_names = options['layer_names'].split(',')
63 |     network = deltanet_majority_vote.load_saved_model(options['input'],
64 |                                                       (shape, nonlinearities),
65 |                                                       (None, None, options['input_dim']), inputs1, (None, None), mask,
66 |                                                       options['lstm_size'], window, options['output_classes'],
67 |                                                       use_blstm=options['use_blstm'])
68 |     d = deltanet_majority_vote.extract_lstm_weights(network, layer_names, ['f_lstm', 'b_lstm'])
69 |     expected_keys = ['f_lstm_w_hid_to_cell', 'f_lstm_w_hid_to_forgetgate', 'f_lstm_w_hid_to_ingate',
70 |                      'f_lstm_w_hid_to_outgate', 'f_lstm_w_in_to_cell', 'f_lstm_w_in_to_forgetgate',
71 |                      'f_lstm_w_in_to_ingate', 'f_lstm_w_in_to_outgate', 'f_lstm_b_cell', 'f_lstm_b_forgetgate',
72 |                      'f_lstm_b_ingate', 'f_lstm_b_outgate',
73 |                      'b_lstm_w_hid_to_cell', 'b_lstm_w_hid_to_forgetgate',
74 |                      'b_lstm_w_hid_to_ingate', 'b_lstm_w_hid_to_outgate', 'b_lstm_w_in_to_cell', 'b_lstm_w_in_to_forgetgate',
75 |                      'b_lstm_w_in_to_ingate', 'b_lstm_w_in_to_outgate', 'b_lstm_b_cell', 'b_lstm_b_forgetgate',
76 |                      'b_lstm_b_ingate', 'b_lstm_b_outgate']
77 |     keys = d.keys()
78 |     for k in keys:
79 |         assert k in expected_keys
80 |         assert type(d[k]) == np.ndarray
81 |     if 'output' in options:
82 |         print('save extracted weights to {}'.format(options['output']))
83 |         save_mat(d, options['output'])
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     main()
88 | 
--------------------------------------------------------------------------------
/test/test_gen_batch_from_file.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | import numpy as np
4 | from utils.io import load_mat_file
5 | from utils.datagen import gen_batch_from_file
6 | 
7 | 
8 | class TestGenBatchFromFile(unittest.TestCase):
9 |     def test_large_batch(self):
10 |         """
11 |         test large batch where train data is larger than batch size
12 |         :return: batch with data equal to batch size
13 |         """
14 |         # load the test file, and preprocess the path separator and dimensions
15 |         data = load_mat_file('../5words/data/'
16 |                              'data5Words_mouthROIs_basedOnMouthCenter_1pointAndMouthEyesCenter_filenames.mat')
17 |         filenames = data['filenamePaths'].flatten()
18 |         vidlens = data['videoLengthVec'].flatten()
19 |         targets = (data['targetsPerVideoVec'].flatten())
20 |         train_idxs = 
data['subjectsVec'].flatten() == 1 21 | val_idxs = data['subjectsVec'].flatten() == 2 22 | test_idxs = data['subjectsVec'].flatten() == 3 23 | train_vidlens = vidlens[train_idxs] 24 | val_vidlens = vidlens[val_idxs] 25 | test_vidlens = vidlens[test_idxs] 26 | train_targets = targets[train_idxs] - 1 27 | val_targets = targets[val_idxs] - 1 28 | test_targets = targets[test_idxs] - 1 29 | 30 | # change the file path format and add path prefix to locate file 31 | def prepare_filepaths(f): 32 | return os.path.join('../5words/data', str(f[0].replace('\\', '/'))) 33 | 34 | # apply to all entries in file lists 35 | vfunc = np.vectorize(prepare_filepaths) 36 | filenames = vfunc(filenames) 37 | 38 | # generate splits 39 | training_files = filenames[train_idxs] 40 | val_files = filenames[val_idxs] 41 | test_files = filenames[test_idxs] 42 | datagen = gen_batch_from_file(training_files, train_targets, train_vidlens, 5551) 43 | 44 | for i in range(165): 45 | X_batch, y_batch, mask, idx = next(datagen) 46 | assert X_batch.shape == (30, 29, 5551) 47 | assert y_batch.shape == (30,) 48 | assert mask.shape == (30, 29) 49 | assert idx.shape == (30,) 50 | # remainder 4959 % 30 51 | remainder_batchsize = 4959 % 30 52 | X_batch, y_batch, mask, idx = next(datagen) 53 | assert X_batch.shape == (remainder_batchsize, 29, 5551) 54 | assert y_batch.shape == (remainder_batchsize,) 55 | assert mask.shape == (remainder_batchsize, 29) 56 | assert idx.shape == (remainder_batchsize,) 57 | 58 | def test_small_batch(self): 59 | """ 60 | test when training data is smaller than batch size 61 | :return: batch of length equal to train data len 62 | """ 63 | # load the test file, and preprocess the path separator and dimensions 64 | data = load_mat_file('../5words/data/' 65 | 'data5Words_mouthROIs_basedOnMouthCenter_1pointAndMouthEyesCenter_filenames.mat') 66 | filenames = data['filenamePaths'].flatten() 67 | vidlens = data['videoLengthVec'].flatten() 68 | targets = (data['targetsPerVideoVec'].flatten()) 69 | train_idxs = data['subjectsVec'].flatten() == 1 70 | val_idxs = data['subjectsVec'].flatten() == 2 71 | test_idxs = data['subjectsVec'].flatten() == 3 72 | train_vidlens = vidlens[train_idxs] 73 | val_vidlens = vidlens[val_idxs] 74 | test_vidlens = vidlens[test_idxs] 75 | train_targets = targets[train_idxs] - 1 76 | val_targets = targets[val_idxs] - 1 77 | test_targets = targets[test_idxs] - 1 78 | 79 | def prepare_filepaths(f): 80 | return os.path.join('../5words/data', str(f[0].replace('\\', '/'))) 81 | 82 | vfunc = np.vectorize(prepare_filepaths) 83 | filenames = vfunc(filenames) 84 | training_files = filenames[train_idxs] 85 | val_files = filenames[val_idxs] 86 | test_files = filenames[test_idxs] 87 | datagen = gen_batch_from_file(training_files[:10], train_targets[:10], train_vidlens[:10], 5551) 88 | X_batch, y_batch, mask, idx = next(datagen) 89 | 90 | assert X_batch.shape == (10, 29, 5551) 91 | assert y_batch.shape == (10,) 92 | assert mask.shape == (10, 29) 93 | assert idx.shape == (10,) 94 | 95 | 96 | if __name__ == '__main__': 97 | unittest.main() -------------------------------------------------------------------------------- /modelzoo/avletters_convae_drop.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, 
Deconv2DLayer, DropoutLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | 6 | 7 | def create_scaled_tanh(scale_in=0.5, scale_out=2.4): 8 | """ 9 | create a scaled hyperbolic tangent to avoid saturation given input range 10 | of {-1, 1}. Refer to 11 | :param scale_in: 12 | :param scale_out: 13 | :return: scaled hyperbolic tangent callable 14 | 15 | References 16 | ---------- 17 | .. [1] LeCun, Yann A., et al. (1998): 18 | Efficient BackProp, 19 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 20 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 21 | .. [2] Masci, Jonathan, et al. (2011): 22 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 23 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 24 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 25 | """ 26 | return ScaledTanh(scale_in, scale_out) 27 | 28 | 29 | def extract_encoder(network): 30 | pass 31 | 32 | 33 | def create_model(incoming, options): 34 | input_p = 0.2 35 | hidden_p = 0.5 36 | conv_num_filters1 = int(100 / (1.0 - input_p)) 37 | conv_num_filters2 = int(150 / (1.0 - hidden_p)) 38 | conv_num_filters3 = int(200 / (1.0 - hidden_p)) 39 | filter_size1 = 5 40 | filter_size2 = 5 41 | filter_size3 = 3 42 | pool_size = 2 43 | encode_size = int(options['BOTTLENECK'] / 0.5) 44 | dense_mid_size = int(options['DENSE'] / 0.5) 45 | pad_in = 'valid' 46 | pad_out = 'full' 47 | scaled_tanh = create_scaled_tanh() 48 | dropout0 = DropoutLayer(incoming, p=0.2, name='dropout0') 49 | conv2d1 = Conv2DLayer(dropout0, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 50 | maxpool2d2 = MaxPool2DLayer(conv2d1, pool_size=pool_size, name='maxpool2d2') 51 | dropout1 = DropoutLayer(maxpool2d2, name='dropout1') 52 | conv2d3 = Conv2DLayer(dropout1, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d3', nonlinearity=scaled_tanh) 53 | maxpool2d4 = MaxPool2DLayer(conv2d3, pool_size=pool_size, name='maxpool2d4', pad=(1,0)) 54 | dropout2 = DropoutLayer(maxpool2d4, name='dropout2') 55 | conv2d5 = Conv2DLayer(dropout2, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d5', nonlinearity=scaled_tanh) 56 | reshape6 = ReshapeLayer(conv2d5, shape=([0], -1), name='reshape6') # 3000 57 | reshape6_output = reshape6.output_shape[1] 58 | dropout3 = DropoutLayer(reshape6, name='dropout3') 59 | dense7 = DenseLayer(dropout3, num_units=dense_mid_size, name='dense7', nonlinearity=scaled_tanh) 60 | dropout4 = DropoutLayer(dense7, name='dropout4') 61 | bottleneck = DenseLayer(dropout4, num_units=encode_size, name='bottleneck', nonlinearity=linear) 62 | # print_network(bottleneck) 63 | dense8 = DenseLayer(bottleneck, num_units=dense_mid_size, W=bottleneck.W.T, name='dense8', nonlinearity=linear) 64 | dense9 = DenseLayer(dense8, num_units=reshape6_output, W=dense7.W.T, nonlinearity=scaled_tanh, name='dense9') 65 | reshape10 = ReshapeLayer(dense9, shape=([0], conv_num_filters3, 3, 5), name='reshape10') # 32 x 4 x 7 66 | deconv2d11 = Deconv2DLayer(reshape10, conv2d5.input_shape[1], conv2d5.filter_size, stride=conv2d5.stride, 67 | W=conv2d5.W, flip_filters=not conv2d5.flip_filters, name='deconv2d11', nonlinearity=scaled_tanh) 68 | upscale2d12 = Upscale2DLayer(deconv2d11, scale_factor=pool_size, name='upscale2d12') 69 | deconv2d13 = Deconv2DLayer(upscale2d12, conv2d3.input_shape[1], conv2d3.filter_size, stride=conv2d3.stride, 70 | W=conv2d3.W, flip_filters=not conv2d3.flip_filters, name='deconv2d13', 
nonlinearity=scaled_tanh) 71 | upscale2d14 = Upscale2DLayer(deconv2d13, scale_factor=pool_size, name='upscale2d14') 72 | deconv2d15 = Deconv2DLayer(upscale2d14, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride, 73 | crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh) 74 | reshape16 = ReshapeLayer(deconv2d15, ([0], -1), name='reshape16') 75 | return reshape16, bottleneck 76 | -------------------------------------------------------------------------------- /utils/signal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | from lasagne.utils import unroll_scan 5 | 6 | 7 | def delta_theta(theta, curr_delta, t, THETA, Y): 8 | """ 9 | compute a delta theta component at delta time step t 10 | :param theta: current time step theta component 11 | :param curr_delta: current accumulated delta_t 12 | :param t: current delta_t to be computed 13 | :param THETA: window size 14 | :param Y: input sequence 15 | :return: delta theta component for time step t 16 | """ 17 | # accumulator is shaped (1, no_features), transpose to perform column wise element operations 18 | temp = curr_delta.T 19 | d_theta = theta * (Y[:, THETA + t + theta] - Y[:, THETA + t - theta]) / (2 * theta * theta) 20 | temp += d_theta 21 | temp = temp.astype('float32') 22 | curr_delta = temp.T 23 | return curr_delta 24 | 25 | 26 | def delta_t(t, THETA, Y): 27 | """ 28 | compute delta at time step t 29 | :param t: time step 30 | :param THETA: window size 31 | :param Y: sequence in shape (number_of_features, time_step) 32 | :return: delta coefficient at time step t 33 | """ 34 | theta = T.arange(1, THETA + 1, dtype='int32') 35 | results, _ = theano.scan(delta_theta, outputs_info=T.zeros_like(Y), 36 | sequences=theta, non_sequences=[t, THETA, Y]) 37 | # only interested in the final results, discard the intermediate values 38 | final_results = results[-1] 39 | return final_results 40 | 41 | 42 | def delta_coeff(A, theta): 43 | """ 44 | compute delta coefficients given a sequence. 45 | :param A: input sequence in shape (time_step, number_of_features) 46 | :param theta: window size 47 | :return: delta coefficients for the input sequence 48 | """ 49 | # transpose and repeat 50 | X = A.T 51 | Y = T.concatenate([T.extra_ops.repeat(X[:, 0], theta).reshape((X.shape[0], theta)), 52 | X, T.extra_ops.repeat(X[:, -1], theta).reshape((X.shape[0], theta))], axis=1) 53 | delta, _ = theano.scan(delta_t, sequences=[T.arange(0, X.shape[1], dtype='int32')], non_sequences=[theta, Y]) 54 | # transpose the results back to shape (time_step, number_of_features) 55 | delta = delta[:, :, -1].reshape(A.shape) 56 | return delta 57 | 58 | 59 | def append_delta_coeff(A, theta): 60 | """ 61 | append delta + acceleration coefficients given a sequence. 
62 | :param A: input sequence in shape (time_step, number_of_features) 63 | :param theta: window size 64 | :return: delta + acceleration coefficients for the input sequence 65 | """ 66 | # transpose and repeat 67 | X = A.T 68 | Y = T.concatenate([T.extra_ops.repeat(X[:, 0], theta).reshape((X.shape[0], theta)), 69 | X, T.extra_ops.repeat(X[:, -1], theta).reshape((X.shape[0], theta))], axis=1) 70 | delta, _ = theano.scan(delta_t, sequences=[T.arange(0, X.shape[1], dtype='int32')], non_sequences=[theta, Y]) 71 | # transpose the results back to shape (time_step, number_of_features) 72 | delta = delta[:, :, -1].reshape(A.shape) 73 | 74 | X = delta.T 75 | Y = T.concatenate([T.extra_ops.repeat(X[:, 0], theta).reshape((X.shape[0], theta)), 76 | X, T.extra_ops.repeat(X[:, -1], theta).reshape((X.shape[0], theta))], axis=1) 77 | acc, _ = theano.scan(delta_t, sequences=[T.arange(0, X.shape[1], dtype='int32')], non_sequences=[theta, Y]) 78 | acc = acc[:, :, -1].reshape(A.shape) 79 | res = T.concatenate([A, delta, acc], axis=1) 80 | return res 81 | 82 | 83 | def main(): 84 | """ 85 | test runner, computes delta for an array of sequences 86 | :return: None 87 | """ 88 | A = T.tensor3('A', dtype='float32') 89 | theta = T.iscalar('theta') 90 | 91 | # compute delta coefficients for multiple sequences 92 | results, updates = theano.scan(append_delta_coeff, sequences=A, non_sequences=theta) 93 | compute_deltas = theano.function([A, theta], outputs=results, updates=updates) 94 | 95 | seqs = np.array([[[1, 2, 3, 4, 5], 96 | [10, 12, 13, 14, 15], 97 | [300, 1, 23, 56, 22]], 98 | [[1, 1, 1, 1, 1], 99 | [1, 1, 100, 1, 1], 100 | [1, 1, 1, 1, 1]]], dtype='float32') 101 | res = compute_deltas(seqs, 1) 102 | print(res) 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer, create_blstm 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 13 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta)'), 14 | output_classes=26, fusiontype='sum', w_init_fn=las.init.GlorotUniform(), 15 | use_peepholes=False, nonlinearities=rectify): 16 | 17 | weights, biases, shapes, nonlinearities = dbn 18 | names = ['fc1', 'fc2', 'fc3', 'bottleneck'] 19 | 20 | gate_parameters = Gate( 21 | W_in=w_init_fn, W_hid=w_init_fn, 22 | b=las.init.Constant(0.)) 23 | cell_parameters = Gate( 24 | W_in=w_init_fn, W_hid=w_init_fn, 25 | # Setting W_cell to None denotes that no cell connection will be used. 26 | W_cell=None, b=las.init.Constant(0.), 27 | # By convention, the cell nonlinearity is tanh in an LSTM. 
28 |         nonlinearity=tanh)
29 | 
30 |     l_in = InputLayer(input_shape, input_var, 'input')
31 |     l_mask = InputLayer(mask_shape, mask_var, 'mask')
32 |     l_dct = InputLayer(dct_shape, dct_var, 'dct')
33 | 
34 |     symbolic_batchsize = l_in.input_var.shape[0]
35 |     symbolic_seqlen = l_in.input_var.shape[1]
36 | 
37 |     l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
38 |     l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes, nonlinearities, names)
39 |     encoder_len = las.layers.get_output_shape(l_encoder)[-1]
40 |     l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
41 |     l_delta = DeltaLayer(l_reshape2, win, name='delta')
42 | 
43 |     l_delta_dct = DeltaLayer(l_dct, win, name='delta_dct')
44 | 
45 |     l_lstm_bn = LSTMLayer(
46 |         l_delta, lstm_size, peepholes=use_peepholes,
47 |         # We need to specify a separate input for masks
48 |         mask_input=l_mask,
49 |         # Here, we supply the gate parameters for each gate
50 |         ingate=gate_parameters, forgetgate=gate_parameters,
51 |         cell=cell_parameters, outgate=gate_parameters,
52 |         # We'll learn the initialization and use gradient clipping
53 |         learn_init=True, grad_clipping=5., name='lstm_bn')
54 | 
55 |     l_lstm_dct = LSTMLayer(
56 |         l_delta_dct, lstm_size, peepholes=use_peepholes,
57 |         # We need to specify a separate input for masks
58 |         mask_input=l_mask,
59 |         # Here, we supply the gate parameters for each gate
60 |         ingate=gate_parameters, forgetgate=gate_parameters,
61 |         cell=cell_parameters, outgate=gate_parameters,
62 |         # We'll learn the initialization and use gradient clipping
63 |         learn_init=True, grad_clipping=5., name='lstm_dct')
64 | 
65 |     # We'll combine the outputs of the two stream LSTMs using the selected fusion method.
66 |     # Merge layers take in lists of layers to merge as input.
67 | 
68 |     if fusiontype == 'sum':
69 |         l_fuse = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1')
70 |     elif fusiontype == 'adasum':
71 |         l_fuse = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum')
72 |     elif fusiontype == 'concat':
73 |         l_fuse = ConcatLayer([l_lstm_bn, l_lstm_dct], axis=2, name='concat')
74 |     else:
75 |         raise ValueError('Unsupported Fusion Type used!')
76 | 
77 |     f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg')
78 | 
79 |     l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')
80 | 
81 |     # reshape to (num_examples * seq_len, lstm_size)
82 |     l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3')
83 | 
84 |     # l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1')
85 | 
86 |     # Now, we can apply feed-forward layers as usual.
87 |     # We want the network to predict a classification for the sequence,
88 |     # so we'll use the number of classes as the output size.
89 | l_softmax = DenseLayer( 90 | l_reshape3, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 91 | 92 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 93 | 94 | return l_out, l_fuse 95 | -------------------------------------------------------------------------------- /modelzoo/avletters_convae_bndrop.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, Deconv2DLayer, DropoutLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | 6 | 7 | def create_scaled_tanh(scale_in=2./3, scale_out=1.7159): 8 | """ 9 | create a scaled hyperbolic tangent to avoid saturation given input range 10 | of {-1, 1}. Refer to 11 | :param scale_in: 12 | :param scale_out: 13 | :return: scaled hyperbolic tangent callable 14 | 15 | References 16 | ---------- 17 | .. [1] LeCun, Yann A., et al. (1998): 18 | Efficient BackProp, 19 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 20 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 21 | .. [2] Masci, Jonathan, et al. (2011): 22 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 23 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 24 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 25 | """ 26 | return ScaledTanh(scale_in, scale_out) 27 | 28 | 29 | def extract_encoder(network): 30 | pass 31 | 32 | 33 | def create_model(incoming, options): 34 | conv_num_filters1 = 100 35 | conv_num_filters2 = 150 36 | conv_num_filters3 = 200 37 | filter_size1 = 5 38 | filter_size2 = 5 39 | filter_size3 = 3 40 | pool_size = 2 41 | encode_size = options['BOTTLENECK'] 42 | dense_mid_size = options['DENSE'] 43 | pad_in = 'valid' 44 | pad_out = 'full' 45 | scaled_tanh = create_scaled_tanh() 46 | dropout0 = DropoutLayer(incoming, p=0.2, name='dropout0') 47 | conv2d1 = Conv2DLayer(dropout0, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 48 | bn1 = BatchNormLayer(conv2d1, name='batchnorm1') 49 | maxpool2d2 = MaxPool2DLayer(bn1, pool_size=pool_size, name='maxpool2d2') 50 | dropout1 = DropoutLayer(maxpool2d2, name='dropout1') 51 | conv2d3 = Conv2DLayer(dropout1, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d3', nonlinearity=scaled_tanh) 52 | bn2 = BatchNormLayer(conv2d3, name='batchnorm2') 53 | maxpool2d4 = MaxPool2DLayer(bn2, pool_size=pool_size, name='maxpool2d4', pad=(1,0)) 54 | dropout2 = DropoutLayer(maxpool2d4, name='dropout2') 55 | conv2d5 = Conv2DLayer(dropout2, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d5', nonlinearity=scaled_tanh) 56 | bn3 = BatchNormLayer(conv2d5, name='batchnorm3') 57 | reshape6 = ReshapeLayer(bn3, shape=([0], -1), name='reshape6') # 3000 58 | reshape6_output = reshape6.output_shape[1] 59 | dropout3 = DropoutLayer(reshape6, name='dropout3') 60 | dense7 = DenseLayer(dropout3, num_units=dense_mid_size, name='dense7', nonlinearity=scaled_tanh) 61 | bn4 = BatchNormLayer(dense7, name='batchnorm4') 62 | dropout4 = DropoutLayer(bn4, name='dropout4') 63 | bottleneck = DenseLayer(dropout4, num_units=encode_size, name='bottleneck', nonlinearity=linear) 64 | # print_network(bottleneck) 65 | dense8 = DenseLayer(bottleneck, 
num_units=dense_mid_size, W=bottleneck.W.T, name='dense8', nonlinearity=linear)
66 |     dense9 = DenseLayer(dense8, num_units=reshape6_output, W=dense7.W.T, nonlinearity=scaled_tanh, name='dense9')
67 |     reshape10 = ReshapeLayer(dense9, shape=([0], conv_num_filters3, 3, 5), name='reshape10')  # 32 x 4 x 7
68 |     deconv2d11 = Deconv2DLayer(reshape10, conv2d5.input_shape[1], conv2d5.filter_size, stride=conv2d5.stride,
69 |                                W=conv2d5.W, flip_filters=not conv2d5.flip_filters, name='deconv2d11', nonlinearity=scaled_tanh)
70 |     upscale2d12 = Upscale2DLayer(deconv2d11, scale_factor=pool_size, name='upscale2d12')
71 |     deconv2d13 = Deconv2DLayer(upscale2d12, conv2d3.input_shape[1], conv2d3.filter_size, stride=conv2d3.stride,
72 |                                W=conv2d3.W, flip_filters=not conv2d3.flip_filters, name='deconv2d13', nonlinearity=scaled_tanh)
73 |     upscale2d14 = Upscale2DLayer(deconv2d13, scale_factor=pool_size, name='upscale2d14')
74 |     deconv2d15 = Deconv2DLayer(upscale2d14, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride,
75 |                                crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh)
76 |     reshape16 = ReshapeLayer(deconv2d15, ([0], -1), name='reshape16')
77 |     return reshape16, bottleneck
78 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Speech Recognition (AdeNet)
2 | This page provides instructions to install the necessary packages to run the
3 | experiments described in the project on Visual Speech Recognition using Deep Learning.
4 | 
5 | ## Installing
6 | To run the codes, the following dependencies are required:
7 | 
8 | 1. miniconda2
9 | 2. matplotlib
10 | 3. pydotplus
11 | 4. tabulate
12 | 5. scikit-learn
13 | 6. ipython
14 | 7. pillow
15 | 8. theano
16 | 9. lasagne
17 | 10. nolearn
18 | 
19 | It is suggested that you use miniconda to set up a virtual environment before running the codes,
20 | to prevent the packages from interfering with your current python environment.
21 | Miniconda can be downloaded from http://conda.pydata.org/miniconda.html.
22 | To install the necessary dependencies you can use the following bash script:
23 | 
24 | ```
25 | #!/bin/bash
26 | ./Miniconda2-latest-Linux-x86_64.sh
27 | conda create -n ip-avsr python
28 | source activate ip-avsr
29 | 
30 | pip install matplotlib pydotplus tabulate scikit-learn ipython pillow
31 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip
32 | pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip
33 | pip install git+https://github.com/dnouri/nolearn.git@master#egg=nolearn==0.7.git
34 | ```
35 | 
36 | which creates a virtual environment ip-avsr, activates the virtual environment and installs all
37 | the necessary python packages into this virtual environment.
38 | 
39 | ## Code Structure
40 | The source codes for the different datasets are separated into individual folders named after the
41 | dataset (`avletters, ouluvs, cuave`). All learning models can be found in the folder `modelzoo` and
42 | can be imported into code as a python package. Custom neural network layers can be found in the
43 | package `custom` and the `utils` package contains utility functions such as plotting,
44 | drawing network layers and image preprocessing functions for normalization and computing delta coefficients.
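As a minimal usage sketch, a model can be imported and built from `modelzoo` as follows (the 1200-dimensional input and the symbolic variable names here are illustrative, following the `create_model` signature in `modelzoo/lstm_classifier_baseline.py`):

```
import theano.tensor as T
from modelzoo import lstm_classifier_baseline

inputs = T.tensor3('inputs', dtype='float32')  # (batch, seq_len, feature_dim)
mask = T.matrix('mask', dtype='uint8')         # (batch, seq_len)
network = lstm_classifier_baseline.create_model((None, None, 1200), inputs,
                                                (None, None), mask,
                                                lstm_size=250, output_classes=26)
```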
45 | 
46 | ## Datasets
47 | Within each dataset folder, the codes are further grouped into 3 folders. The data folder contains
48 | all the mouth ROIs, DCT features and Image Differences extracted for the individual dataset.
49 | The format used is MatLab’s `.mat` format to allow interchangeability between MatLab and python, as the
50 | pretraining stage requires the use of MatLab DBN code.
51 | The model folder contains all pretrained, finetuned and trained networks so they can be easily reloaded
52 | in future without the need to retrain them from scratch. The config folder contains a list of `.ini` config files
53 | that are used for different models (**DeltaNet, AdeNet v1, AdeNet v2**). A list of options is provided below.
54 | The training programs are called unimodal.py, bimodal.py and trimodal.py for single stream, double stream
55 | and triple stream input sources respectively.
56 | All training codes accept a config file via the option `--config`. Type `python trimodal.py -h` to see usage options.
57 | 
58 | ```
59 | usage: trimodal.py [-h] [--config CONFIG] [--write_results WRITE_RESULTS]
60 | optional arguments:
61 |   -h, --help            show this help message and exit
62 |   --config CONFIG       config file to use, default=config/trimodal.ini
63 |   --write_results WRITE_RESULTS   write results to file
64 | ```
65 | 
66 | ## Config File Options
67 | Under the `data` section:
68 | - images: raw image ROIs used to extract DBNFs.
69 | - dct: DCT features with delta coefficients appended.
70 | - diff: diff image ROIs used for the difference of images input source.
71 | 
72 | Under the `models` section:
73 | - pretrained: pretrained DBNF extractor DBN network for raw images.
74 | - finetuned: finetuned DBNF extractor DBN network for raw images.
75 | - pretrained_diff: pretrained DBNF extractor DBN network for difference of images.
76 | - finetuned_diff: finetuned DBNF extractor DBN network for difference of images.
77 | - fusiontype: the fusion method to use to combine the different input sources.
78 | 
79 | Under the `training` section:
80 | - learning_rate: learning rate used to train the model.
81 | - decay_rate: learning rate decay applied at each epoch after decay_start.
82 | - decay_start: epoch at which to start learning rate decay.
83 | - do_finetune: whether to perform finetuning of the DBNF extractor.
84 | - save_finetune: save the finetuned model of the raw image DBNF extractor.
85 | - load_finetune: load the finetuned model of the raw image DBNF extractor.
86 | - load_finetune_diff: load the finetuned model of the image differences DBNF extractor.
87 | - output_units: number of output classes.
88 | - lstm_units: number of hidden units used in the LSTM classifiers.
--------------------------------------------------------------------------------
/utils/lcn.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 | from theano.tensor.nnet import conv
5 | import matplotlib.pyplot as plt
6 | import pylab
7 | 
8 | 
9 | def gaussian_filter(kernel_shape):
10 |     x = np.zeros((kernel_shape, kernel_shape), dtype='float32')
11 | 
12 |     def gauss(x, y, sigma=2.0):
13 |         Z = 2 * np.pi * sigma ** 2
14 |         return 1. / Z * np.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))
15 | 
16 |     mid = np.floor(kernel_shape / 2.)
17 | for i in xrange(0, kernel_shape): 18 | for j in xrange(0, kernel_shape): 19 | x[i, j] = gauss(i - mid, j - mid) 20 | 21 | return x / np.sum(x) 22 | 23 | 24 | def lecun_lcn(input, img_shape, kernel_shape, threshold=1e-4): 25 | input = input.reshape(input.shape[0], 1, img_shape[0], img_shape[1]) 26 | X = T.matrix(dtype=theano.config.floatX) 27 | X = X.reshape(input.shape) 28 | 29 | filter_shape = (1, 1, kernel_shape, kernel_shape) 30 | filters = gaussian_filter(kernel_shape).reshape(filter_shape) 31 | 32 | convout = conv.conv2d(input=X, 33 | filters=filters, 34 | image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]), 35 | filter_shape=filter_shape, 36 | border_mode='full') 37 | 38 | # For each pixel, remove mean of 9x9 neighborhood 39 | mid = int(np.floor(kernel_shape / 2.)) 40 | centered_X = X - convout[:, :, mid:-mid, mid:-mid] 41 | centered_X = X - convout[:, :, mid:-mid, mid:-mid] 42 | 43 | # Scale down norm of 9x9 patch if norm is bigger than 1 44 | sum_sqr_XX = conv.conv2d(input=centered_X ** 2, 45 | filters=filters, 46 | image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]), 47 | filter_shape=filter_shape, 48 | border_mode='full') 49 | 50 | denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid]) 51 | per_img_mean = denom.mean(axis=[1, 2]) 52 | divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom) 53 | divisor = T.maximum(divisor, threshold) 54 | 55 | new_X = centered_X / divisor 56 | new_X = new_X.dimshuffle(0, 2, 3, 1) 57 | new_X = new_X.flatten(ndim=3) 58 | 59 | f = theano.function([X], new_X) 60 | return f 61 | # return f(input) 62 | 63 | 64 | def make_lecun_lcn(input_shape, img_shape, kernel_shape, threshold=1e-4): 65 | """ 66 | lecun local contrast normalization 67 | :param input_shape: (batch_size, stack_size, nb_row, nb_col) 68 | :param img_shape: (nb_row, nb_col) image dimensions 69 | :param kernel_shape: kernel shape of image eg: 9x9 70 | :param threshold: threshold to allow enhance of edges 71 | :return: theano function that computes the local contrast normalized image 72 | """ 73 | X = T.matrix(dtype=theano.config.floatX) 74 | X = X.reshape(input_shape) 75 | 76 | filter_shape = (1, 1, kernel_shape, kernel_shape) 77 | filters = gaussian_filter(kernel_shape).reshape(filter_shape) 78 | 79 | convout = conv.conv2d(input=X, 80 | filters=filters, 81 | image_shape=(input_shape[0], 1, img_shape[0], img_shape[1]), 82 | filter_shape=filter_shape, 83 | border_mode='full') 84 | 85 | # For each pixel, remove mean of 9x9 neighborhood 86 | mid = int(np.floor(kernel_shape / 2.)) 87 | centered_X = X - convout[:, :, mid:-mid, mid:-mid] 88 | 89 | # Scale down norm of 9x9 patch if norm is bigger than 1 90 | sum_sqr_XX = conv.conv2d(input=centered_X ** 2, 91 | filters=filters, 92 | image_shape=(input_shape[0], 1, img_shape[0], img_shape[1]), 93 | filter_shape=filter_shape, 94 | border_mode='full') 95 | 96 | denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid]) 97 | per_img_mean = denom.mean(axis=[1, 2]) 98 | divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom) 99 | divisor = T.maximum(divisor, threshold) 100 | 101 | new_X = centered_X / divisor 102 | new_X = new_X.dimshuffle(0, 2, 3, 1) 103 | new_X = new_X.flatten(ndim=3) 104 | 105 | f = theano.function([X], new_X) 106 | return f 107 | 108 | 109 | if __name__ == '__main__': 110 | theano.config.floatX = 'float32' 111 | x_img = plt.imread("../avletters/data/diff.png") # change as needed 112 | 113 | # x_img = x_img.reshape(1, x_img.shape[0], x_img.shape[1], x_img.shape[2]).astype('float32') 114 | x_img = 
x_img.reshape(1, x_img.shape[0], x_img.shape[1], x_img.shape[2]).astype('float32') 115 | lcn = make_lecun_lcn((1, 1, x_img.shape[1], x_img.shape[2]), (x_img.shape[1], x_img.shape[2]), 7, threshold=10) 116 | for d in range(3): 117 | x_img[:, :, :, d] = lcn(x_img[:, :, :, d].reshape((1,1,x_img.shape[1], x_img.shape[2]))) 118 | x_img = x_img[0] 119 | # plt.imshow(x_img, cmap='gray') 120 | # plt.show() 121 | 122 | pylab.gray() 123 | pylab.axis('off') 124 | pylab.imshow(x_img) 125 | pylab.show() -------------------------------------------------------------------------------- /oulu/landmarking.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import warnings 3 | import numpy as np 4 | import os, errno, glob 5 | import csv 6 | 7 | import menpo.io as mio 8 | from menpo.visualize import print_progress 9 | from menpo.feature import igo, fast_dsift 10 | from menpo.landmark import labeller, face_ibug_68_to_face_ibug_68 11 | from menpodetect.dlib import load_dlib_frontal_face_detector 12 | from menpofit.dlib import DlibWrapper 13 | from menpofit.aam import HolisticAAM, LucasKanadeAAMFitter, ModifiedAlternatingInverseCompositional 14 | from menpowidgets import visualize_images 15 | 16 | # constants, change according to system 17 | OULU_DIR = '/Volumes/Alienware 5/Thesis/ouluvs2-missing-vid/' 18 | FACE_MODEL_PATH = '../config/shape_predictor_68_face_landmarks.dat' 19 | EXT = ['.mp4', '.mov', '.mpg'] 20 | 21 | 22 | def find_all_videos(dir, ext=EXT, relpath=False): 23 | videofiles = [] 24 | find_all_videos_impl(dir, videofiles, ext) 25 | if relpath: 26 | for i, f in enumerate(videofiles): 27 | videofiles[i] = f[len(dir) + 1:] 28 | return videofiles 29 | 30 | 31 | def find_all_videos_impl(dir, videofiles, ext): 32 | files = os.listdir(dir) 33 | for f in files: 34 | path = os.path.join(dir, f) 35 | if os.path.isdir(path): 36 | find_all_videos_impl(path, videofiles, ext) 37 | elif os.path.splitext(f)[1] in ext: 38 | videofiles.append(path) 39 | 40 | 41 | def is_video(file, ext=EXT): 42 | return os.path.splitext(file)[1] in ext 43 | 44 | 45 | def fit_image(image): 46 | # Face detection 47 | bboxes = fit_image.detect(image, image_diagonal=1000) 48 | 49 | # Check if at least one face was detected, otherwise throw a warning 50 | if len(bboxes) > 0: 51 | # Use the first bounding box (the most probable to represent a face) to initialise 52 | fitting_result = fit_image.fitter.fit_from_bb(image, bboxes[0]) 53 | 54 | # Assign shape on the image 55 | image.landmarks['final_shape'] = fitting_result.final_shape 56 | else: 57 | # Throw warning if no face was detected 58 | warnings.warn('No face detected') 59 | 60 | # Return the image 61 | return image 62 | 63 | 64 | def create_dir(dir): 65 | if not os.path.exists(dir): 66 | try: 67 | os.makedirs(dir) 68 | except OSError as exc: # Guard against race condition 69 | if exc.errno != errno.EEXIST: 70 | raise 71 | 72 | 73 | def fill_row(outwriter, frame_no, row): 74 | outwriter.writerow([frame_no] + row) 75 | 76 | 77 | def process_video(file, dest): 78 | if is_video(file): 79 | create_dir(os.path.dirname(dest)) 80 | frames = mio.import_video(file, normalise=False) 81 | print('{} contains {} frames'.format(file, len(frames))) 82 | print('writing landmarks to {}...'.format(dest)) 83 | frames = frames.map(fit_image) 84 | with open(dest, 'w') as outputfile: 85 | outwriter = csv.writer(outputfile) 86 | try: 87 | for i, frame in enumerate(print_progress(frames)): 88 | if 'final_shape' not in 
frame.landmarks:
89 |                     warnings.warn('no face detected in frame {}, '
90 |                                   'initializing landmarks to -1s...'.format(i))
91 |                     # dlib does not support fitting from a previous initial shape, so
92 |                     # leave the entire row as -1s
93 |                     # initial_shape = frames[i - 1].landmarks['final_shape'].lms
94 |                     # fitting_result = fit_image.fitter.fit_from_shape(frame, initial_shape)
95 |                     # frame.landmarks['final_shape'] = fitting_result.final_shape
96 |                     landmarks = [-1] * 136
97 |                 else:
98 |                     lmg = frame.landmarks['final_shape']
99 |                     landmarks = lmg['all'].points.reshape((136,)).tolist()  # flatten 68 (x, y) points to 136 values
100 |                 fill_row(outwriter, i, landmarks)
101 |             except Exception as e:
102 |                 warnings.warn('Runtime Error at frame {}'.format(i))
103 |                 print('initializing landmarks to -1s...')
104 |                 fill_row(outwriter, i, [-1] * 136)
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     print('Generating Landmarks for OULU Dataset...')
109 |     # use a file list instead to control which files to process
110 |     # process only the frontal faces v1
111 |     files = glob.glob(os.path.join(OULU_DIR, 'orig', 's[0-9]*_v1_u[0-9]*.mp4'))
112 |     files.sort()
113 |     fit_image.detect = load_dlib_frontal_face_detector()
114 |     fit_image.fitter = DlibWrapper(FACE_MODEL_PATH)
115 |     # files = files[3200:]  # modify to adjust what to process
116 |     for i, video in enumerate(files):
117 |         print('[{}/{}] - '.format(i + 1, len(files)), end='')
118 |         basename = os.path.basename(video)
119 |         landmarkfile = os.path.splitext(basename)[0] + '.csv'
120 |         process_video(video,
121 |                       os.path.join(OULU_DIR, 'landmarks', landmarkfile))
122 |     print('All Done!')
123 | 
--------------------------------------------------------------------------------
/modelzoo/adenet_v1.py:
--------------------------------------------------------------------------------
1 | import theano.tensor as T
2 | 
3 | import lasagne as las
4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer
5 | from lasagne.layers import Gate, DropoutLayer, BatchNormLayer
6 | from lasagne.nonlinearities import tanh, sigmoid, linear
7 | 
8 | from custom.layers import DeltaLayer
9 | 
10 | 
11 | def create_pretrained_encoder(weights, biases, incoming):
12 |     l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1')
13 |     l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2')
14 |     l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3')
15 |     l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck')
16 |     return l_4
17 | 
18 | 
19 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name):
20 | 
21 |     if cell_parameters is None:
22 |         cell_parameters = Gate()
23 |     if gate_parameters is None:
24 |         gate_parameters = Gate()
25 | 
26 |     l_lstm = LSTMLayer(
27 |         l_incoming, hidden_units,
28 |         # We need to specify a separate input for masks
29 |         mask_input=l_mask,
30 |         # Here, we supply the gate parameters for each gate
31 |         ingate=gate_parameters, forgetgate=gate_parameters,
32 |         cell=cell_parameters, outgate=gate_parameters,
33 |         # We'll learn the initialization and use gradient clipping
34 |         learn_init=True, grad_clipping=5., name='f_{}'.format(name))
35 | 
36 |     # The "backwards" layer is the same as the first,
37 |     # except that the backwards argument is set to True.
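    # (callers merge the two directions afterwards, e.g. with an ElemwiseSumLayer)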
38 | l_lstm_back = LSTMLayer( 39 | l_incoming, hidden_units, ingate=gate_parameters, 40 | mask_input=l_mask, forgetgate=gate_parameters, 41 | cell=cell_parameters, outgate=gate_parameters, 42 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 43 | 44 | return l_lstm, l_lstm_back 45 | 46 | 47 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 48 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta)'), 49 | output_classes=26): 50 | 51 | dbn_layers = dbn.get_all_layers() 52 | weights = [] 53 | biases = [] 54 | weights.append(dbn_layers[1].W.astype('float32')) 55 | weights.append(dbn_layers[2].W.astype('float32')) 56 | weights.append(dbn_layers[3].W.astype('float32')) 57 | weights.append(dbn_layers[4].W.astype('float32')) 58 | biases.append(dbn_layers[1].b.astype('float32')) 59 | biases.append(dbn_layers[2].b.astype('float32')) 60 | biases.append(dbn_layers[3].b.astype('float32')) 61 | biases.append(dbn_layers[4].b.astype('float32')) 62 | 63 | gate_parameters = Gate( 64 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 65 | b=las.init.Constant(0.)) 66 | cell_parameters = Gate( 67 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 68 | # Setting W_cell to None denotes that no cell connection will be used. 69 | W_cell=None, b=las.init.Constant(0.), 70 | # By convention, the cell nonlinearity is tanh in an LSTM. 71 | nonlinearity=tanh) 72 | 73 | l_in = InputLayer(input_shape, input_var, 'input') 74 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 75 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 76 | 77 | symbolic_batchsize = l_in.input_var.shape[0] 78 | symbolic_seqlen = l_in.input_var.shape[1] 79 | 80 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 81 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 82 | l_encoder_bn = BatchNormLayer(l_encoder, name='batchnorm1') 83 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 84 | l_reshape2 = ReshapeLayer(l_encoder_bn, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 85 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 86 | 87 | l_concat = ConcatLayer([l_delta, l_dct], axis=2, name='concat') 88 | 89 | l_lstm, l_lstm_back = create_blstm(l_concat, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1') 90 | 91 | # We'll combine the forward and backward layer output by summing. 92 | # Merge layers take in lists of layers to merge as input. 93 | l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1') 94 | 95 | l_lstm2, l_lstm2_back = create_blstm(l_sum1, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm2') 96 | 97 | # We'll combine the forward and backward layer output by summing. 98 | # Merge layers take in lists of layers to merge as input. 99 | l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back]) 100 | 101 | l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1') 102 | 103 | # Now, we can apply feed-forward layers as usual. 104 | # We want the network to predict a classification for the sequence, 105 | # so we'll use a the number of classes. 
106 | l_out = DenseLayer( 107 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 108 | 109 | return l_out, l_concat 110 | -------------------------------------------------------------------------------- /modelzoo/adenet_v1_1.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, BatchNormLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | from custom.layers import DeltaLayer 9 | 10 | 11 | def create_pretrained_encoder(weights, biases, incoming): 12 | l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1') 13 | l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2') 14 | l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3') 15 | l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck') 16 | return l_4 17 | 18 | 19 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 20 | 21 | if cell_parameters is None: 22 | cell_parameters = Gate() 23 | if gate_parameters is None: 24 | gate_parameters = Gate() 25 | 26 | l_lstm = LSTMLayer( 27 | l_incoming, hidden_units, 28 | # We need to specify a separate input for masks 29 | mask_input=l_mask, 30 | # Here, we supply the gate parameters for each gate 31 | ingate=gate_parameters, forgetgate=gate_parameters, 32 | cell=cell_parameters, outgate=gate_parameters, 33 | # We'll learn the initialization and use gradient clipping 34 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 35 | 36 | # The "backwards" layer is the same as the first, 37 | # except that the backwards argument is set to True. 38 | l_lstm_back = LSTMLayer( 39 | l_incoming, hidden_units, ingate=gate_parameters, 40 | mask_input=l_mask, forgetgate=gate_parameters, 41 | cell=cell_parameters, outgate=gate_parameters, 42 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 43 | 44 | return l_lstm, l_lstm_back 45 | 46 | 47 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 48 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta)'), 49 | output_classes=26): 50 | 51 | dbn_layers = dbn.get_all_layers() 52 | weights = [] 53 | biases = [] 54 | weights.append(dbn_layers[1].W.astype('float32')) 55 | weights.append(dbn_layers[2].W.astype('float32')) 56 | weights.append(dbn_layers[3].W.astype('float32')) 57 | weights.append(dbn_layers[4].W.astype('float32')) 58 | biases.append(dbn_layers[1].b.astype('float32')) 59 | biases.append(dbn_layers[2].b.astype('float32')) 60 | biases.append(dbn_layers[3].b.astype('float32')) 61 | biases.append(dbn_layers[4].b.astype('float32')) 62 | 63 | gate_parameters = Gate( 64 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 65 | b=las.init.Constant(0.)) 66 | cell_parameters = Gate( 67 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 68 | # Setting W_cell to None denotes that no cell connection will be used. 69 | W_cell=None, b=las.init.Constant(0.), 70 | # By convention, the cell nonlinearity is tanh in an LSTM. 
71 | nonlinearity=tanh) 72 | 73 | l_in = InputLayer(input_shape, input_var, 'input') 74 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 75 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 76 | 77 | symbolic_batchsize = l_in.input_var.shape[0] 78 | symbolic_seqlen = l_in.input_var.shape[1] 79 | 80 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 81 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 82 | l_encoder_bn = BatchNormLayer(l_encoder, name='batchnorm1') 83 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 84 | l_reshape2 = ReshapeLayer(l_encoder_bn, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 85 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 86 | 87 | l_concat = ConcatLayer([l_delta, l_dct], axis=2, name='concat') 88 | 89 | l_dropout1 = DropoutLayer(l_concat, name='dropout1') 90 | 91 | l_lstm, l_lstm_back = create_blstm(l_dropout1, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm1') 92 | 93 | # We'll combine the forward and backward layer output by summing. 94 | # Merge layers take in lists of layers to merge as input. 95 | l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1') 96 | 97 | # implement drop-out regularization 98 | l_dropout2 = DropoutLayer(l_sum1, name='dropout2') 99 | 100 | l_lstm2, l_lstm2_back = create_blstm(l_dropout2, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm2') 101 | 102 | # We'll combine the forward and backward layer output by summing. 103 | # Merge layers take in lists of layers to merge as input. 104 | l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back]) 105 | 106 | l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1') 107 | 108 | # Now, we can apply feed-forward layers as usual. 109 | # We want the network to predict a classification for the sequence, 110 | # so we'll use a the number of classes. 
111 | l_out = DenseLayer( 112 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 113 | 114 | return l_out 115 | -------------------------------------------------------------------------------- /modelzoo/baseline_end2end.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | 9 | def create_pretrained_encoder(weights, biases, incoming): 10 | l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1') 11 | l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2') 12 | l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3') 13 | l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck') 14 | return l_4 15 | 16 | 17 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 18 | if cell_parameters is None: 19 | cell_parameters = Gate() 20 | if gate_parameters is None: 21 | gate_parameters = Gate() 22 | 23 | l_lstm = LSTMLayer( 24 | l_incoming, hidden_units, 25 | # We need to specify a separate input for masks 26 | mask_input=l_mask, 27 | # Here, we supply the gate parameters for each gate 28 | ingate=gate_parameters, forgetgate=gate_parameters, 29 | cell=cell_parameters, outgate=gate_parameters, 30 | # We'll learn the initialization and use gradient clipping 31 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 32 | 33 | return l_lstm 34 | 35 | 36 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 37 | 38 | if cell_parameters is None: 39 | cell_parameters = Gate() 40 | if gate_parameters is None: 41 | gate_parameters = Gate() 42 | 43 | l_lstm = LSTMLayer( 44 | l_incoming, hidden_units, 45 | # We need to specify a separate input for masks 46 | mask_input=l_mask, 47 | # Here, we supply the gate parameters for each gate 48 | ingate=gate_parameters, forgetgate=gate_parameters, 49 | cell=cell_parameters, outgate=gate_parameters, 50 | # We'll learn the initialization and use gradient clipping 51 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 52 | 53 | # The "backwards" layer is the same as the first, 54 | # except that the backwards argument is set to True. 
55 | l_lstm_back = LSTMLayer( 56 | l_incoming, hidden_units, ingate=gate_parameters, 57 | mask_input=l_mask, forgetgate=gate_parameters, 58 | cell=cell_parameters, outgate=gate_parameters, 59 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 60 | 61 | return l_lstm, l_lstm_back 62 | 63 | 64 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 65 | lstm_size=250, output_classes=26): 66 | 67 | dbn_layers = dbn.get_all_layers() 68 | weights = [] 69 | biases = [] 70 | weights.append(dbn_layers[1].W.astype('float32')) 71 | weights.append(dbn_layers[2].W.astype('float32')) 72 | weights.append(dbn_layers[3].W.astype('float32')) 73 | weights.append(dbn_layers[4].W.astype('float32')) 74 | biases.append(dbn_layers[1].b.astype('float32')) 75 | biases.append(dbn_layers[2].b.astype('float32')) 76 | biases.append(dbn_layers[3].b.astype('float32')) 77 | biases.append(dbn_layers[4].b.astype('float32')) 78 | 79 | gate_parameters = Gate( 80 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 81 | b=las.init.Constant(0.)) 82 | cell_parameters = Gate( 83 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 84 | # Setting W_cell to None denotes that no cell connection will be used. 85 | W_cell=None, b=las.init.Constant(0.), 86 | # By convention, the cell nonlinearity is tanh in an LSTM. 87 | nonlinearity=tanh) 88 | 89 | l_in = InputLayer(input_shape, input_var, 'input') 90 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 91 | 92 | symbolic_batchsize = l_in.input_var.shape[0] 93 | symbolic_seqlen = l_in.input_var.shape[1] 94 | 95 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 96 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 97 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 98 | l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 99 | # l_delta = DeltaLayer(l_reshape2, win, name='delta') 100 | 101 | # l_lstm = create_lstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1') 102 | l_lstm, l_lstm_back = create_blstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1') 103 | 104 | # We'll combine the forward and backward layer output by summing. 105 | # Merge layers take in lists of layers to merge as input. 106 | l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1') 107 | 108 | l_forward_slice1 = SliceLayer(l_sum1, -1, 1, name='slice1') 109 | 110 | # Now, we can apply feed-forward layers as usual. 111 | # We want the network to predict a classification for the sequence, 112 | # so we'll use a the number of classes. 
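    # (the SliceLayer above keeps only the final timestep, so this dense layer
    # produces a single prediction per sequence)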
113 | l_out = DenseLayer( 114 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 115 | 116 | return l_out 117 | -------------------------------------------------------------------------------- /modelzoo/avnet.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer, create_blstm 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def extract_weights(ae): 13 | weights = [] 14 | biases = [] 15 | shapes = [2000, 1000, 500, 50] 16 | nonlinearities = [rectify, rectify, rectify, linear] 17 | ae_layers = ae.get_all_layers() 18 | weights.append(ae_layers[1].W.astype('float32')) 19 | weights.append(ae_layers[2].W.astype('float32')) 20 | weights.append(ae_layers[3].W.astype('float32')) 21 | weights.append(ae_layers[4].W.astype('float32')) 22 | biases.append(ae_layers[1].b.astype('float32')) 23 | biases.append(ae_layers[2].b.astype('float32')) 24 | biases.append(ae_layers[3].b.astype('float32')) 25 | biases.append(ae_layers[4].b.astype('float32')) 26 | 27 | return weights, biases, shapes, nonlinearities 28 | 29 | 30 | def create_pretrained_substream(weights, biases, input_shape, input_var, mask_shape, mask_var, name, 31 | lstm_size=250, win=T.iscalar('theta'), nonlinearity=rectify, 32 | w_init_fn=las.init.Orthogonal(), use_peepholes=True): 33 | gate_parameters = Gate( 34 | W_in=w_init_fn, W_hid=w_init_fn, 35 | b=las.init.Constant(0.)) 36 | cell_parameters = Gate( 37 | W_in=w_init_fn, W_hid=w_init_fn, 38 | # Setting W_cell to None denotes that no cell connection will be used. 39 | W_cell=None, b=las.init.Constant(0.), 40 | # By convention, the cell nonlinearity is tanh in an LSTM. 
41 | nonlinearity=tanh) 42 | 43 | l_input = InputLayer(input_shape, input_var, 'input_'+name) 44 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 45 | 46 | symbolic_batchsize_raw = l_input.input_var.shape[0] 47 | symbolic_seqlen_raw = l_input.input_var.shape[1] 48 | 49 | l_reshape1_raw = ReshapeLayer(l_input, (-1, input_shape[-1]), name='reshape1_'+name) 50 | l_encoder_raw = create_pretrained_encoder(l_reshape1_raw, weights, biases, 51 | [2000, 1000, 500, 50], 52 | [nonlinearity, nonlinearity, nonlinearity, linear], 53 | ['fc1_'+name, 'fc2_'+name, 'fc3_'+name, 'bottleneck_'+name]) 54 | input_len = las.layers.get_output_shape(l_encoder_raw)[-1] 55 | 56 | l_reshape2 = ReshapeLayer(l_encoder_raw, 57 | (symbolic_batchsize_raw, symbolic_seqlen_raw, input_len), 58 | name='reshape2_'+name) 59 | l_delta = DeltaLayer(l_reshape2, win, name='delta_'+name) 60 | 61 | l_lstm = LSTMLayer( 62 | l_delta, int(lstm_size), peepholes=use_peepholes, 63 | # We need to specify a separate input for masks 64 | mask_input=l_mask, 65 | # Here, we supply the gate parameters for each gate 66 | ingate=gate_parameters, forgetgate=gate_parameters, 67 | cell=cell_parameters, outgate=gate_parameters, 68 | # We'll learn the initialization and use gradient clipping 69 | learn_init=True, grad_clipping=5., name='lstm_'+name) 70 | 71 | return l_lstm 72 | 73 | 74 | def create_model(substreams, mask_shape, mask_var, lstm_size=250, output_classes=26, 75 | fusiontype='concat', w_init_fn=las.init.Orthogonal(), use_peepholes=True): 76 | 77 | gate_parameters = Gate( 78 | W_in=w_init_fn, W_hid=w_init_fn, 79 | b=las.init.Constant(0.)) 80 | cell_parameters = Gate( 81 | W_in=w_init_fn, W_hid=w_init_fn, 82 | # Setting W_cell to None denotes that no cell connection will be used. 83 | W_cell=None, b=las.init.Constant(0.), 84 | # By convention, the cell nonlinearity is tanh in an LSTM. 85 | nonlinearity=tanh) 86 | 87 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 88 | symbolic_seqlen_raw = l_mask.input_var.shape[1] 89 | 90 | # We'll combine the forward and backward layer output by summing. 91 | # Merge layers take in lists of layers to merge as input. 92 | if fusiontype == 'adasum': 93 | l_fuse = AdaptiveElemwiseSumLayer(substreams, name='adasum1') 94 | elif fusiontype == 'sum': 95 | l_fuse = ElemwiseSumLayer(substreams, name='sum1') 96 | elif fusiontype == 'concat': 97 | l_fuse = ConcatLayer(substreams, axis=-1, name='concat') 98 | 99 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 100 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 101 | 102 | # reshape to (num_examples * seq_len, lstm_size) 103 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 104 | 105 | # Now, we can apply feed-forward layers as usual. 106 | # We want the network to predict a classification for the sequence, 107 | # so we'll use a the number of classes. 
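    # Note: unlike the single-prediction models, this head emits a softmax at
    # every timestep; the ReshapeLayer below restores the
    # (batch, seq_len, classes) layout.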
108 | l_softmax = DenseLayer( 109 | l_reshape3, num_units=output_classes, 110 | nonlinearity=las.nonlinearities.softmax, name='softmax') 111 | 112 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_raw, output_classes), name='output') 113 | 114 | return l_out, l_fuse 115 | -------------------------------------------------------------------------------- /oulu/ae_finetuner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../') 4 | import os 5 | import time 6 | import pickle 7 | import ConfigParser 8 | 9 | import theano.tensor as T 10 | import theano 11 | 12 | import matplotlib 13 | # matplotlib.use('Agg') # Change matplotlib backend, in case we have no X server running.. 14 | 15 | from utils.preprocessing import * 16 | from utils.plotting_utils import * 17 | from utils.io import * 18 | 19 | import numpy as np 20 | from lasagne.layers import InputLayer, DenseLayer 21 | from lasagne.nonlinearities import tanh, linear, sigmoid, rectify, leaky_rectify 22 | from lasagne.updates import nesterov_momentum, adadelta, sgd, norm_constraint 23 | from lasagne.objectives import squared_error 24 | from nolearn.lasagne import NeuralNet 25 | 26 | 27 | def configure_theano(): 28 | theano.config.floatX = 'float32' 29 | sys.setrecursionlimit(10000) 30 | 31 | 32 | def load_ae(path, train_params, nonlinearity=sigmoid): 33 | """ 34 | load a pretrained dbn from path 35 | :param path: path to the .mat dbn 36 | :return: pretrained deep belief network 37 | """ 38 | # create the network using weights from pretrain_nn.mat 39 | nn = sio.loadmat(path) 40 | w1 = nn['w1'] 41 | w2 = nn['w2'] 42 | w3 = nn['w3'] 43 | w4 = nn['w4'] 44 | w5 = nn['w5'] 45 | w6 = nn['w6'] 46 | w7 = nn['w7'] 47 | w8 = nn['w8'] 48 | b1 = nn['b1'][0] 49 | b2 = nn['b2'][0] 50 | b3 = nn['b3'][0] 51 | b4 = nn['b4'][0] 52 | b5 = nn['b5'][0] 53 | b6 = nn['b6'][0] 54 | b7 = nn['b7'][0] 55 | b8 = nn['b8'][0] 56 | 57 | layers = [ 58 | (InputLayer, {'name': 'input', 'shape': (None, 1144)}), 59 | (DenseLayer, {'name': 'l1', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w1, 'b': b1}), 60 | (DenseLayer, {'name': 'l2', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w2, 'b': b2}), 61 | (DenseLayer, {'name': 'l3', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w3, 'b': b3}), 62 | (DenseLayer, {'name': 'l4', 'num_units': 50, 'nonlinearity': linear, 'W': w4, 'b': b4}), 63 | (DenseLayer, {'name': 'l5', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w5, 'b': b5}), 64 | (DenseLayer, {'name': 'l6', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w6, 'b': b6}), 65 | (DenseLayer, {'name': 'l7', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w7, 'b': b7}), 66 | (DenseLayer, {'name': 'output', 'num_units': 1144, 'nonlinearity': linear, 'W': w8, 'b': b8}), 67 | ] 68 | 69 | ''' 70 | dbn = NeuralNet( 71 | layers=layers, 72 | max_epochs=30, 73 | objective_loss_function=squared_error, 74 | update=nesterov_momentum, 75 | regression=True, 76 | verbose=1, 77 | update_learning_rate=0.001, 78 | update_momentum=0.05, 79 | objective_l2=0.005, 80 | ) 81 | ''' 82 | 83 | dbn = NeuralNet( 84 | layers=layers, 85 | max_epochs=10, 86 | objective_loss_function=squared_error, 87 | update=adadelta, 88 | regression=True, 89 | verbose=1, 90 | update_learning_rate=0.01, 91 | # update_learning_rate=0.001, 92 | # update_momentum=0.05, 93 | objective_l2=0.005, 94 | ) 95 | return dbn 96 | 97 | 98 | def main(): 99 | configure_theano() 100 | 
config_file = 'config/finetuner.ini' 101 | config = ConfigParser.ConfigParser() 102 | config.read(config_file) 103 | print('loading config file: {}'.format(config_file)) 104 | 105 | print('preprocessing dataset...') 106 | data = load_mat_file(config.get('data', 'images')) 107 | ae_pretrained = config.get('models', 'pretrained') 108 | ae_finetuned = config.get('models', 'finetuned') 109 | do_finetune = config.getboolean('training', 'do_finetune') 110 | save_finetune = config.getboolean('training', 'save_finetune') 111 | load_finetune = config.getboolean('training', 'load_finetune') 112 | train_params = dict() 113 | train_params['max_epochs'] = config.getint('training', 'max_epochs') 114 | train_params['learning_rate'] = config.getfloat('training', 'learning_rate') 115 | train_params['objective_l2'] = config.getfloat('training', 'objective_l2') 116 | 117 | # create the necessary variable mappings 118 | data_matrix = data['dataMatrix'].astype('float32') 119 | data_matrix_len = data_matrix.shape[0] 120 | vid_len_vec = data['videoLengthVec'] 121 | iter_vec = data['iterVec'] 122 | 123 | indexes = create_split_index(data_matrix_len, vid_len_vec, iter_vec) 124 | train_vidlen_vec, test_vidlen_vec = split_videolen(vid_len_vec, iter_vec) 125 | 126 | data_matrix = normalize_input(data_matrix) 127 | 128 | # split the data 129 | train_data = data_matrix[indexes == True] 130 | test_data = data_matrix[indexes == False] 131 | 132 | if do_finetune: 133 | print('performing finetuning...') 134 | ae = load_ae(ae_pretrained, train_params, nonlinearity=rectify) 135 | ae.initialize() 136 | # ae.fit(train_data, train_data) 137 | res = ae.predict(test_data) 138 | # print(res.shape) 139 | visualize_reconstruction(test_data[300:336], res[300:336], shape=(26, 44)) 140 | 141 | if save_finetune: 142 | print('saving finetuned encoder: {}...'.format(ae_finetuned)) 143 | pickle.dump(ae, open(ae_finetuned, 'wb')) 144 | 145 | if load_finetune: 146 | print('loading finetuned encoder: {}'.format(ae_finetuned)) 147 | ae = load_ae(ae_pretrained, train_params) 148 | # ae = pickle.load(open(ae_finetuned, 'rb')) 149 | ae.initialize() 150 | print('performing prediction...') 151 | res = ae.predict(test_data) 152 | visualize_reconstruction(test_data[300:336], res[300:336]) 153 | print('done!') 154 | 155 | 156 | if __name__ == '__main__': 157 | main() 158 | -------------------------------------------------------------------------------- /avletters/ae_finetuner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../') 4 | import os 5 | import time 6 | import pickle 7 | import ConfigParser 8 | 9 | import theano.tensor as T 10 | import theano 11 | 12 | import matplotlib 13 | # matplotlib.use('Agg') # Change matplotlib backend, in case we have no X server running.. 
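# NOTE: this script mirrors oulu/ae_finetuner.py; the chief difference is the
# ROI dimensionality (1200 inputs for AVLetters vs 1144 for OuluVS).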
14 | 15 | from utils.preprocessing import * 16 | from utils.plotting_utils import * 17 | from utils.io import * 18 | 19 | import numpy as np 20 | from lasagne.layers import InputLayer, DenseLayer 21 | from lasagne.nonlinearities import tanh, linear, sigmoid, rectify, leaky_rectify 22 | from lasagne.updates import nesterov_momentum, adadelta, sgd, norm_constraint 23 | from lasagne.objectives import squared_error 24 | from nolearn.lasagne import NeuralNet 25 | 26 | 27 | def configure_theano(): 28 | theano.config.floatX = 'float32' 29 | sys.setrecursionlimit(10000) 30 | 31 | 32 | def load_ae(path, train_params, nonlinearity=sigmoid): 33 | """ 34 | load a pretrained dbn from path 35 | :param path: path to the .mat dbn 36 | :return: pretrained deep belief network 37 | """ 38 | # create the network using weights from pretrain_nn.mat 39 | nn = sio.loadmat(path) 40 | w1 = nn['w1'] 41 | w2 = nn['w2'] 42 | w3 = nn['w3'] 43 | w4 = nn['w4'] 44 | w5 = nn['w5'] 45 | w6 = nn['w6'] 46 | w7 = nn['w7'] 47 | w8 = nn['w8'] 48 | b1 = nn['b1'][0] 49 | b2 = nn['b2'][0] 50 | b3 = nn['b3'][0] 51 | b4 = nn['b4'][0] 52 | b5 = nn['b5'][0] 53 | b6 = nn['b6'][0] 54 | b7 = nn['b7'][0] 55 | b8 = nn['b8'][0] 56 | 57 | layers = [ 58 | (InputLayer, {'name': 'input', 'shape': (None, 1200)}), 59 | (DenseLayer, {'name': 'l1', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w1, 'b': b1}), 60 | (DenseLayer, {'name': 'l2', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w2, 'b': b2}), 61 | (DenseLayer, {'name': 'l3', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w3, 'b': b3}), 62 | (DenseLayer, {'name': 'l4', 'num_units': 50, 'nonlinearity': linear, 'W': w4, 'b': b4}), 63 | (DenseLayer, {'name': 'l5', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w5, 'b': b5}), 64 | (DenseLayer, {'name': 'l6', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w6, 'b': b6}), 65 | (DenseLayer, {'name': 'l7', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w7, 'b': b7}), 66 | (DenseLayer, {'name': 'output', 'num_units': 1200, 'nonlinearity': linear, 'W': w8, 'b': b8}), 67 | ] 68 | 69 | ''' 70 | dbn = NeuralNet( 71 | layers=layers, 72 | max_epochs=30, 73 | objective_loss_function=squared_error, 74 | update=nesterov_momentum, 75 | regression=True, 76 | verbose=1, 77 | update_learning_rate=0.001, 78 | update_momentum=0.05, 79 | objective_l2=0.005, 80 | ) 81 | ''' 82 | 83 | dbn = NeuralNet( 84 | layers=layers, 85 | max_epochs=10, 86 | objective_loss_function=squared_error, 87 | update=adadelta, 88 | regression=True, 89 | verbose=1, 90 | update_learning_rate=0.01, 91 | # update_learning_rate=0.001, 92 | # update_momentum=0.05, 93 | objective_l2=0.005, 94 | ) 95 | return dbn 96 | 97 | 98 | def main(): 99 | configure_theano() 100 | config_file = 'config/finetuner.ini' 101 | config = ConfigParser.ConfigParser() 102 | config.read(config_file) 103 | print('loading config file: {}'.format(config_file)) 104 | 105 | print('preprocessing dataset...') 106 | data = load_mat_file(config.get('data', 'images')) 107 | ae_pretrained = config.get('models', 'pretrained') 108 | ae_finetuned = config.get('models', 'finetuned') 109 | do_finetune = config.getboolean('training', 'do_finetune') 110 | save_finetune = config.getboolean('training', 'save_finetune') 111 | load_finetune = config.getboolean('training', 'load_finetune') 112 | train_params = dict() 113 | train_params['max_epochs'] = config.getint('training', 'max_epochs') 114 | train_params['learning_rate'] = config.getfloat('training', 'learning_rate') 115 | 
train_params['objective_l2'] = config.getfloat('training', 'objective_l2') 116 | 117 | # create the necessary variable mappings 118 | data_matrix = data['dataMatrix'].astype('float32') 119 | data_matrix_len = data_matrix.shape[0] 120 | vid_len_vec = data['videoLengthVec'] 121 | iter_vec = data['iterVec'] 122 | 123 | indexes = create_split_index(data_matrix_len, vid_len_vec, iter_vec) 124 | train_vidlen_vec, test_vidlen_vec = split_videolen(vid_len_vec, iter_vec) 125 | assert len(train_vidlen_vec) == 520 126 | assert len(test_vidlen_vec) == 260 127 | assert np.sum(vid_len_vec) == data_matrix_len 128 | 129 | data_matrix = normalize_input(data_matrix) 130 | 131 | # split the data 132 | train_data = data_matrix[indexes == True] 133 | test_data = data_matrix[indexes == False] 134 | 135 | if do_finetune: 136 | print('performing finetuning...') 137 | ae = load_ae(ae_pretrained, train_params, rectify) 138 | ae.initialize() 139 | #ae.fit(train_data, train_data) 140 | res = ae.predict(test_data) 141 | print(res.shape) 142 | visualize_reconstruction(test_data[300:336], res[300:336]) 143 | 144 | if save_finetune: 145 | print('saving finetuned encoder: {}...'.format(ae_finetuned)) 146 | pickle.dump(ae, open(ae_finetuned, 'wb')) 147 | 148 | if load_finetune: 149 | print('loading finetuned encoder: {}'.format(ae_finetuned)) 150 | ae = load_ae(ae_pretrained, train_params) 151 | # ae = pickle.load(open(ae_finetuned, 'rb')) 152 | ae.initialize() 153 | print('performing prediction...') 154 | res = ae.predict(test_data) 155 | visualize_reconstruction(test_data[300:336], res[300:336]) 156 | print('done!') 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /utils/draw_net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions to create network diagrams from a list of Layers. 3 | 4 | Examples: 5 | 6 | Draw a minimal diagram to a pdf file: 7 | layers = lasagne.layers.get_all_layers(output_layer) 8 | draw_to_file(layers, 'network.pdf', output_shape=False) 9 | 10 | Draw a verbose diagram in an IPython notebook: 11 | from IPython.display import Image #needed to render in notebook 12 | 13 | layers = lasagne.layers.get_all_layers(output_layer) 14 | dot = get_pydot_graph(layers, verbose=True) 15 | return Image(dot.create_png()) 16 | """ 17 | 18 | import pydot 19 | import lasagne.layers 20 | 21 | 22 | def get_hex_color(layer_type): 23 | """ 24 | Determines the hex color for a layer. Some classes are given 25 | default values, all others are calculated pseudorandomly 26 | from their name. 27 | :parameters: 28 | - layer_type : string 29 | Class name of the layer 30 | 31 | :returns: 32 | - color : string containing a hex color. 
33 | 
34 |     :usage:
35 |         >>> get_hex_color('MaxPool2DDNN')
36 |         '#9D9DD2'
37 |     """
38 | 
39 |     if 'Input' in layer_type:
40 |         return '#A2CECE'
41 |     if 'Conv' in layer_type:
42 |         return '#7C9ABB'
43 |     if 'Dense' in layer_type:
44 |         return '#6CCF8D'
45 |     if 'Pool' in layer_type:
46 |         return '#9D9DD2'
47 |     if 'Slice' in layer_type:
48 |         return '#f6f930'
49 |     if 'LSTM' in layer_type:
50 |         return '#e06b04'
51 |     if 'Reshape' in layer_type:
52 |         return '#e3b029'
53 |     if 'Dropout' in layer_type:
54 |         return '#ffb2ea'
55 |     if 'Delta' in layer_type:
56 |         return '#d7b8ff'
57 |     else:
58 |         return '#{0:x}'.format(hash(layer_type) % 2**24)
59 | 
60 | 
61 | def get_pydot_graph(layers, output_shape=True, verbose=False):
62 |     """
63 |     Creates a PyDot graph of the network defined by the given layers.
64 |     :parameters:
65 |         - layers : list
66 |             List of the layers, as obtained from lasagne.layers.get_all_layers
67 |         - output_shape: (default `True`)
68 |             If `True`, the output shape of each layer will be displayed.
69 |         - verbose: (default `False`)
70 |             If `True`, layer attributes like filter shape, stride, etc.
71 |             will be displayed.
72 | 
73 |     :returns:
74 |         - pydot_graph : PyDot object containing the graph
75 | 
76 |     """
77 |     pydot_graph = pydot.Dot('Network', graph_type='digraph')
78 |     pydot_nodes = {}
79 |     pydot_edges = []
80 |     for i, layer in enumerate(layers):
81 |         layer_type = '{0}: {1}'.format(layer.__class__.__name__, layer.name)
82 |         key = repr(layer)
83 |         label = layer_type
84 |         color = get_hex_color(layer_type)
85 |         if verbose:
86 |             for attr in ['num_filters', 'num_units', 'ds',
87 |                          'filter_shape', 'stride', 'strides', 'p']:
88 |                 if hasattr(layer, attr):
89 |                     label += '\n' + \
90 |                         '{0}: {1}'.format(attr, getattr(layer, attr))
91 |             if hasattr(layer, 'nonlinearity'):
92 |                 try:
93 |                     nonlinearity = layer.nonlinearity.__name__
94 |                 except AttributeError:
95 |                     nonlinearity = layer.nonlinearity.__class__.__name__
96 |                 label += '\n' + 'nonlinearity: {0}'.format(nonlinearity)
97 | 
98 |         if output_shape:
99 |             output_shape = lasagne.layers.get_output_shape(layer)
100 |             if len(output_shape) == 3:
101 |                 output_shape_str = '(Batch Size, Seq Len, {})'.format(output_shape[-1])
102 |             if len(output_shape) == 2:
103 |                 output_shape_str = '(Batch Size x Seq Len, {})'.format(output_shape[-1])
104 |             if layer.name == 'mask':
105 |                 output_shape_str = '(Batch Size, Seq Len)'
106 |             label += '\n' + \
107 |                 'Output shape: {0}'.format(output_shape_str)
108 |         pydot_nodes[key] = pydot.Node(key,
109 |                                       label=label,
110 |                                       shape='record',
111 |                                       fillcolor=color,
112 |                                       style='filled',
113 |                                       )
114 | 
115 |         if hasattr(layer, 'input_layers'):
116 |             for input_layer in layer.input_layers:
117 |                 pydot_edges.append([repr(input_layer), key])
118 | 
119 |         if hasattr(layer, 'input_layer'):
120 |             pydot_edges.append([repr(layer.input_layer), key])
121 | 
122 |     for node in pydot_nodes.values():
123 |         pydot_graph.add_node(node)
124 |     for edge in pydot_edges:
125 |         pydot_graph.add_edge(
126 |             pydot.Edge(pydot_nodes[edge[0]], pydot_nodes[edge[1]]))
127 |     return pydot_graph
128 | 
129 | 
130 | def draw_to_file(layers, filename, **kwargs):
131 |     """
132 |     Draws a network diagram to a file
133 |     :parameters:
134 |         - layers : list
135 |             List of the layers, as obtained from lasagne.layers.get_all_layers
136 |         - filename: string
137 |             The filename to save output to.
138 |         - **kwargs: see docstring of get_pydot_graph for other options
139 |     """
140 |     dot = get_pydot_graph(layers, **kwargs)
141 | 
142 |     ext = filename[filename.rfind('.') + 1:]
143 |     with open(filename, 'w') as fid:
144 |         fid.write(dot.create(format=ext))
145 | 
146 | 
147 | def draw_to_notebook(layers, **kwargs):
148 |     """
149 |     Draws a network diagram in an IPython notebook
150 |     :parameters:
151 |         - layers : list
152 |             List of the layers, as obtained from lasagne.layers.get_all_layers
153 |         - **kwargs: see docstring of get_pydot_graph for other options
154 |     """
155 |     from IPython.display import Image  # needed to render in notebook
156 | 
157 |     dot = get_pydot_graph(layers, **kwargs)
158 |     return Image(dot.create_png())
--------------------------------------------------------------------------------
/modelzoo/adenet_3stream_dct.py:
--------------------------------------------------------------------------------
1 | import theano.tensor as T
2 | 
3 | import lasagne as las
4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer, ElemwiseSumLayer
5 | from lasagne.layers import Gate
6 | from lasagne.nonlinearities import tanh
7 | 
8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer, create_blstm
9 | from modelzoo.pretrained_encoder import create_pretrained_encoder
10 | 
11 | 
12 | def create_model(s1_ae, s2_ae, s1_shape, s1_var,
13 |                  s2_shape, s2_var,
14 |                  s3_shape, s3_var,
15 |                  mask_shape, mask_var,
16 |                  lstm_size=250, win=T.iscalar('theta'),
17 |                  output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(),
18 |                  use_peepholes=True):
19 | 
20 |     s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities = s1_ae
21 |     s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae
22 | 
23 |     gate_parameters = Gate(
24 |         W_in=w_init_fn, W_hid=w_init_fn,
25 |         b=las.init.Constant(0.))
26 |     cell_parameters = Gate(
27 |         W_in=w_init_fn, W_hid=w_init_fn,
28 |         # Setting W_cell to None denotes that no cell connection will be used.
29 |         W_cell=None, b=las.init.Constant(0.),
30 |         # By convention, the cell nonlinearity is tanh in an LSTM.
31 | nonlinearity=tanh) 32 | 33 | l_s1 = InputLayer(s1_shape, s1_var, 's1_im') 34 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 35 | l_s2 = InputLayer(s2_shape, s2_var, 's2_im') 36 | l_s3 = InputLayer(s3_shape, s3_var, 's3_im') 37 | 38 | symbolic_batchsize_s1 = l_s1.input_var.shape[0] 39 | symbolic_seqlen_s1 = l_s1.input_var.shape[1] 40 | symbolic_batchsize_s2 = l_s2.input_var.shape[0] 41 | symbolic_seqlen_s2 = l_s2.input_var.shape[1] 42 | 43 | l_reshape1_s1 = ReshapeLayer(l_s1, (-1, s1_shape[-1]), name='reshape1_s1') 44 | l_encoder_s1 = create_pretrained_encoder(l_reshape1_s1, s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities, 45 | ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) 46 | s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] 47 | 48 | l_reshape2_s1 = ReshapeLayer(l_encoder_s1, 49 | (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), 50 | name='reshape2_s1') 51 | l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1') 52 | 53 | # s2 images 54 | l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') 55 | l_encoder_s2 = create_pretrained_encoder(l_reshape1_s2, s2_weights, s2_biases, s2_shapes, 56 | s2_nonlinearities, 57 | ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) 58 | s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] 59 | l_reshape2_s2 = ReshapeLayer(l_encoder_s2, 60 | (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), 61 | name='reshape2_s2') 62 | l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2') 63 | 64 | # s3 images 65 | l_delta_s3 = DeltaLayer(l_s3, win, name='delta_s3') 66 | 67 | l_lstm_s1 = LSTMLayer( 68 | l_delta_s1, int(lstm_size), peepholes=use_peepholes, 69 | # We need to specify a separate input for masks 70 | mask_input=l_mask, 71 | # Here, we supply the gate parameters for each gate 72 | ingate=gate_parameters, forgetgate=gate_parameters, 73 | cell=cell_parameters, outgate=gate_parameters, 74 | # We'll learn the initialization and use gradient clipping 75 | learn_init=True, grad_clipping=5., name='lstm_s1') 76 | 77 | l_lstm_s2 = LSTMLayer( 78 | l_delta_s2, lstm_size, peepholes=use_peepholes, 79 | # We need to specify a separate input for masks 80 | mask_input=l_mask, 81 | # Here, we supply the gate parameters for each gate 82 | ingate=gate_parameters, forgetgate=gate_parameters, 83 | cell=cell_parameters, outgate=gate_parameters, 84 | # We'll learn the initialization and use gradient clipping 85 | learn_init=True, grad_clipping=5., name='lstm_s2') 86 | 87 | l_lstm_s3 = LSTMLayer( 88 | l_delta_s3, lstm_size, peepholes=use_peepholes, 89 | # We need to specify a separate input for masks 90 | mask_input=l_mask, 91 | # Here, we supply the gate parameters for each gate 92 | ingate=gate_parameters, forgetgate=gate_parameters, 93 | cell=cell_parameters, outgate=gate_parameters, 94 | # We'll learn the initialization and use gradient clipping 95 | learn_init=True, grad_clipping=5., name='lstm_s3') 96 | 97 | # We'll combine the forward and backward layer output by summing. 98 | # Merge layers take in lists of layers to merge as input. 
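    # fusiontype selects how the three per-stream LSTM outputs are merged:
    # 'adasum' uses the custom AdaptiveElemwiseSumLayer (not shown here; by its
    # name, a learnable weighted elementwise sum), 'sum' is a plain elementwise
    # sum (all streams must share lstm_size), and 'concat' stacks the streams
    # along the feature axis.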
99 | if fusiontype == 'adasum': 100 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], name='adasum1') 101 | elif fusiontype == 'sum': 102 | l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], name='sum1') 103 | elif fusiontype == 'concat': 104 | l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], axis=-1, name='concat') 105 | 106 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 107 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 108 | 109 | # reshape to (num_examples * seq_len, lstm_size) 110 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 111 | 112 | # Now, we can apply feed-forward layers as usual. 113 | # We want the network to predict a classification for the sequence, 114 | # so we'll use a the number of classes. 115 | l_softmax = DenseLayer( 116 | l_reshape3, num_units=output_classes, 117 | nonlinearity=las.nonlinearities.softmax, name='softmax') 118 | 119 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') 120 | 121 | return l_out, l_fuse 122 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_4.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 13 | 14 | if cell_parameters is None: 15 | cell_parameters = Gate() 16 | if gate_parameters is None: 17 | gate_parameters = Gate() 18 | 19 | l_lstm = LSTMLayer( 20 | l_incoming, hidden_units, 21 | # We need to specify a separate input for masks 22 | mask_input=l_mask, peepholes=use_peepholes, 23 | # Here, we supply the gate parameters for each gate 24 | ingate=gate_parameters, forgetgate=gate_parameters, 25 | cell=cell_parameters, outgate=gate_parameters, 26 | # We'll learn the initialization and use gradient clipping 27 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 28 | 29 | return l_lstm 30 | 31 | 32 | def create_model(ae, diff_ae, input_shape, input_var, mask_shape, mask_var, 33 | diff_shape, diff_var, lstm_size=250, win=T.iscalar('theta)'), 34 | output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), 35 | use_peepholes=True): 36 | 37 | bn_weights, bn_biases, bn_shapes, bn_nonlinearities = ae 38 | diff_weights, diff_biases, diff_shapes, diff_nonlinearities = diff_ae 39 | 40 | gate_parameters = Gate( 41 | W_in=w_init_fn, W_hid=w_init_fn, 42 | b=las.init.Constant(0.)) 43 | cell_parameters = Gate( 44 | W_in=w_init_fn, W_hid=w_init_fn, 45 | # Setting W_cell to None denotes that no cell connection will be used. 46 | W_cell=None, b=las.init.Constant(0.), 47 | # By convention, the cell nonlinearity is tanh in an LSTM. 
48 | nonlinearity=tanh) 49 | 50 | l_raw = InputLayer(input_shape, input_var, 'raw_im') 51 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 52 | l_diff = InputLayer(diff_shape, diff_var, 'diff_im') 53 | 54 | symbolic_batchsize_raw = l_raw.input_var.shape[0] 55 | symbolic_seqlen_raw = l_raw.input_var.shape[1] 56 | symbolic_batchsize_diff = l_diff.input_var.shape[0] 57 | symbolic_seqlen_diff = l_diff.input_var.shape[1] 58 | 59 | l_reshape1_raw = ReshapeLayer(l_raw, (-1, input_shape[-1]), name='reshape1_raw') 60 | l_encoder_raw = create_pretrained_encoder(l_reshape1_raw, bn_weights, bn_biases, bn_shapes, bn_nonlinearities, 61 | ['fc1_raw', 'fc2_raw', 'fc3_raw', 'bottleneck_raw']) 62 | raw_len = las.layers.get_output_shape(l_encoder_raw)[-1] 63 | 64 | l_reshape2_raw = ReshapeLayer(l_encoder_raw, 65 | (symbolic_batchsize_raw, symbolic_seqlen_raw, raw_len), 66 | name='reshape2_raw') 67 | l_delta_raw = DeltaLayer(l_reshape2_raw, win, name='delta_raw') 68 | 69 | # diff images 70 | l_reshape1_diff = ReshapeLayer(l_diff, (-1, diff_shape[-1]), name='reshape1_diff') 71 | l_encoder_diff = create_pretrained_encoder(l_reshape1_diff, diff_weights, diff_biases, diff_shapes, 72 | diff_nonlinearities, 73 | ['fc1_diff', 'fc2_diff', 'fc3_diff', 'bottleneck_diff']) 74 | diff_len = las.layers.get_output_shape(l_encoder_diff)[-1] 75 | l_reshape2_diff = ReshapeLayer(l_encoder_diff, 76 | (symbolic_batchsize_diff, symbolic_seqlen_diff, diff_len), 77 | name='reshape2_diff') 78 | l_delta_diff = DeltaLayer(l_reshape2_diff, win, name='delta_diff') 79 | 80 | l_lstm_raw = LSTMLayer( 81 | l_delta_raw, int(lstm_size), peepholes=use_peepholes, 82 | # We need to specify a separate input for masks 83 | mask_input=l_mask, 84 | # Here, we supply the gate parameters for each gate 85 | ingate=gate_parameters, forgetgate=gate_parameters, 86 | cell=cell_parameters, outgate=gate_parameters, 87 | # We'll learn the initialization and use gradient clipping 88 | learn_init=True, grad_clipping=5., name='lstm_raw') 89 | 90 | l_lstm_diff = LSTMLayer( 91 | l_delta_diff, lstm_size, peepholes=use_peepholes, 92 | # We need to specify a separate input for masks 93 | mask_input=l_mask, 94 | # Here, we supply the gate parameters for each gate 95 | ingate=gate_parameters, forgetgate=gate_parameters, 96 | cell=cell_parameters, outgate=gate_parameters, 97 | # We'll learn the initialization and use gradient clipping 98 | learn_init=True, grad_clipping=5., name='lstm_diff') 99 | 100 | # We'll combine the forward and backward layer output by summing. 101 | # Merge layers take in lists of layers to merge as input. 102 | if fusiontype == 'adasum': 103 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_raw, l_lstm_diff], name='adasum1') 104 | elif fusiontype == 'sum': 105 | l_fuse = ElemwiseSumLayer([l_lstm_raw, l_lstm_diff], name='sum1') 106 | elif fusiontype == 'concat': 107 | l_fuse = ConcatLayer([l_lstm_raw, l_lstm_diff], axis=-1, name='concat') 108 | 109 | f_lstm_agg = create_lstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 110 | 111 | # reshape to (num_examples * seq_len, lstm_size) 112 | l_reshape3 = ReshapeLayer(f_lstm_agg, (-1, lstm_size)) 113 | 114 | # Now, we can apply feed-forward layers as usual. 115 | # We want the network to predict a classification for the sequence, 116 | # so we'll use a the number of classes. 
117 | l_softmax = DenseLayer( 118 | l_reshape3, num_units=output_classes, 119 | nonlinearity=las.nonlinearities.softmax, name='softmax') 120 | 121 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_raw, output_classes), name='output') 122 | 123 | return l_out, l_fuse 124 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_nodelta.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify, leaky_rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 13 | 14 | if cell_parameters is None: 15 | cell_parameters = Gate() 16 | if gate_parameters is None: 17 | gate_parameters = Gate() 18 | 19 | l_lstm = LSTMLayer( 20 | l_incoming, hidden_units, 21 | # We need to specify a separate input for masks 22 | mask_input=l_mask, peepholes=use_peepholes, 23 | # Here, we supply the gate parameters for each gate 24 | ingate=gate_parameters, forgetgate=gate_parameters, 25 | cell=cell_parameters, outgate=gate_parameters, 26 | # We'll learn the initialization and use gradient clipping 27 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 28 | 29 | # The "backwards" layer is the same as the first, 30 | # except that the backwards argument is set to True. 31 | l_lstm_back = LSTMLayer( 32 | l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes, 33 | mask_input=l_mask, forgetgate=gate_parameters, 34 | cell=cell_parameters, outgate=gate_parameters, 35 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 36 | 37 | return l_lstm, l_lstm_back 38 | 39 | 40 | def create_model(ae, s2_ae, input_shape, input_var, mask_shape, mask_var, 41 | s2_shape, s2_var, lstm_size=250, 42 | output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), 43 | use_peepholes=True): 44 | 45 | bn_weights, bn_biases, bn_shapes, bn_nonlinearities = ae 46 | s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae 47 | 48 | gate_parameters = Gate( 49 | W_in=w_init_fn, W_hid=w_init_fn, 50 | b=las.init.Constant(0.)) 51 | cell_parameters = Gate( 52 | W_in=w_init_fn, W_hid=w_init_fn, 53 | # Setting W_cell to None denotes that no cell connection will be used. 54 | W_cell=None, b=las.init.Constant(0.), 55 | # By convention, the cell nonlinearity is tanh in an LSTM. 
56 | nonlinearity=tanh) 57 | 58 | l_s1 = InputLayer(input_shape, input_var, 's1_im') 59 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 60 | l_s2 = InputLayer(s2_shape, s2_var, 's2_im') 61 | 62 | symbolic_batchsize_s1 = l_s1.input_var.shape[0] 63 | symbolic_seqlen_s1 = l_s1.input_var.shape[1] 64 | symbolic_batchsize_s2 = l_s2.input_var.shape[0] 65 | symbolic_seqlen_s2 = l_s2.input_var.shape[1] 66 | 67 | l_reshape1_s1 = ReshapeLayer(l_s1, (-1, input_shape[-1]), name='reshape1_s1') 68 | l_encoder_s1 = create_pretrained_encoder(l_reshape1_s1, bn_weights, bn_biases, bn_shapes, bn_nonlinearities, 69 | ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) 70 | s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] 71 | 72 | l_reshape2_s1 = ReshapeLayer(l_encoder_s1, 73 | (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), 74 | name='reshape2_s1') 75 | 76 | # s2 images 77 | l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') 78 | l_encoder_s2 = create_pretrained_encoder(l_reshape1_s2, s2_weights, s2_biases, s2_shapes, 79 | s2_nonlinearities, 80 | ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) 81 | s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] 82 | l_reshape2_s2 = ReshapeLayer(l_encoder_s2, 83 | (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), 84 | name='reshape2_s2') 85 | 86 | l_lstm_s1 = LSTMLayer( 87 | l_reshape2_s1, int(lstm_size), peepholes=use_peepholes, 88 | # We need to specify a separate input for masks 89 | mask_input=l_mask, 90 | # Here, we supply the gate parameters for each gate 91 | ingate=gate_parameters, forgetgate=gate_parameters, 92 | cell=cell_parameters, outgate=gate_parameters, 93 | # We'll learn the initialization and use gradient clipping 94 | learn_init=True, grad_clipping=5., name='lstm_s1') 95 | 96 | l_lstm_s2 = LSTMLayer( 97 | l_reshape2_s2, lstm_size, peepholes=use_peepholes, 98 | # We need to specify a separate input for masks 99 | mask_input=l_mask, 100 | # Here, we supply the gate parameters for each gate 101 | ingate=gate_parameters, forgetgate=gate_parameters, 102 | cell=cell_parameters, outgate=gate_parameters, 103 | # We'll learn the initialization and use gradient clipping 104 | learn_init=True, grad_clipping=5., name='lstm_s2') 105 | 106 | # We'll combine the forward and backward layer output by summing. 107 | # Merge layers take in lists of layers to merge as input. 108 | if fusiontype == 'adasum': 109 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='adasum1') 110 | elif fusiontype == 'sum': 111 | l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='sum1') 112 | elif fusiontype == 'concat': 113 | l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2], axis=-1, name='concat') 114 | 115 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 116 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 117 | 118 | # reshape to (num_examples * seq_len, lstm_size) 119 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 120 | 121 | # Now, we can apply feed-forward layers as usual. 122 | # We want the network to predict a classification for the sequence, 123 | # so we'll use a the number of classes. 
124 | l_softmax = DenseLayer( 125 | l_reshape3, num_units=output_classes, 126 | nonlinearity=las.nonlinearities.softmax, name='softmax') 127 | 128 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') 129 | 130 | return l_out, l_fuse 131 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_2.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify, leaky_rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 13 | 14 | if cell_parameters is None: 15 | cell_parameters = Gate() 16 | if gate_parameters is None: 17 | gate_parameters = Gate() 18 | 19 | l_lstm = LSTMLayer( 20 | l_incoming, hidden_units, 21 | # We need to specify a separate input for masks 22 | mask_input=l_mask, peepholes=use_peepholes, 23 | # Here, we supply the gate parameters for each gate 24 | ingate=gate_parameters, forgetgate=gate_parameters, 25 | cell=cell_parameters, outgate=gate_parameters, 26 | # We'll learn the initialization and use gradient clipping 27 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 28 | 29 | # The "backwards" layer is the same as the first, 30 | # except that the backwards argument is set to True. 31 | l_lstm_back = LSTMLayer( 32 | l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes, 33 | mask_input=l_mask, forgetgate=gate_parameters, 34 | cell=cell_parameters, outgate=gate_parameters, 35 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 36 | 37 | return l_lstm, l_lstm_back 38 | 39 | 40 | def create_model(ae, s2_ae, input_shape, input_var, mask_shape, mask_var, 41 | s2_shape, s2_var, lstm_size=250, win=T.iscalar('theta'), 42 | output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), 43 | use_peepholes=True): 44 | 45 | bn_weights, bn_biases, bn_shapes, bn_nonlinearities = ae 46 | s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae 47 | 48 | gate_parameters = Gate( 49 | W_in=w_init_fn, W_hid=w_init_fn, 50 | b=las.init.Constant(0.)) 51 | cell_parameters = Gate( 52 | W_in=w_init_fn, W_hid=w_init_fn, 53 | # Setting W_cell to None denotes that no cell connection will be used. 54 | W_cell=None, b=las.init.Constant(0.), 55 | # By convention, the cell nonlinearity is tanh in an LSTM.
56 | nonlinearity=tanh) 57 | 58 | l_s1 = InputLayer(input_shape, input_var, 's1_im') 59 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 60 | l_s2 = InputLayer(s2_shape, s2_var, 's2_im') 61 | 62 | symbolic_batchsize_s1 = l_s1.input_var.shape[0] 63 | symbolic_seqlen_s1 = l_s1.input_var.shape[1] 64 | symbolic_batchsize_s2 = l_s2.input_var.shape[0] 65 | symbolic_seqlen_s2 = l_s2.input_var.shape[1] 66 | 67 | l_reshape1_s1 = ReshapeLayer(l_s1, (-1, input_shape[-1]), name='reshape1_s1') 68 | l_encoder_s1 = create_pretrained_encoder(l_reshape1_s1, bn_weights, bn_biases, bn_shapes, bn_nonlinearities, 69 | ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) 70 | s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] 71 | 72 | l_reshape2_s1 = ReshapeLayer(l_encoder_s1, 73 | (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), 74 | name='reshape2_s1') 75 | l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1') 76 | 77 | # s2 images 78 | l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') 79 | l_encoder_s2 = create_pretrained_encoder(l_reshape1_s2, s2_weights, s2_biases, s2_shapes, 80 | s2_nonlinearities, 81 | ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) 82 | s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] 83 | l_reshape2_s2 = ReshapeLayer(l_encoder_s2, 84 | (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), 85 | name='reshape2_s2') 86 | l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2') 87 | 88 | l_lstm_s1 = LSTMLayer( 89 | l_delta_s1, lstm_size, peepholes=use_peepholes, 90 | # We need to specify a separate input for masks 91 | mask_input=l_mask, 92 | # Here, we supply the gate parameters for each gate 93 | ingate=gate_parameters, forgetgate=gate_parameters, 94 | cell=cell_parameters, outgate=gate_parameters, 95 | # We'll learn the initialization and use gradient clipping 96 | learn_init=True, grad_clipping=5., name='lstm_s1') 97 | 98 | l_lstm_s2 = LSTMLayer( 99 | l_delta_s2, lstm_size, peepholes=use_peepholes, 100 | # We need to specify a separate input for masks 101 | mask_input=l_mask, 102 | # Here, we supply the gate parameters for each gate 103 | ingate=gate_parameters, forgetgate=gate_parameters, 104 | cell=cell_parameters, outgate=gate_parameters, 105 | # We'll learn the initialization and use gradient clipping 106 | learn_init=True, grad_clipping=5., name='lstm_s2') 107 | 108 | # We'll fuse the outputs of the two stream LSTMs. 109 | # Merge layers take in lists of layers to merge as input. 110 | if fusiontype == 'adasum': 111 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='adasum1') 112 | elif fusiontype == 'sum': 113 | l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='sum1') 114 | elif fusiontype == 'concat': 115 | l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2], axis=-1, name='concat') 116 | 117 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 118 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 119 | 120 | # reshape to (num_examples * seq_len, lstm_size) 121 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 122 | 123 | # Now, we can apply feed-forward layers as usual. 124 | # We want the network to predict a classification for every time step, 125 | # so we'll use the number of classes as the output size.
126 | l_softmax = DenseLayer( 127 | l_reshape3, num_units=output_classes, 128 | nonlinearity=las.nonlinearities.softmax, name='softmax') 129 | 130 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') 131 | 132 | return l_out, l_fuse 133 | -------------------------------------------------------------------------------- /landmarking/landmarker.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | import errno 4 | import csv 5 | import argparse 6 | 7 | import menpo.io as mio 8 | from menpo.visualize import print_progress 9 | from menpodetect.dlib import load_dlib_frontal_face_detector 10 | from menpofit.dlib import DlibWrapper 11 | 12 | # constants, change according to system 13 | FACE_MODEL_PATH = '../config/shape_predictor_68_face_landmarks.dat' 14 | EXT = ['.mp4', '.mov', '.mpg'] 15 | NO_LANDMARKS = 68 16 | 17 | 18 | def find_all_videos(dir, ext=EXT, relpath=False): 19 | # get the absolute path of the file 20 | abspath = os.path.abspath(dir) 21 | videofiles = [] 22 | find_all_videos_impl(abspath, videofiles, ext) 23 | if relpath: 24 | for i, f in enumerate(videofiles): 25 | videofiles[i] = f[len(dir) + 1:] 26 | return videofiles 27 | 28 | 29 | def find_all_videos_impl(dir, videofiles, ext): 30 | files = os.listdir(dir) 31 | for f in files: 32 | path = os.path.join(dir, f) 33 | if os.path.isdir(path): 34 | find_all_videos_impl(path, videofiles, ext) 35 | elif os.path.splitext(f)[1] in ext: 36 | videofiles.append(path) 37 | 38 | 39 | def is_video(file, ext=EXT): 40 | return os.path.splitext(file)[1] in ext 41 | 42 | 43 | def fit_image(image): 44 | # Face detection 45 | bboxes = fit_image.detect(image, image_diagonal=1000) 46 | 47 | # Check if at least one face was detected, otherwise throw a warning 48 | if len(bboxes) > 0: 49 | # Use the first bounding box (the one most likely to represent a face) to initialise 50 | fitting_result = fit_image.fitter.fit_from_bb(image, bboxes[0]) 51 | 52 | # Assign shape on the image 53 | image.landmarks['final_shape'] = fitting_result.final_shape 54 | else: 55 | # Throw warning if no face was detected 56 | warnings.warn('No face detected') 57 | 58 | # Return the image 59 | return image 60 | 61 | 62 | def create_dir(dir): 63 | if not os.path.exists(dir): 64 | try: 65 | os.makedirs(dir) 66 | except OSError as exc: # Guard against race condition 67 | if exc.errno != errno.EEXIST: 68 | raise 69 | 70 | 71 | def fill_row(outwriter, frame_no, row): 72 | outwriter.writerow([frame_no] + row) 73 | 74 | 75 | def process_video(file, dest): 76 | if is_video(file): 77 | try: 78 | frames = mio.import_video(file, normalise=False) 79 | except IOError: 80 | warnings.warn('IO error reading video file {}, the file may be corrupted or the video format is unsupported, skipping...'.format(file)) 81 | return 82 | except ValueError as e: 83 | warnings.warn('Value Error reading video file {}, '.format(file) + 84 | e.message) 85 | return 86 | # create the output directory if dest includes a directory component 87 | if os.path.dirname(dest): 88 | create_dir(os.path.dirname(dest)) 89 | print('{} contains {} frames'.format(file, len(frames))) 90 | print('writing landmarks to {}...'.format(dest)) 91 | frames = frames.map(fit_image) 92 | with open(dest, 'w') as outputfile: 93 | outwriter = csv.writer(outputfile) 94 | try: 95 | for i, frame in enumerate(print_progress(frames)): 96 | if 'final_shape' not in frame.landmarks: 97 | warnings.warn('no faces detected in the frame {}, ' 98 | 'initializing landmarks to
-1s...'.format(i)) 99 | # dlib does not support fitting from a previous initial shape, so 100 | # leave the entire row as -1s 101 | # initial_shape = frames[i - 1].landmarks['final_shape'].lms 102 | # fitting_result = fit_image.fitter.fit_from_shape(frame, initial_shape) 103 | # frame.landmarks['final_shape'] = fitting_result.final_shape 104 | landmarks = [-1] * NO_LANDMARKS*2 105 | else: 106 | lmg = frame.landmarks['final_shape'] 107 | landmarks = lmg['all'].points.reshape((NO_LANDMARKS*2,)).tolist() # flatten to 136 values (68 x/y pairs) 108 | fill_row(outwriter, i, landmarks) 109 | except Exception as e: 110 | warnings.warn('Runtime Error at frame {}: {}'.format(i, e)) 111 | print('initializing landmarks to -1s...') 112 | fill_row(outwriter, i, [-1] * NO_LANDMARKS*2) 113 | 114 | 115 | def parse_options(): 116 | options = dict() 117 | parser = argparse.ArgumentParser() 118 | options['model'] = FACE_MODEL_PATH 119 | parser.add_argument('--input_dir', help='directory to search for videos, supported formats [.mov, .mpg, .mp4]') 120 | parser.add_argument('--output_dir', help='output directory to store the landmarks') 121 | parser.add_argument('--model', help='location of landmark model file. ' 122 | 'Default: ../config/shape_predictor_68_face_landmarks.dat') 123 | parser.add_argument('--file', help='perform landmarking on a single file') 124 | parser.add_argument('--output', help='output landmark file name, if not specified ' 125 | 'creates landmark file in current directory') 126 | args = parser.parse_args() 127 | if args.input_dir: 128 | options['input_dir'] = args.input_dir 129 | if args.output_dir: 130 | options['output_dir'] = args.output_dir 131 | if args.model: 132 | options['model'] = args.model 133 | if args.file: 134 | options['file'] = args.file 135 | if args.output: 136 | options['output'] = args.output 137 | return options 138 | 139 | 140 | if __name__ == '__main__': 141 | options = parse_options() 142 | fit_image.detect = load_dlib_frontal_face_detector() 143 | fit_image.fitter = DlibWrapper(options['model']) 144 | 145 | if 'file' in options: 146 | video_file = options['file'] 147 | video_file_basename = os.path.basename(video_file) 148 | print('Generating Landmarks from {}'.format(video_file)) 149 | output = options['output'] if 'output' in options else os.path.splitext(video_file_basename)[0] + '.csv' 150 | process_video(video_file, output) 151 | exit() 152 | 153 | print('Generating Landmarks from {}'.format(options['input_dir'])) 154 | videofiles = find_all_videos(options['input_dir'], relpath=False) 155 | videofiles.sort() 156 | print('Found {} video(s)...'.format(len(videofiles))) 157 | input_dir = os.path.abspath(options['input_dir']) 158 | output_dir = os.path.abspath(options['output_dir']) 159 | for video in videofiles: 160 | relative_path = video[len(input_dir) + 1:] 161 | landmarkfile = os.path.join(output_dir, os.path.splitext(relative_path)[0] + '.csv') 162 | process_video(video, landmarkfile) 163 | print('All Done!') 164 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_3.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify 7 | from lasagne.layers import batch_norm, BatchNormLayer 8 | 9 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer
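# Project-local layers (defined in custom.layers): DeltaLayer is assumed to append
# temporal delta features computed over a `win`-frame window, and
# AdaptiveElemwiseSumLayer presumably learns per-stream scaling coefficients
# before summing the fused inputs.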
10 | from modelzoo.pretrained_encoder import create_pretrained_encoder 11 | 12 | 13 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 14 | 15 | if cell_parameters is None: 16 | cell_parameters = Gate() 17 | if gate_parameters is None: 18 | gate_parameters = Gate() 19 | 20 | l_lstm = LSTMLayer( 21 | l_incoming, hidden_units, peepholes=use_peepholes, 22 | # We need to specify a separate input for masks 23 | mask_input=l_mask, 24 | # Here, we supply the gate parameters for each gate 25 | ingate=gate_parameters, forgetgate=gate_parameters, 26 | cell=cell_parameters, outgate=gate_parameters, 27 | # We'll learn the initialization and use gradient clipping 28 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 29 | 30 | # The "backwards" layer is the same as the first, 31 | # except that the backwards argument is set to True. 32 | l_lstm_back = LSTMLayer( 33 | l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes, 34 | mask_input=l_mask, forgetgate=gate_parameters, 35 | cell=cell_parameters, outgate=gate_parameters, 36 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 37 | 38 | return l_lstm, l_lstm_back 39 | 40 | 41 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 42 | 43 | if cell_parameters is None: 44 | cell_parameters = Gate() 45 | if gate_parameters is None: 46 | gate_parameters = Gate() 47 | 48 | l_lstm = LSTMLayer( 49 | l_incoming, hidden_units, peepholes=use_peepholes, 50 | # We need to specify a separate input for masks 51 | mask_input=l_mask, 52 | # Here, we supply the gate parameters for each gate 53 | ingate=gate_parameters, forgetgate=gate_parameters, 54 | cell=cell_parameters, outgate=gate_parameters, 55 | # We'll learn the initialization and use gradient clipping 56 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 57 | 58 | return l_lstm 59 | 60 | 61 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 62 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'), 63 | output_classes=26, fusiontype='sum', w_init_fn=las.init.Orthogonal(), 64 | use_peepholes=True): 65 | 66 | dbn_layers = dbn.get_all_layers() 67 | weights = [] 68 | biases = [] 69 | shapes = [2000, 1000, 500, 50] 70 | nonlinearities = [rectify, rectify, rectify, linear] 71 | weights.append(dbn_layers[1].W.astype('float32')) 72 | weights.append(dbn_layers[2].W.astype('float32')) 73 | weights.append(dbn_layers[3].W.astype('float32')) 74 | weights.append(dbn_layers[4].W.astype('float32')) 75 | biases.append(dbn_layers[1].b.astype('float32')) 76 | biases.append(dbn_layers[2].b.astype('float32')) 77 | biases.append(dbn_layers[3].b.astype('float32')) 78 | biases.append(dbn_layers[4].b.astype('float32')) 79 | 80 | gate_parameters = Gate( 81 | W_in=w_init_fn, W_hid=w_init_fn, 82 | b=las.init.Constant(0.)) 83 | cell_parameters = Gate( 84 | W_in=w_init_fn, W_hid=w_init_fn, 85 | # Setting W_cell to None denotes that no cell connection will be used. 86 | W_cell=None, b=las.init.Constant(0.), 87 | # By convention, the cell nonlinearity is tanh in an LSTM.
88 | nonlinearity=tanh) 89 | 90 | l_in = InputLayer(input_shape, input_var, 'input') 91 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 92 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 93 | 94 | symbolic_batchsize = l_in.input_var.shape[0] 95 | symbolic_seqlen = l_in.input_var.shape[1] 96 | 97 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 98 | l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes, nonlinearities, 99 | ['fc1', 'fc2', 'fc3', 'bottleneck']) 100 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 101 | l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 102 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 103 | 104 | l_lstm_bn = LSTMLayer( 105 | l_delta, lstm_size, peepholes=use_peepholes, 106 | # We need to specify a separate input for masks 107 | mask_input=l_mask, 108 | # Here, we supply the gate parameters for each gate 109 | ingate=gate_parameters, forgetgate=gate_parameters, 110 | cell=cell_parameters, outgate=gate_parameters, 111 | # We'll learn the initialization and use gradient clipping 112 | learn_init=True, grad_clipping=5., name='lstm_bn') 113 | 114 | l_lstm_dct = LSTMLayer( 115 | l_dct, lstm_size, peepholes=use_peepholes, 116 | # We need to specify a separate input for masks 117 | mask_input=l_mask, 118 | # Here, we supply the gate parameters for each gate 119 | ingate=gate_parameters, forgetgate=gate_parameters, 120 | cell=cell_parameters, outgate=gate_parameters, 121 | # We'll learn the initialization and use gradient clipping 122 | learn_init=True, grad_clipping=5., name='lstm_dct') 123 | 124 | # We'll fuse the outputs of the bottleneck and DCT stream LSTMs. 125 | # Merge layers take in lists of layers to merge as input. 126 | 127 | if fusiontype == 'sum': 128 | l_fuse = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1') 129 | elif fusiontype == 'adasum': 130 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum') 131 | elif fusiontype == 'concat': 132 | l_fuse = ConcatLayer([l_lstm_bn, l_lstm_dct], axis=2, name='concat') 133 | 134 | f_lstm_agg = create_lstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 135 | 136 | # reshape to (num_examples * seq_len, lstm_size) 137 | l_reshape3 = ReshapeLayer(f_lstm_agg, (-1, lstm_size), name='reshape3') 138 | 139 | # l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1') 140 | 141 | # Now, we can apply feed-forward layers as usual. 142 | # We want the network to predict a classification for every time step, 143 | # so we'll use the number of classes as the output size.
144 | l_softmax = DenseLayer( 145 | l_reshape3, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 146 | 147 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 148 | 149 | return l_out, l_fuse 150 | -------------------------------------------------------------------------------- /modelzoo/adenet_v4.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | from lasagne.layers import batch_norm 8 | 9 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 10 | 11 | 12 | def create_pretrained_encoder(weights, biases, incoming): 13 | l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1') 14 | l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2') 15 | l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3') 16 | l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck') 17 | return l_4 18 | 19 | 20 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 21 | 22 | if cell_parameters is None: 23 | cell_parameters = Gate() 24 | if gate_parameters is None: 25 | gate_parameters = Gate() 26 | 27 | l_lstm = LSTMLayer( 28 | l_incoming, hidden_units, 29 | # We need to specify a separate input for masks 30 | mask_input=l_mask, 31 | # Here, we supply the gate parameters for each gate 32 | ingate=gate_parameters, forgetgate=gate_parameters, 33 | cell=cell_parameters, outgate=gate_parameters, 34 | # We'll learn the initialization and use gradient clipping 35 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 36 | 37 | # The "backwards" layer is the same as the first, 38 | # except that the backwards argument is set to True. 39 | l_lstm_back = LSTMLayer( 40 | l_incoming, hidden_units, ingate=gate_parameters, 41 | mask_input=l_mask, forgetgate=gate_parameters, 42 | cell=cell_parameters, outgate=gate_parameters, 43 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 44 | 45 | return l_lstm, l_lstm_back 46 | 47 | 48 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 49 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'), 50 | output_classes=26): 51 | 52 | dbn_layers = dbn.get_all_layers() 53 | weights = [] 54 | biases = [] 55 | weights.append(dbn_layers[1].W.astype('float32')) 56 | weights.append(dbn_layers[2].W.astype('float32')) 57 | weights.append(dbn_layers[3].W.astype('float32')) 58 | weights.append(dbn_layers[4].W.astype('float32')) 59 | biases.append(dbn_layers[1].b.astype('float32')) 60 | biases.append(dbn_layers[2].b.astype('float32')) 61 | biases.append(dbn_layers[3].b.astype('float32')) 62 | biases.append(dbn_layers[4].b.astype('float32')) 63 | 64 | gate_parameters = Gate( 65 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 66 | b=las.init.Constant(0.)) 67 | cell_parameters = Gate( 68 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 69 | # Setting W_cell to None denotes that no cell connection will be used. 70 | W_cell=None, b=las.init.Constant(0.), 71 | # By convention, the cell nonlinearity is tanh in an LSTM.
72 | nonlinearity=tanh) 73 | 74 | l_in = InputLayer(input_shape, input_var, 'input') 75 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 76 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 77 | 78 | symbolic_batchsize = l_in.input_var.shape[0] 79 | symbolic_seqlen = l_in.input_var.shape[1] 80 | 81 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 82 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 83 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 84 | l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 85 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 86 | l_delta_drop = DropoutLayer(l_delta, name='dropout_delta') 87 | l_dct_drop = DropoutLayer(l_dct, p=0.2, name='dropout_dct') 88 | 89 | l_lstm_bn = LSTMLayer( 90 | l_delta_drop, lstm_size * 2, 91 | # We need to specify a separate input for masks 92 | mask_input=l_mask, 93 | # Here, we supply the gate parameters for each gate 94 | ingate=gate_parameters, forgetgate=gate_parameters, 95 | cell=cell_parameters, outgate=gate_parameters, 96 | # We'll learn the initialization and use gradient clipping 97 | learn_init=True, grad_clipping=5., name='lstm_bn') 98 | 99 | l_lstm_dct = LSTMLayer( 100 | l_dct_drop, lstm_size * 2, 101 | # We need to specify a separate input for masks 102 | mask_input=l_mask, 103 | # Here, we supply the gate parameters for each gate 104 | ingate=gate_parameters, forgetgate=gate_parameters, 105 | cell=cell_parameters, outgate=gate_parameters, 106 | # We'll learn the initialization and use gradient clipping 107 | learn_init=True, grad_clipping=5., name='lstm_dct') 108 | 109 | # We'll fuse the bottleneck and DCT stream LSTM outputs by summing. 110 | # Merge layers take in lists of layers to merge as input. 111 | # l_sum1 = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum1') 112 | l_sum1 = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1') 113 | l_sum1_drop = DropoutLayer(l_sum1, name='dropout_agg') 114 | # f_lstm_agg, b_lstm_agg = create_blstm(l_sum1, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 115 | 116 | l_lstm_agg = LSTMLayer( 117 | l_sum1_drop, lstm_size * 2, 118 | # We need to specify a separate input for masks 119 | mask_input=l_mask, 120 | # Here, we supply the gate parameters for each gate 121 | ingate=gate_parameters, forgetgate=gate_parameters, 122 | cell=cell_parameters, outgate=gate_parameters, 123 | # We'll learn the initialization and use gradient clipping 124 | learn_init=True, grad_clipping=5., name='lstm_agg') 125 | 126 | ''' 127 | # implement drop-out regularization 128 | l_dropout = DropoutLayer(l_sum1, p=0.4, name='dropout1') 129 | 130 | l_lstm2, l_lstm2_back = create_blstm(l_dropout, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm2') 131 | 132 | # We'll combine the forward and backward layer output by summing. 133 | # Merge layers take in lists of layers to merge as input. 134 | l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back]) 135 | ''' 136 | 137 | # l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 138 | 139 | l_forward_slice1 = SliceLayer(l_lstm_agg, -1, 1, name='slice1') 140 | 141 | # Now, we can apply feed-forward layers as usual. 142 | # We want the network to predict a classification for the sequence, 143 | # so we'll use the number of classes as the output size.
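# Note: unlike the adenet_v2 variants above, slice1 keeps only the last time step
# of the aggregation LSTM, so the dense softmax below emits a single prediction
# per sequence rather than one per frame.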
144 | l_out = DenseLayer( 145 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 146 | 147 | return l_out, l_sum1 148 | -------------------------------------------------------------------------------- /utils/ffmpeg.py: -------------------------------------------------------------------------------- 1 | """ 2 | module containing functions to use ffprobe to parse video frame info 3 | """ 4 | from __future__ import print_function 5 | import subprocess 6 | import cStringIO 7 | 8 | 9 | class base_frame(object): 10 | """ 11 | Base Frame from FFProbe 12 | [FRAME] 13 | media_type=video 14 | stream_index=0 15 | key_frame=0 16 | pkt_pts=11745667 17 | pkt_pts_time=130.507411 18 | pkt_dts=11745667 19 | pkt_dts_time=130.507411 20 | best_effort_timestamp=11745667 21 | best_effort_timestamp_time=130.507411 22 | pkt_duration=3003 23 | pkt_duration_time=0.033367 24 | pkt_pos=86509020 25 | pkt_size=13294 26 | ... 27 | [/FRAME] 28 | """ 29 | def __init__(self, buf, parser): 30 | """ 31 | Constructs a base ffprobe frame 32 | :param buf: buffer containing frame info 33 | :param parser: ffprobe frame parser 34 | """ 35 | self.stream_index = parser.get_int(buf) 36 | self.key_frame = parser.get_int(buf) 37 | self.pkt_pts = parser.get_int(buf) 38 | self.pkt_pts_time = parser.get_float(buf) 39 | self.pkt_dts = parser.get_int(buf) 40 | self.pkt_dts_time = parser.get_float(buf) 41 | self.best_effort_timestamp = parser.get_int(buf) 42 | self.best_effort_timestamp_time = parser.get_float(buf) 43 | self.pkt_duration = parser.get_int(buf) 44 | self.pkt_duration_time = parser.get_float(buf) 45 | self.pkt_pos = parser.get_int(buf) 46 | self.pkt_size = parser.get_int(buf) 47 | 48 | 49 | class audio_frame(base_frame): 50 | """ 51 | Audio Frame from FFProbe 52 | [FRAME] 53 | ... 54 | sample_fmt=s16p 55 | nb_samples=1152 56 | channels=2 57 | channel_layout=stereo 58 | [/FRAME] 59 | """ 60 | def __init__(self, buf, parser): 61 | """ 62 | Constructs an Audio Frame from FFprobe 63 | :param buf: buffer containing ffprobe frame info 64 | :param parser: ffprobe frame parser 65 | """ 66 | super(audio_frame, self).__init__(buf, parser) 67 | self.media_type = 'audio' 68 | self.sample_fmt = parser.get_str(buf) 69 | self.nb_samples = parser.get_int(buf) 70 | self.channels = parser.get_int(buf) 71 | self.channel_layout = parser.get_str(buf) 72 | 73 | 74 | class video_frame(base_frame): 75 | """ 76 | Video Frame from FFProbe 77 | [FRAME] 78 | ... 
79 | width=720 80 | height=480 81 | pix_fmt=yuv420p 82 | sample_aspect_ratio=1:1 83 | pict_type=B 84 | coded_picture_number=3889 85 | display_picture_number=0 86 | interlaced_frame=0 87 | top_field_first=0 88 | repeat_pict=0 89 | [/FRAME] 90 | """ 91 | def __init__(self, buf, parser): 92 | """ 93 | Constructs a Video Frame from ffprobe 94 | :param buf: buffer containing ffprobe frame info 95 | :param parser: ffprobe frame parser 96 | """ 97 | super(video_frame, self).__init__(buf, parser) 98 | self.media_type = 'video' 99 | self.width = parser.get_int(buf) 100 | self.height = parser.get_int(buf) 101 | self.pix_fmt = parser.get_str(buf) 102 | self.sample_aspect_ratio = parser.get_str(buf) 103 | self.pict_type = parser.get_str(buf) 104 | self.coded_picture_number = parser.get_int(buf) 105 | self.display_picture_number = parser.get_int(buf) 106 | self.interlaced_frame = parser.get_int(buf) 107 | self.top_field_first = parser.get_int(buf) 108 | self.repeat_pict = parser.get_int(buf) 109 | 110 | 111 | class side_data(object): 112 | """ 113 | Side Data from FFProbe 114 | [SIDE_DATA] 115 | side_data_type=GOP timecode 116 | side_data_size=8 117 | timecode=00:00:00:00 118 | [/SIDE_DATA] 119 | """ 120 | def __init__(self, buf, parser): 121 | """ 122 | Constructs side data frame 123 | :param buf: buffer containing ffprobe frame info 124 | :param parser: ffprobe frame parser 125 | """ 126 | self.side_data_type = parser.get_str(buf) 127 | self.side_data_size = parser.get_int(buf) 128 | self.timecode = parser.get_str(buf) 129 | 130 | 131 | class ffprobe_frame_info_parser(object): 132 | """ 133 | ffprobe frame parser, reads ffprobe entries and extracts key, value pairs 134 | """ 135 | def get_str(self, buf, sep='='): 136 | _, value = buf.readline().split(sep) 137 | return value[:-1] 138 | 139 | def get_int(self, buf, sep='='): 140 | _, value = buf.readline().split(sep) 141 | value = value[:-1] 142 | if value == 'N/A': 143 | value = -1 144 | else: 145 | value = int(value) 146 | return value 147 | 148 | def get_float(self, buf, sep='='): 149 | _, value = buf.readline().split(sep) 150 | value = value[:-1] 151 | if value == 'N/A': 152 | value = float('nan') 153 | else: 154 | value = float(value) 155 | return value 156 | 157 | def get_entry(self, buf, sep='='): 158 | key, value = buf.readline().split(sep) 159 | value = value[:-1] 160 | return key, value 161 | 162 | 163 | def peek_line(buf): 164 | pos = buf.tell() 165 | line = buf.readline() 166 | buf.seek(pos) 167 | return line 168 | 169 | 170 | def ffprobe_video(filename): 171 | """ 172 | probes a video using ffprobe subprocess 173 | :param filename: video file to probe 174 | :return: list of audio, video frames 175 | """ 176 | command = ["ffprobe", "-show_frames", filename] 177 | p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 178 | out, err = p.communicate() 179 | video_frames = [] 180 | audio_frames = [] 181 | p = ffprobe_frame_info_parser() 182 | buf = cStringIO.StringIO(out) 183 | while True: 184 | line = buf.readline() 185 | if line == '': 186 | break 187 | else: 188 | info_type = line[:-1] 189 | if info_type == '[FRAME]': 190 | media_type = p.get_str(buf) 191 | if media_type == "video": 192 | frame = video_frame(buf, p) 193 | video_frames.append(frame) 194 | # check if [SIDE_DATA] exists 195 | line = peek_line(buf)[:-1] 196 | if line == '[SIDE_DATA]': 197 | _ = buf.readline() # read the header [SIDE_DATA] 198 | _ = side_data(buf, p) 199 | buf.readline() # read the end tag [/SIDE_DATA] 200 | else: 201 | frame = 
audio_frame(buf, p) 202 | audio_frames.append(frame) 203 | buf.readline() # read the end tag [/FRAME] 204 | return audio_frames, video_frames 205 | 206 | 207 | def main(): 208 | audio_frames, video_frames = ffprobe_video('s01.mpg') 209 | assert len(video_frames) == 3890 210 | 211 | 212 | if __name__ == '__main__': 213 | main() 214 | -------------------------------------------------------------------------------- /cuave/prepare_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../') 4 | import argparse 5 | import utils.ffmpeg 6 | from utils.preprocessing import * 7 | from utils.io import * 8 | from utils.plotting_utils import * 9 | 10 | 11 | def parse_htk_labels(filename): 12 | """ 13 | #Normal in 100ns 14 | 7800000 14480000 zero 15 | 17510000 22920000 one 16 | 26580000 32630000 two 17 | 36290000 40590000 three 18 | 46240000 49900000 four 19 | 55310000 59370000 five 20 | 63590000 69800000 six 21 | ... 22 | #Moving 23 | 24 | :param filename: path to an HTK-style CUAVE label file (format shown above) 25 | :return: list of (start, end, digit_word) string tuples 26 | """ 27 | labels = [] 28 | with open(filename, 'r') as f: 29 | line = f.readline()[:-1] 30 | if 'Normal' in line: 31 | while True: 32 | # iterate until #Moving 33 | line = f.readline() 34 | if '#Moving' in line: 35 | break 36 | else: 37 | start, end, number = line[:-2].split(' ') # remove \n\r 38 | labels.append((start, end, number)) 39 | return labels 40 | 41 | 42 | def to_100ns(time_in_sec): 43 | return int(time_in_sec * 10000000) 44 | 45 | 46 | def digit_to_int(digit): 47 | digit_map = {'zero': 0, 48 | 'one': 1, 49 | 'two': 2, 50 | 'three': 3, 51 | 'four': 4, 52 | 'five': 5, 53 | 'six': 6, 54 | 'seven': 7, 55 | 'eight': 8, 56 | 'nine': 9} 57 | return digit_map[digit] 58 | 59 | 60 | def segment_video(video_file, label_file): 61 | _, video_frames = utils.ffmpeg.ffprobe_video(video_file) 62 | htk_labels = parse_htk_labels(label_file) 63 | print('number of video frames: {}'.format(len(video_frames))) 64 | print('number of labels: {}'.format(len(htk_labels))) 65 | current_frame = 0 66 | idxes = [] 67 | seq_lens = [] 68 | labels = [] 69 | for start, end, label in htk_labels: 70 | start = int(start) 71 | end = int(end) 72 | number = digit_to_int(label) 73 | # print(start, end, number) 74 | seq_len = 0 75 | while True: 76 | f = video_frames[current_frame] 77 | pts_time = to_100ns(f.pkt_pts_time) 78 | # check if frame is within utterance window 79 | if pts_time > start and pts_time <= end: 80 | idxes.append(current_frame) 81 | labels.append(number) 82 | seq_len += 1 83 | current_frame += 1 84 | # TODO: extract/select mouth ROI of frame 85 | else: 86 | if pts_time > end: 87 | break 88 | current_frame += 1 # keep moving to the start of the next sequence 89 | seq_lens.append(seq_len) 90 | print(len(idxes)) 91 | print(len(labels)) 92 | print(seq_lens) 93 | 94 | 95 | def test_mergesamples(): 96 | s = np.array([[1],[2],[3],[4],[1],[2],[3],[4],[1],[2],[3],[4],[1],[2],[3],[4],[5]]) 97 | # s = np.array([1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,5]) 98 | l = [4,4,4,5] 99 | r = factorize(s, l, 3, 0) 100 | print(r) 101 | 102 | 103 | def test_embed_temporal_info(): 104 | s = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[1,1,1],[2,2,2],[3,3,3],[4,4,4],[1,1,1],[2,2,2],[3,3,3],[4,4,4], 105 | [1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]]) 106 | # s = np.array([1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,5]) 107 | l = np.array([4,4,4,5]) 108 | r, l = factorize(s, l, 3, 0) 109 | r, l = embed_temporal_info(r, l, 3, 3) 110 | print(r) 111 | 112 | 113 | def parse_options():
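"""Parse command-line options for CUAVE preprocessing into the dict consumed by main() below."""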
114 | options = dict() 115 | options['remove_mean'] = False 116 | options['diff_image'] = False 117 | options['samplewise_norm'] = False 118 | options['merge_samples'] = False 119 | options['output'] = None 120 | options['mergesize'] = 3 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--remove_mean', action='store_true', help='remove mean image') 123 | parser.add_argument('--diff_image', action='store_true', help='compute difference of image') 124 | parser.add_argument('--samplewise_norm', action='store_true', help='samplewise normalize') 125 | parser.add_argument('--reorder_data', help='reorder data from f to c convention. eg: 30,50') 126 | parser.add_argument('--concat_deltas', help='concat 1st and 2nd deltas, default delta window: 2') 127 | parser.add_argument('--embed_temporal_info', help='embed temporal info to features [window],[step]. ie: 3,1') 128 | parser.add_argument('--output', help='write output to .mat file') 129 | parser.add_argument('input', nargs='+', help='input cuave .mat file to preprocess') 130 | args = parser.parse_args() 131 | if args.remove_mean: 132 | options['remove_mean'] = args.remove_mean 133 | if args.diff_image: 134 | options['diff_image'] = args.diff_image 135 | if args.samplewise_norm: 136 | options['samplewise_norm'] = args.samplewise_norm 137 | if args.embed_temporal_info: 138 | options['embed_temporal_info'] = args.embed_temporal_info 139 | if args.reorder_data: 140 | options['reorder_data'] = args.reorder_data 141 | if args.output: 142 | options['output'] = args.output 143 | if args.input: 144 | options['input'] = args.input[0] 145 | if args.concat_deltas: 146 | options['concat_deltas'] = int(args.concat_deltas) 147 | return options 148 | 149 | 150 | def main(): 151 | options = parse_options() 152 | data = load_mat_file(options['input']) 153 | data_matrix = data['dataMatrix'].astype('float32') 154 | vid_len_vec = data['videoLengthVec'].astype('int').reshape((-1,)) 155 | targets_vec = data['targetsVec'].reshape((-1,)) 156 | 157 | if 'reorder_data' in options: 158 | imagesize = tuple([int(d) for d in options['reorder_data'].split(',')]) 159 | data_matrix = reorder_data(data_matrix, imagesize) 160 | if options['samplewise_norm']: 161 | data_matrix = normalize_input(data_matrix) 162 | if options['remove_mean']: 163 | data_matrix = sequencewise_mean_image_subtraction(data_matrix, vid_len_vec) 164 | if options['diff_image']: 165 | data_matrix = compute_diff_images(data_matrix, vid_len_vec) 166 | if 'embed_temporal_info' in options: 167 | window, step = tuple([int(d) for d in options['embed_temporal_info'].split(',')]) 168 | data_matrix, targets_vec, vid_len_vec = factorize(data_matrix, targets_vec, vid_len_vec, step, 0) 169 | data_matrix, targets_vec, vid_len_vec = embed_temporal_info(data_matrix, targets_vec, vid_len_vec, window, step) 170 | if 'concat_deltas' in options: 171 | data_matrix = concat_first_second_deltas(data_matrix, vid_len_vec, options['concat_deltas']) 172 | 173 | data['dataMatrix'] = data_matrix 174 | 175 | if 'embed_temporal_info' in options: 176 | data['videoLengthVec'] = vid_len_vec 177 | data['targetsVec'] = targets_vec 178 | 179 | if options['output']: 180 | save_mat(data, options['output']) 181 | # print(data.keys()) 182 | print('data prepared!') 183 | 184 | 185 | if __name__ == '__main__': 186 | main() --------------------------------------------------------------------------------
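For readers unfamiliar with the delta features referenced above: concat_first_second_deltas (imported from utils.preprocessing, whose source is outside this listing) appears to append first- and second-order temporal derivatives to each frame's feature vector, sequence by sequence. The snippet below is a minimal NumPy sketch of the standard regression-based delta computation; the function name, edge handling, and single-sequence scope are illustrative assumptions, not the repository's implementation.

import numpy as np

def delta_sketch(feats, window=2):
    # Regression-style delta (assumed formulation):
    #   d_t = sum_{k=1..window} k * (c_{t+k} - c_{t-k}) / (2 * sum_{k=1..window} k^2)
    # feats: (num_frames, num_dims); border frames are repeated for padding.
    n = len(feats)
    denom = 2.0 * sum(k ** 2 for k in range(1, window + 1))
    padded = np.pad(feats, ((window, window), (0, 0)), mode='edge')
    d = np.zeros(feats.shape)
    for k in range(1, window + 1):
        d += k * (padded[window + k:window + k + n] - padded[window - k:window - k + n])
    return d / denom

# Illustrative use for a single sequence, mirroring what concat_first_second_deltas
# likely produces: statics + deltas + delta-deltas, stacked column-wise.
seq = np.random.rand(75, 50).astype('float32')   # e.g. 75 frames, 50 bottleneck dims
d1 = delta_sketch(seq)
d2 = delta_sketch(d1)
augmented = np.hstack([seq, d1, d2])             # shape: (75, 150)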