├── custom
│   ├── __init__.py
│   ├── nonlinearities.py
│   ├── objectives.py
│   └── updates.py
├── utils
│   ├── __init__.py
│   ├── regularization.py
│   ├── io.py
│   ├── data_structures.py
│   ├── signal.py
│   ├── lcn.py
│   ├── draw_net.py
│   └── ffmpeg.py
├── modelzoo
│   ├── __init__.py
│   ├── pretrained_encoder.py
│   ├── lstm_classifier_majority_vote.py
│   ├── deltanet_v1.py
│   ├── autoencoder.py
│   ├── lstm_classifier_baseline.py
│   ├── deltanet.py
│   ├── avletters_convae.py
│   ├── avletters_convae_bn.py
│   ├── avletters_convae_drop.py
│   ├── adenet_v2.py
│   ├── avletters_convae_bndrop.py
│   ├── adenet_v1.py
│   ├── adenet_v1_1.py
│   ├── baseline_end2end.py
│   ├── avnet.py
│   ├── adenet_3stream_dct.py
│   ├── adenet_v2_4.py
│   ├── adenet_v2_nodelta.py
│   ├── adenet_v2_2.py
│   ├── adenet_v2_3.py
│   └── adenet_v4.py
├── dbn
│   ├── displayImage.m
│   ├── computeDCTfeatAndDeltas.m
│   ├── computeDCTfeat.m
│   ├── resizeImages.m
│   ├── extractNN.m
│   ├── deltas.m
│   ├── DCT_Features.m
│   ├── computeStates.m
│   ├── normaliseData.m
│   ├── RBMup.m
│   ├── RBMdown.m
│   ├── unfoldDBNtoNN.m
│   ├── computeActivations.m
│   ├── visualiseHiddenLayerWeights.m
│   ├── unfoldDBNToClsf.m
│   ├── dbnParamsInit.m
│   ├── trainDBN.m
│   ├── exampleDBN_AE.m
│   ├── unfoldDBNtoAE.m
│   └── zigzag.m
├── .gitignore
├── oulu
│   ├── playvid.py
│   ├── preprocess_images.py
│   ├── landmarking.py
│   └── ae_finetuner.py
├── test
│   ├── test_model_io.py
│   ├── test_preprocessing.py
│   └── test_gen_batch_from_file.py
├── avletters
│   ├── preprocess_images.py
│   └── ae_finetuner.py
├── runners
│   ├── extract_encoder_from_model.py
│   └── extract_lstm_from_model.py
├── avletters2
│   └── prepare_data.py
├── README.md
├── landmarking
│   └── landmarker.py
└── cuave
    └── prepare_data.py

/custom/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/modelzoo/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/dbn/displayImage.m:
--------------------------------------------------------------------------------
1 | function [] = displayImage( image1D, h, w )
2 | %DISPLAYIMAGE Display a row-vectorised image
3 | %   Reshapes image1D into an h-by-w image, rescales it to [0, 1] with mat2gray and shows it.
4 | image = mat2gray(reshape(image1D, h, w));
5 | imshow(image);
6 | 
7 | end
8 | 
--------------------------------------------------------------------------------
/dbn/computeDCTfeatAndDeltas.m:
--------------------------------------------------------------------------------
1 | function dctFeatures = computeDCTfeatAndDeltas(dataMatrix, w, h, noCoeff)
2 | dctFeatures = computeDCTfeat(dataMatrix, w, h, noCoeff);
3 | d1 = deltas(dctFeatures, 9);
4 | d2 = deltas(d1, 9);
5 | dctFeatures = horzcat(dctFeatures, d1, d2);
6 | end
7 | 
--------------------------------------------------------------------------------
/dbn/computeDCTfeat.m:
--------------------------------------------------------------------------------
 1 | function dctFeatures = computeDCTfeat(dataMatrix, w, h, noCoeff)
 2 | % Reshape each row of dataMatrix into an h-by-w image and extract
 3 | % noCoeff zig-zag DCT coefficients per image.
 4 | 
 5 | [noIm, dim] = size(dataMatrix);
 6 | imMatrix = zeros(h, w, noIm);
 7 | 
 8 | for i = 1:noIm
 9 |     imMatrix(:,:,i) = reshape(dataMatrix(i,:), h, w);
10 | end
11 | 
12 | dctFeatures = DCT_Features(imMatrix,noCoeff,[]);
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # DS_Store
 2 | .DS_Store
 3 | **/.DS_Store
 4 | 
 5 | # .idea
 6 | .idea/
 7 | 
 8 | # ipython checkpoints
 9 | **/.ipynb_checkpoints/
10 | 
11 | # data and model files
12 | */data
13 | */models
14 | */results
15 | config/
16 | 
17 | examples/
18 | 
19 | # .pyc files
20 | *.pyc
21 | 
22 | # generated images, scripts and result files
23 | *.png
24 | *.sh
25 | *.csv
26 | runners/experiments/
27 | 
--------------------------------------------------------------------------------
/dbn/resizeImages.m:
--------------------------------------------------------------------------------
 1 | function [ imMatrix ] = resizeImages( dataMatrix, oldHt, oldWt, newHt, newWt )
 2 | %RESIZEIMAGES Resize a matrix of row-vectorised images
 3 | %   Each row of dataMatrix holds one oldHt-by-oldWt image; each output row holds the same image resized to newHt-by-newWt.
 4 | [noIm, ~] = size(dataMatrix);
 5 | imMatrix = zeros(noIm, newWt * newHt);
 6 | 
 7 | for i = 1:noIm
 8 |     img = reshape(dataMatrix(i,:), oldHt, oldWt);
 9 |     img = imresize(img, [newHt, newWt]);
10 |     imMatrix(i,:) = reshape(img, 1, newHt * newWt);
11 | end
12 | 
13 | end
--------------------------------------------------------------------------------
/dbn/extractNN.m:
--------------------------------------------------------------------------------
 1 | function [w1,w2,w3,w4,w5,w6,w7,w8,b1,b2,b3,b4,b5,b6,b7,b8] = extractNN( nn )
 2 | %EXTRACTNN Unpack the weights and biases of an 8-layer network structure
 3 | %   Returns each layer's weight matrix and bias vector as separate outputs, e.g. for saving to a .mat file.
 4 | w1 = nn.W{1,1};
 5 | w2 = nn.W{1,2};
 6 | w3 = nn.W{1,3};
 7 | w4 = nn.W{1,4};
 8 | w5 = nn.W{1,5};
 9 | w6 = nn.W{1,6};
10 | w7 = nn.W{1,7};
11 | w8 = nn.W{1,8};
12 | b1 = nn.biases{1,1};
13 | b2 = nn.biases{1,2};
14 | b3 = nn.biases{1,3};
15 | b4 = nn.biases{1,4};
16 | b5 = nn.biases{1,5};
17 | b6 = nn.biases{1,6};
18 | b7 = nn.biases{1,7};
19 | b8 = nn.biases{1,8};
20 | end
21 | 
22 | 
--------------------------------------------------------------------------------
/custom/nonlinearities.py:
--------------------------------------------------------------------------------
 1 | from lasagne.nonlinearities import *
 2 | 
 3 | 
 4 | def select_nonlinearity(string):
 5 |     nonlinearities = {'rectify': rectify,
 6 |                       'sigmoid': sigmoid,
 7 |                       'leaky_rectify': leaky_rectify,
 8 |                       'very_leaky_rectify': very_leaky_rectify,
 9 |                       'tanh': tanh,
10 |                       'linear': linear,
11 |                       'softmax': softmax,
12 |                       'softplus': softplus,
13 |                       'elu': elu,
14 |                       'scaled_tanh': ScaledTanh,
15 |                       'identity': identity}
16 |     return nonlinearities[string]
17 | 
--------------------------------------------------------------------------------
/utils/regularization.py:
--------------------------------------------------------------------------------
 1 | def early_stop(cost_window):
 2 |     """Stop when the validation cost has strictly increased across the whole window."""
 3 |     if len(cost_window) < 2:
 4 |         return False
 5 |     curr = cost_window[0]
 6 |     for idx, cost in enumerate(cost_window):
 7 |         if curr < cost or idx == 0:
 8 |             curr = cost
 9 |         else:
10 |             return False
11 |     return True
12 | 
13 | 
14 | def early_stop2(cost_window, min_val_cost, threshold):
15 |     """Stop when at least `threshold` costs in the window exceed the best validation cost seen."""
16 |     if len(cost_window) < 2:
17 |         return False
18 |     count = 0
19 |     for cost in cost_window:
20 |         if cost > min_val_cost:
21 |             count += 1
22 |     return count >= threshold
--------------------------------------------------------------------------------
/oulu/playvid.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import numpy as np
 3 | import cv2
 4 | 
 5 | for idx in range(31, 61):
 6 |     videofile = '../examples/data/s30_v1_u{}.mp4'.format(idx)
 7 |     print('video file: {}'.format(videofile))
 8 |     cap = cv2.VideoCapture(videofile)
 9 | 
10 |     while cap.isOpened():
11 |         ret, frame = cap.read()
12 |         if ret:
13 |             frame = cv2.resize(frame, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_LINEAR)
14 |             gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
15 |             cv2.imshow('frame', gray)
16 |         else:
17 |             break
18 |         if cv2.waitKey(1) & 0xFF == ord('q'):
19 |             break
20 |     cap.release()
21 | cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/dbn/deltas.m:
--------------------------------------------------------------------------------
 1 | function d = deltas(x, w)
 2 | % D = deltas(X,W)  Calculate the deltas (derivatives) of a sequence
 3 | %    Use a W-point window (W odd, default 9) to calculate deltas using a
 4 | %    simple linear slope.  This mirrors the delta calculation performed
 5 | %    in feacalc etc.  Each row of X is filtered separately.
 6 | % 2003-06-30 dpwe@ee.columbia.edu
 7 | 
 8 | if nargin < 2
 9 |     w = 9;
10 | end
11 | 
12 | [nr,nc] = size(x);
13 | 
14 | % Define window shape
15 | hlen = floor(w/2);
16 | w = 2*hlen + 1;
17 | win = hlen:-1:-hlen;
18 | 
19 | % pad data by repeating first and last columns
20 | xx = [repmat(x(:,1),1,hlen),x,repmat(x(:,end),1,hlen)];
21 | 
22 | % Apply the delta filter
23 | d = filter(win, 1, xx, [], 2);  % filter along dim 2 (rows)
24 | 
25 | % Trim edges
26 | d = d(:,2*hlen + [1:nc]);
27 | 
28 | 
--------------------------------------------------------------------------------
/modelzoo/pretrained_encoder.py:
--------------------------------------------------------------------------------
 1 | from lasagne.layers import DenseLayer
 2 | 
 3 | 
 4 | def create_pretrained_encoder(incoming, weights, biases, shapes, nonlinearities, names):
 5 |     encoder = DenseLayer(incoming, shapes[0], W=weights[0], b=biases[0], nonlinearity=nonlinearities[0], name=names[0])
 6 |     for i, num_units in enumerate(shapes[1:], 1):
 7 |         encoder = DenseLayer(encoder, shapes[i], W=weights[i], b=biases[i],
 8 |                              nonlinearity=nonlinearities[i], name=names[i])
 9 |     return encoder
10 | 
11 | 
12 | def create_encoder(incoming, shapes, nonlinearities, names):
13 |     encoder = DenseLayer(incoming, shapes[0], nonlinearity=nonlinearities[0], name=names[0])
14 |     for i, num_units in enumerate(shapes[1:], 1):
15 |         encoder = DenseLayer(encoder, shapes[i], nonlinearity=nonlinearities[i], name=names[i])
16 |     return encoder
17 | 
--------------------------------------------------------------------------------
/dbn/DCT_Features.m:
--------------------------------------------------------------------------------
 1 | function features = DCT_Features(ROIs,NumberOfCoefs2Keep,visualize)
 2 | % Extract plain DCT features for given ROIs.
 3 | % Coefs are computed over the whole ROIs (non-block approach).
 4 | % Keep 2:NumberOfCoefs2Keep+1 zig-zag arranged coefs.
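%
% A minimal usage sketch (sizes and variable names are illustrative, not
% taken from this repo):
%   rois  = rand(30, 40, 5);            % stack of five 30x40 mouth ROIs
%   feats = DCT_Features(rois, 30, 0);  % -> 5x30 matrix of zig-zag DCT coefs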
5 | 6 | 7 | if nargin<3 || isempty(visualize) 8 | visualize = 0; 9 | end 10 | nFrames = size(ROIs,3); 11 | % Initialization of zigzag vectors 12 | features = zeros(nFrames,NumberOfCoefs2Keep); 13 | 14 | if visualize == 1 15 | figure; 16 | end 17 | 18 | for i=1:nFrames 19 | CurrentFrame = ROIs(:,:,i); 20 | DCTImage = dct2(CurrentFrame); 21 | DCTzigzagVector = zigzag(DCTImage); 22 | features(i,:) = DCTzigzagVector(2:NumberOfCoefs2Keep+1); 23 | if visualize == 1 24 | imshow(DCTImage,[]), colormap(jet(64)) 25 | drawnow 26 | pause(0.04) 27 | end 28 | clear DCTImage DCTzigzagvector; 29 | end 30 | 31 | 32 | end 33 | 34 | -------------------------------------------------------------------------------- /dbn/computeStates.m: -------------------------------------------------------------------------------- 1 | function states = computeStates(layerType, probs, data) 2 | % computeStates - Computes states of hidden/visible layer of an RBM 3 | 4 | % INPUTS 5 | % layerType: activation function of given layer, e.g. 'sigm', 'linear', 6 | % 'ReLu' 7 | 8 | % probs: activation matrix, noExamples x noNeurons 9 | 10 | % data: data matrix, it's the input to the neurons, noExamples x noNeurons 11 | 12 | % OUTPUTS 13 | % states: states matrix, noExamples x noNeurons 14 | 15 | 16 | [numExamples,numHid] = size(probs); 17 | 18 | if strcmpi(layerType,'sigm') 19 | 20 | states = probs > rand(numExamples,numHid); 21 | 22 | elseif strcmpi(layerType,'linear') 23 | 24 | states = probs + randn(numExamples,numHid); 25 | 26 | elseif strcmpi(layerType,'ReLu') 27 | 28 | 29 | sigma = 1./(1 + exp(-data)); 30 | noise = sigma .* randn(numExamples, numHid); 31 | states = max(0,data + noise); 32 | 33 | end 34 | 35 | 36 | -------------------------------------------------------------------------------- /dbn/normaliseData.m: -------------------------------------------------------------------------------- 1 | function [data,PS] = normaliseData(trFcn, data, PS) 2 | 3 | % in case of linear visible layer it is recommended by Hinton in "A practical guide 4 | %to training RBMs" to make each dimension of the feature vector to have 5 | %zero mean and unit standard deviation. 6 | if strcmpi(trFcn, 'linear') 7 | 8 | if isempty(PS) 9 | ymean = 0; 10 | ystd = 1; 11 | [data,PS] = mapstd(data,ymean,ystd); 12 | else 13 | data = mapstd('apply',data,PS); 14 | 15 | end 16 | % [data,PS] = mapstd(data',ymean,ystd); 17 | % data = data'; 18 | 19 | % each image is zero normalised and divided by the std over all pixers over 20 | % all images 21 | % s = std(data(:)); 22 | % 23 | % [dataTemp,PS] = mapstd(data,ymean,ystd); 24 | % PS.xstd = repmat(s,size(data, 1),1); 25 | % [data,PS] = mapstd('apply',data,PS); 26 | 27 | 28 | 29 | % in case the activation function of the visible layer is "sigm" i.e. data 30 | % are binary, then simply divide by the max value so the data are in the 31 | % range [0, 1]. 
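% For example (illustrative values, not from this codebase), 'sigm' mode
% simply divides by the global maximum:
%   normaliseData('sigm', [0 128 255], [])   % -> [0 0.502 1]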
32 | elseif strcmpi(trFcn, 'sigm') 33 | data = data/max(data(:)); %255; 34 | end -------------------------------------------------------------------------------- /dbn/RBMup.m: -------------------------------------------------------------------------------- 1 | function [activations, states] = RBMup(data, weights, hidbiases, hL_type) 2 | % RBMup - Computes activations and states of RBM's hidden layer 3 | 4 | % INPUTS 5 | % data: data matrix, noExamples x noDimensions 6 | 7 | % weights: matrix containing the RBM weights, noVisibleUnits x 8 | % noHiddenUNits 9 | 10 | %hidbiases: biases of hidden layer, 1 x NoVisibleNeurons 11 | 12 | % hL_type: activation function of hidden layer, e.g. 'sigm', 'linear', 13 | % 'ReLu' 14 | 15 | % OUTPUTS 16 | % activations: activation matrix, noExamples x noNeurons (hidden neurons) 17 | 18 | % states: states of hidden neurons, noExamples x noNeurons (hidden neurons) 19 | 20 | [numExamples numDims] = size(data); 21 | 22 | % input to hidden neurons - batchSize x noHidden neurons, each row 23 | % contains the input to the hidden units 24 | hidInp = data * weights; 25 | 26 | % create biases matrix 27 | hidBiasesMatrx = repmat(hidbiases,numExamples,1); 28 | 29 | finalHidInp = hidInp + hidBiasesMatrx; 30 | 31 | % contains activations of hidden units, batchSize x noHidden neurons 32 | activations = computeActivations(hL_type, finalHidInp); 33 | 34 | % compute hidden states 35 | states = computeStates(hL_type, activations, finalHidInp); -------------------------------------------------------------------------------- /utils/io.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import scipy.io as sio 3 | import lasagne as las 4 | sys.path.insert(0, '../') 5 | try: 6 | import cPickle as pickle 7 | except: 8 | import pickle 9 | 10 | 11 | def read_data_split_file(path, sep=','): 12 | with open(path) as f: 13 | subjects = f.readline().split(sep) 14 | subjects = [int(s) for s in subjects] 15 | return subjects 16 | 17 | 18 | def load_mat_file(path): 19 | """ 20 | Loads .mat file 21 | :param path: path to .mat file 22 | :return: dictionary containing .mat data 23 | """ 24 | return sio.loadmat(path) 25 | 26 | 27 | def save_mat(dict, path): 28 | print('save matlab file...') 29 | sio.savemat(path, dict) 30 | 31 | 32 | def save_model(model, path): 33 | pickle.dump(model, open(path, 'wb')) 34 | 35 | 36 | def load_model(path): 37 | return pickle.load(open(path, 'rb')) 38 | 39 | 40 | def save_model_params(network, path): 41 | all_param_values = las.layers.get_all_param_values(network) 42 | pickle.dump(all_param_values, open(path, 'wb')) 43 | 44 | 45 | def load_model_params(network, path): 46 | all_param_values = pickle.load(open(path, 'rb')) 47 | las.layers.set_all_param_values(network, all_param_values) 48 | return network 49 | -------------------------------------------------------------------------------- /dbn/RBMdown.m: -------------------------------------------------------------------------------- 1 | function [activations, states] = RBMdown(data, weights, visbiases, vL_type) 2 | % RBMdown - Computes activations and states of RBM's hidden layer 3 | 4 | % INPUTS 5 | % data: data matrix, noExamples x noDimensions 6 | 7 | % weights: matrix containing the RBM weights, noVisibleUnits x 8 | % noHiddenUNits 9 | 10 | % visbiases: biases of visible layer, 1 x NoVisibleNeurons 11 | 12 | % vL_type: activation function of visible layer, e.g. 
'sigm', 'linear', 13 | % 'ReLu' 14 | 15 | % OUTPUTS 16 | % activations: activation matrix, noExamples x noNeurons (visible neurons) 17 | 18 | % states: states of visible neurons, noExamples x noNeurons (visible neurons) 19 | 20 | 21 | % batchSize x noDims, each row contains one example generated from the 22 | % hidden states through backpopagating their states multiplied by the 23 | % weights 24 | numExamples = size(data, 1); 25 | 26 | inpFromHidden = data * weights'; 27 | 28 | visBiasesMatrix = repmat(visbiases,numExamples,1); 29 | 30 | finalVisInput = inpFromHidden + visBiasesMatrix; 31 | 32 | %activations of visible units 33 | activations = computeActivations(vL_type, finalVisInput); 34 | 35 | % compute visible states 36 | states = computeStates(vL_type, activations, finalVisInput); 37 | -------------------------------------------------------------------------------- /dbn/unfoldDBNtoNN.m: -------------------------------------------------------------------------------- 1 | function nn = unfoldDBNtoNN(dbnParams, dbn, outputSize) 2 | % unfoldDBNtoNN - Unfolds DBN to NN 3 | 4 | % INPUTS 5 | % dbnParams: structure containing the DBN params, see manual for more 6 | % details 7 | 8 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 9 | % the visible biases (visbiases) for each RBM layer 10 | 11 | % outputSize: size of output layer 12 | 13 | % OUTPUTS 14 | % nn: neural network structure, see manual for details 15 | 16 | 17 | if dbnParams.type == 1 % AE 18 | 19 | 20 | disp('Unfolding DBN to AE') 21 | 22 | [weightsAE, biasesAE, newActivationFunctions, newLayers] = unfoldDBNtoAE(dbnParams, dbn, outputSize); 23 | % nn = paramsNNinit(newLayers, newActivationFunctions); 24 | nn.activationFunctions = newActivationFunctions; 25 | nn.layers = newLayers; 26 | nn.W = weightsAE; 27 | nn.biases = biasesAE; 28 | 29 | 30 | elseif dbnParams.type == 2 % classification 31 | 32 | disp('Unfolding DBN to Classifier') 33 | 34 | [weightsClsf, biasesClsf, newActivationFunctions, newLayers] = unfoldDBNToClsf(dbnParams, dbn, outputSize); 35 | nn = paramsNNinit(newLayers, newActivationFunctions); 36 | nn.W = weightsClsf; 37 | nn.biases = biasesClsf; 38 | 39 | end 40 | 41 | 42 | nn.pretraining = 1; 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /dbn/computeActivations.m: -------------------------------------------------------------------------------- 1 | function activations = computeActivations(layerType, data) 2 | % computeActivations - Computes activations of a hidden or output layer 3 | 4 | % INPUTS 5 | % layerType: activation function of given layer, e.g. 
'sigm', 'linear', 6 | % 'ReLu' 7 | 8 | % data: data matrix, it's the input to the neurons, noExamples x noNeurons 9 | 10 | % OUTPUTS 11 | % activations: activation matrix, noExamples x noNeurons 12 | 13 | outputSize = size(data, 2); 14 | 15 | if strcmpi(layerType,'sigm') 16 | 17 | activations = 1./(1 + exp(-data)); 18 | 19 | elseif strcmpi(layerType,'tanh') 20 | 21 | activations = 2 * (1./(1 + exp(-2*data))) - 1; % tanh(z) = 2*sigm(2z) - 1 22 | 23 | elseif strcmpi(layerType,'linear') 24 | 25 | activations = data; 26 | 27 | elseif strcmpi(layerType,'ReLu') 28 | 29 | activations = max(0,data); 30 | 31 | elseif strcmpi(layerType, 'leakyReLu') 32 | 33 | activations = max(0.01 * data, data); 34 | 35 | elseif strcmpi(layerType, 'softplus') 36 | 37 | activations = log(1 + exp(data)); 38 | 39 | elseif strcmpi(layerType, 'softsign') 40 | 41 | activations = data ./ (1 + abs(data)); 42 | 43 | elseif strcmpi(layerType, 'softmax') 44 | 45 | activNominator = exp(data); 46 | sumActiv = sum(activNominator, 2); 47 | activations = activNominator ./ repmat(sumActiv, 1, outputSize); 48 | 49 | end -------------------------------------------------------------------------------- /dbn/visualiseHiddenLayerWeights.m: -------------------------------------------------------------------------------- 1 | function visualiseHiddenLayerWeights(weights,col,row,noImageRows) 2 | % visualiseHiddenLayerWeights - Visualises as an image the given weights 3 | 4 | % INPUTS 5 | % weights: weightMatrix, noInputs x noHiddenNeurons (first hidden layer, 6 | % since we usually visualise weights of the first hidden layer only) 7 | 8 | % col: number of image columns 9 | 10 | % row: number of image rows 11 | % The product of col and row must be equal to the number of inputs, i.e., 12 | % the number of rows of the weights matrix 13 | 14 | % noImageRows: number of image rows, i.e., if 10 then there will be 10 rows 15 | % of images where each row will contain floor(noHiddenNeurons / noImageRows) 16 | 17 | [inpSize, N] = size(weights); 18 | 19 | % find minimum/maximum weight value 20 | minValue = min(weights(:)); 21 | maxValue = max(weights(:)); 22 | 23 | % no images per Row 24 | noExPerRow = floor(N / noImageRows); 25 | 26 | img2Disp = cell(noImageRows, noExPerRow); 27 | 28 | 29 | for i = 1:noImageRows 30 | 31 | baseInd = (i - 1) * noExPerRow; 32 | 33 | for j = 1:noExPerRow 34 | 35 | selInd = baseInd + j; 36 | 37 | img = reshape(weights(:,selInd),row,col); 38 | 39 | img(:,end+1:end+3) = minValue; 40 | img(end+1:end+3,:) = minValue; 41 | 42 | img2Disp{i,j} = img; 43 | 44 | end 45 | 46 | end 47 | 48 | img2DispFinal = cell2mat(img2Disp); 49 | imagesc(img2DispFinal,[minValue,maxValue]); colormap gray; axis equal; axis off; 50 | 51 | 52 | drawnow; 53 | 54 | 55 | -------------------------------------------------------------------------------- /test/test_model_io.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import theano.tensor as T 4 | from lasagne.nonlinearities import rectify, linear 5 | from modelzoo import deltanet_majority_vote 6 | from utils.io import save_mat 7 | 8 | 9 | class TestModelIO(unittest.TestCase): 10 | def test_load_params(self): 11 | window = T.iscalar('theta') 12 | inputs1 = T.tensor3('inputs1', dtype='float32') 13 | mask = T.matrix('mask', dtype='uint8') 14 | network = deltanet_majority_vote.load_saved_model('../oulu/results/best_models/1stream_mfcc_w3s3.6.pkl', 15 | ([500, 200, 100, 50], [rectify, rectify, rectify, linear]), 16 | (None, 
None, 91), inputs1, (None, None), mask, 17 | 250, window, 10) 18 | d = deltanet_majority_vote.extract_encoder_weights(network, ['fc1', 'fc2', 'fc3', 'bottleneck'], 19 | [('w1', 'b1'), ('w2', 'b2'), ('w3', 'b3'), ('w4', 'b4')]) 20 | b = deltanet_majority_vote.extract_lstm_weights(network, ['f_blstm1', 'b_blstm1'], 21 | ['flstm', 'blstm']) 22 | expected_keys = ['w1', 'w2', 'w3', 'w4', 'b1', 'b2', 'b3', 'b4'] 23 | keys = d.keys() 24 | for k in keys: 25 | assert k in expected_keys 26 | assert type(d[k]) == np.ndarray 27 | save_mat(d, '../oulu/models/oulu_1stream_mfcc_w3s3.mat') 28 | 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /dbn/unfoldDBNToClsf.m: -------------------------------------------------------------------------------- 1 | function [weightsClsf, biasesClsf, newActivationFunctions newLayers] = unfoldDBNToClsf(dbnParams,dbn,outputSize) 2 | % unfoldDBNToClsf - Unfolds DBN to NN for classification purposes 3 | 4 | % INPUTS 5 | % dbnParams: structure containing the DBN params, see manual for more 6 | % details 7 | 8 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 9 | % the visible biases (visbiases) for each RBM layer 10 | 11 | % outputSize: size of output layer 12 | 13 | % OUTPUTS 14 | % weightsClsf: 1xN cell array, where N is the number of layers (hidden + output 15 | % layer), each cell contains the weights of the corresponding layer 16 | 17 | % biasesClsf: 1xN cell array, where N is the number of layers (hidden + output 18 | % layer), each cell contains the biases of the corresponding layer 19 | 20 | % newActivationFunctions: 1xN cell array, where N is the number of layers (hidden + output 21 | % layer), each cell contains the activation function of the corresponding layer 22 | 23 | % newLayers: 1xN vector, where N is the number of layers (hidden + output 24 | % layer), each entry contains the size of the corresponding layer 25 | 26 | % if classification then last layer is softmax 27 | newActivationFunctions = [dbnParams.hiddenActivationFunctions 'softmax']; 28 | 29 | newLayers = [dbnParams.hiddenLayers outputSize]; 30 | 31 | % initialise weights/biases of new layer 32 | % hinton in his code initialises the last layer like this 33 | % http://www.cs.toronto.edu/~hinton/MatlabForSciencePaper.html 34 | lastLayerW = 0.1*randn(newLayers(end - 1), outputSize); 35 | lastLayerBiases = 0.1*randn(1, outputSize); 36 | 37 | weightsClsf = [dbn.W lastLayerW]; 38 | biasesClsf = [dbn.hidbiases lastLayerBiases]; 39 | 40 | 41 | -------------------------------------------------------------------------------- /custom/objectives.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as tt 2 | 3 | 4 | def temporal_softmax_loss(x, y, mask): 5 | """ 6 | A temporal version of softmax loss for use in RNNs. We assume that we are 7 | making predictions over a vocabulary of size V for each timestep of a 8 | timeseries of length T, over a minibatch of size N. The input x gives scores 9 | for all vocabulary elements at all timesteps, and y gives the indices of the 10 | ground-truth element at each timestep. We use a cross-entropy loss at each 11 | timestep, summing the loss over all timesteps and averaging across the 12 | minibatch. 
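    For instance (illustrative sizes): with N=2 sequences of T=3 timesteps
    over a V=4 vocabulary, x has shape (2, 3, 4), y has shape (2, 3), and the
    returned loss is the cross-entropy averaged over the frames the mask keeps.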
13 | As an additional complication, we may want to ignore the model output at some 14 | timesteps, since sequences of different length may have been combined into a 15 | minibatch and padded with NULL tokens. The optional mask argument tells us 16 | which elements should contribute to the loss. 17 | Inputs: 18 | - x: Input scores, of shape (N, T, V) 19 | - y: Ground-truth indices, of shape (N, T) where each element is in the range 20 | 0 <= y[i, t] < V 21 | - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not 22 | the scores at x[i, t] should contribute to the loss. 23 | Returns a tuple of: 24 | - loss: Scalar giving loss 25 | """ 26 | 27 | N, T, V = x.shape 28 | 29 | x_flat = x.reshape((N * T, V)) 30 | y_flat = y.reshape((N * T,)) 31 | mask_flat = mask.reshape((N * T,)) 32 | total_frames = tt.sum(mask_flat) 33 | 34 | probs = tt.exp(x_flat - tt.max(x_flat, axis=1, keepdims=True)) 35 | probs /= tt.sum(probs, axis=1, keepdims=True) 36 | # loss = -tt.sum(mask_flat * tt.log(probs[tt.arange(N * T), y_flat])) / N 37 | loss = -tt.sum(mask_flat * tt.log(probs[tt.arange(N * T), y_flat])) / total_frames 38 | 39 | return loss 40 | -------------------------------------------------------------------------------- /dbn/dbnParamsInit.m: -------------------------------------------------------------------------------- 1 | function dbnParams = dbnParamsInit(type,hiddenActivationFunctions, hiddenLayers) 2 | % dbnParamsInit - Create Parameters for DBN 3 | 4 | % INPUTS 5 | % type: type of DBN to be trained, 1 is AE, 2 is classifier 6 | 7 | % hiddenActivationFunctions: 1xN cell array, where N is the number of 8 | % hidden layers, each cell contains the activation function ('sigm', 'linear', 'ReLu') of the 9 | % corresponding layer, e.g., {'sigm' 'sigm' 'sigm' 'sigm'} 10 | 11 | % hiddenLayers: 1xN vector, where N is the number of 12 | % hidden layers, each entry contains the size of the 13 | % corresponding hidden layer, e.g., [500 500 500 200] 14 | 15 | % OUTPUTS 16 | % dbnParams: structure which contains the dbnParams, see the manual for 17 | % more details 18 | 19 | rbmParams.epochs = 10; 20 | rbmParams.batchsize = 100; 21 | rbmParams.lrW = 0.1; % learningRate for weights 22 | rbmParams.lrVb = 0.1; % learningRate for visible biases 23 | rbmParams.lrHb = 0.1; % learningRate for hidden biases 24 | 25 | rbmParams.lrW_linear = 0.001; % learning for weights when one layer is linear 26 | rbmParams.lrVb_linear = 0.001; % learning for visible biases when one layer is linear 27 | rbmParams.lrHb_linear = 0.001; % learning for hidden biases when one layer is linear 28 | 29 | rbmParams.weightPenaltyL2 = 0.0002;% L2 regularisation 30 | 31 | rbmParams.initMomentum = 0.5; % initial momentum 32 | rbmParams.finalMomentum = 0.9; % final momentum 33 | 34 | rbmParams.momentumEpochThres = 5; %threshold after which the final momentum is used 35 | 36 | rbmParams.type = 1; %1 is what Hinton suggests in "A practical guide to training RBMs", 2 is consistent with theory 37 | %check myRBMtrain 38 | 39 | dbnParams.rbmParams = rbmParams; 40 | 41 | dbnParams.type = type; %1 is AE, 2 is classifier 42 | dbnParams.inputActivationFunction = 'sigm'; 43 | 44 | dbnParams.hiddenActivationFunctions = hiddenActivationFunctions; 45 | dbnParams.hiddenLayers = hiddenLayers; 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /utils/data_structures.py: -------------------------------------------------------------------------------- 1 | class circular_list(object): 
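    """Fixed-capacity FIFO buffer backed by a plain list.

    Usage sketch (consistent with test_circular_list below): pushing past the
    capacity drops the oldest element, so

        buf = circular_list(3)
        for x in (1, 2, 3, 4):
            buf.push(x)
        # buf[0], buf[1], buf[2] are now 2, 3, 4
    """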
2 | def __init__(self, size, init=None): 3 | self._data = [] 4 | self.MAX_SIZE = size 5 | if init is not None: 6 | for i in range(size): 7 | self._data.append(init) 8 | 9 | def push(self, item): 10 | """ 11 | push item to the tail 12 | :param item: item to insert 13 | :return: 14 | """ 15 | if len(self._data) == self.MAX_SIZE: 16 | # full we have to pop the oldest item (head) 17 | self._data.pop(0) 18 | self._data.append(item) 19 | 20 | def pop(self): 21 | """ 22 | pops the first item in the queue 23 | :return: head of queue 24 | """ 25 | if len(self._data) == 0: 26 | return None 27 | else: 28 | return self._data.pop(0) 29 | 30 | def __iter__(self): 31 | self.index = 0 32 | return self 33 | 34 | def next(self): 35 | if self.index == len(self._data): 36 | raise StopIteration 37 | else: 38 | self.index += 1 39 | return self._data[self.index - 1] 40 | 41 | def __getitem__(self, index): 42 | return self._data[index] 43 | 44 | def __setitem__(self, index, value): 45 | self._data[index] = value 46 | 47 | def __len__(self): 48 | return len(self._data) 49 | 50 | 51 | def test_circular_list(): 52 | clist = circular_list(5) 53 | clist.push(1) 54 | clist.push(2) 55 | clist.push(3) 56 | clist.push(4) 57 | clist.push(5) 58 | clist.push(6) 59 | clist.push(7) 60 | 61 | clist[1] = 8 62 | 63 | assert clist[0] == 3 64 | assert clist[1] == 8 65 | assert clist[2] == 5 66 | assert clist[3] == 6 67 | assert clist[4] == 7 68 | assert len(clist) == 5 69 | 70 | clist2 = circular_list(7, 'hello') 71 | for item in clist2: 72 | assert item == 'hello' 73 | 74 | 75 | if __name__ == '__main__': 76 | test_circular_list() 77 | -------------------------------------------------------------------------------- /modelzoo/lstm_classifier_majority_vote.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate 6 | from lasagne.nonlinearities import tanh 7 | from custom.layers import create_blstm, create_lstm 8 | 9 | 10 | def create_model(input_shape, input_var, mask_shape, mask_var, lstm_size=250, output_classes=26, 11 | w_init=las.init.GlorotUniform(), use_peepholes=False, use_blstm=True): 12 | gate_parameters = Gate( 13 | W_in=w_init, W_hid=w_init, 14 | b=las.init.Constant(0.)) 15 | cell_parameters = Gate( 16 | W_in=w_init, W_hid=w_init, 17 | # Setting W_cell to None denotes that no cell connection will be used. 18 | W_cell=None, b=las.init.Constant(0.), 19 | # By convention, the cell nonlinearity is tanh in an LSTM. 20 | nonlinearity=tanh) 21 | 22 | l_in = InputLayer(input_shape, input_var, 'input') 23 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 24 | 25 | symbolic_seqlen = l_in.input_var.shape[1] 26 | if use_blstm: 27 | f_lstm, b_lstm = create_blstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 28 | l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum') 29 | 30 | # reshape to (num_examples * seq_len, lstm_size) 31 | l_reshape = ReshapeLayer(l_sum, (-1, lstm_size), name='reshape') 32 | else: 33 | l_lstm = create_lstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 34 | l_reshape = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape') 35 | 36 | # Now, we can apply feed-forward layers as usual. 37 | # We want the network to predict a classification for the sequence, 38 | # so we'll use a the number of classes. 
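    # Shape sketch (names illustrative): with batch size N, sequence length T
    # and lstm_size H, l_reshape is (N*T, H); the dense softmax below maps it
    # to (N*T, output_classes) and l_out restores (N, T, output_classes),
    # i.e. one class distribution per frame for the majority vote.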
39 | l_softmax = DenseLayer( 40 | l_reshape, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 41 | 42 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 43 | return l_out 44 | -------------------------------------------------------------------------------- /dbn/trainDBN.m: -------------------------------------------------------------------------------- 1 | function [dbn, errorPerBatch errorPerSample] = trainDBN(dataMatrix, dbnParams) 2 | % trainDBN - Trains a DBN 3 | 4 | % INPUTS 5 | % dataMatrix: matrix containing the training examples, size: noExamples x 6 | % Dimensionality 7 | % dbnParams: structure containing the DBN params, see manual for more 8 | % details 9 | 10 | % OUTPUTS 11 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 12 | % the visible biases (visbiases) for each RBM layer 13 | 14 | % errorPerBatch: 1xN cell array where N is the number of hidden layers (= 15 | % the number of RBMs to train and stack). Each cell contains the average 16 | % minibatch error per epoch. If number of epochs is 100 then each cell will 17 | % be 1 x 100 18 | 19 | % errorPerSample: same as above but contains the average error per training 20 | % sample 21 | 22 | activationFunctionsAllLayers = [dbnParams.inputActivationFunction, dbnParams.hiddenActivationFunctions]; 23 | 24 | hiddenLayers = dbnParams.hiddenLayers; 25 | nHidLayers = length(hiddenLayers); 26 | 27 | for i = 1:nHidLayers 28 | 29 | noHidNeurons = hiddenLayers(i); 30 | [numExamples, numDims] = size(dataMatrix); 31 | 32 | fprintf(1,'Pretraining Layer %d with RBM: %d-%d \n',i, numDims,noHidNeurons); 33 | 34 | hLayer = activationFunctionsAllLayers(i + 1); % activation function of hidden layer 35 | vLayer = activationFunctionsAllLayers(i); % activation function of visible layer 36 | 37 | trFctnLayers = [vLayer hLayer]; 38 | 39 | % train RBM 40 | [rbm, errorPerBatch{i}, errorPerSample{i}] = trainRBM(dataMatrix, dbnParams, noHidNeurons, trFctnLayers); 41 | 42 | % save RBM weights to corresponding DBN layer 43 | dbn.W{i} = rbm.W; 44 | dbn.hidbiases{i} = rbm.hidbiases; 45 | dbn.visbiases{i} = rbm.visbiases; 46 | 47 | % compute RBMs hidden activations 48 | [posHidProbs, posHidStates] = RBMup(dataMatrix, rbm.W, rbm.hidbiases, hLayer); 49 | 50 | % and use them as new inputs for the following RBM 51 | dataMatrix = posHidProbs; 52 | 53 | end 54 | 55 | disp('DBN training done') -------------------------------------------------------------------------------- /dbn/exampleDBN_AE.m: -------------------------------------------------------------------------------- 1 | 2 | type = 1; % 1 is AE, 2 is classifier, 3 | 4 | 5 | 6 | % train_x = double(train_x(1:50000,:)); 7 | % train_y = double(train_y(1:50000,:)); 8 | 9 | train_x = dataMatrix; 10 | %train_x = trData; %vertcat(trData, valData, testData); 11 | % train_x = cat(1, testDataResized, trainDataResized); 12 | 13 | 14 | inputSize = size(train_x,2); 15 | 16 | if type == 1 % AE 17 | outputSize = inputSize; % in case of AE it should be equal to the number of inputs 18 | 19 | %if type = 1, i.e., AE then the last layer should be linear and usually a 20 | % series of decreasing layers are used 21 | hiddenActivationFunctions = {'ReLu','ReLu','ReLu','linear'};%{'sigm','sigm','sigm','linear'}; 22 | hiddenLayers = [200 100 50 20]; 23 | 24 | elseif type == 2 % classifier 25 | outputSize = size(train_y,2); % in case of classification it should be equal to the number of classes 26 | 27 | hiddenActivationFunctions = 
{'sigm','sigm','sigm'};%{'ReLu','ReLu','ReLu','ReLu'};% 28 | hiddenLayers = [500 500 1000 ]; % hidden layers sizes, does not include input or output layers 29 | 30 | end 31 | 32 | % parameters used for visualisation of first layer weights 33 | visParams.noExamplesPerSubplot = 50; % number of images to show per row 34 | visParams.noSubplots = floor(hiddenLayers(1) / visParams.noExamplesPerSubplot); 35 | visParams.col = 45; %44;% number columns of image 36 | visParams.row = 30; %26 number rows of image 37 | 38 | 39 | 40 | dbnParams = dbnParamsInit(type, hiddenActivationFunctions, hiddenLayers); 41 | dbnParams.inputActivationFunction = 'linear'; %sigm for binary inputs, linear for continuous input 42 | dbnParams.rbmParams.epochs = 20; 43 | 44 | % normalise data 45 | train_x = normaliseData(dbnParams.inputActivationFunction, train_x,[]); 46 | 47 | % train Deep Belief Network 48 | [dbn, errorPerBatch, errorPerSample] = trainDBN(train_x, dbnParams); 49 | 50 | % visualise weights of first layer 51 | % visualiseHiddenLayerWeights(dbn.W{1},visParams.col,visParams.row,visParams.noSubplots); 52 | 53 | nn = unfoldDBNtoNN(dbnParams, dbn, outputSize); 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /modelzoo/deltanet_v1.py: -------------------------------------------------------------------------------- 1 | import lasagne as las 2 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ReshapeLayer, ElemwiseSumLayer 3 | from lasagne.layers import Gate 4 | from lasagne.nonlinearities import tanh 5 | from custom.layers import create_blstm, DeltaLayer, create_lstm 6 | 7 | 8 | def create_model(input_shape, input_var, mask_shape, mask_var, window, lstm_size=250, output_classes=26, 9 | w_init=las.init.GlorotUniform(), use_peepholes=False, use_blstm=True): 10 | gate_parameters = Gate( 11 | W_in=w_init, W_hid=w_init, 12 | b=las.init.Constant(0.)) 13 | cell_parameters = Gate( 14 | W_in=w_init, W_hid=w_init, 15 | # Setting W_cell to None denotes that no cell connection will be used. 16 | W_cell=None, b=las.init.Constant(0.), 17 | # By convention, the cell nonlinearity is tanh in an LSTM. 18 | nonlinearity=tanh) 19 | 20 | l_in = InputLayer(input_shape, input_var, 'input') 21 | l_mask = InputLayer(mask_shape, mask_var, name='mask') 22 | 23 | symbolic_seqlen = l_in.input_var.shape[1] 24 | l_delta = DeltaLayer(l_in, window, name='delta') 25 | 26 | if use_blstm: 27 | f_lstm, b_lstm = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 28 | l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum') 29 | # reshape to (num_examples * seq_len, lstm_size) 30 | l_reshape = ReshapeLayer(l_sum, (-1, lstm_size), name='reshape') 31 | else: 32 | l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes) 33 | l_reshape = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape') 34 | 35 | # Now, we can apply feed-forward layers as usual. 36 | # We want the network to predict a classification for the sequence, 37 | # so we'll use a the number of classes. 
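    # Usage sketch (values illustrative): for 30x40 mouth ROIs flattened to
    # 1200-d frame vectors,
    #   net = create_model((None, None, 1200), input_var, (None, None),
    #                      mask_var, window, lstm_size=250, output_classes=26)
    # where `window` is a Theano iscalar giving the delta-window size; the
    # output holds per-frame class scores of shape (batch, seq_len, output_classes).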
38 | l_softmax = DenseLayer( 39 | l_reshape, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 40 | 41 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 42 | return l_out 43 | -------------------------------------------------------------------------------- /dbn/unfoldDBNtoAE.m: -------------------------------------------------------------------------------- 1 | function [weightsAE, biasesAE, newActivationFunctions, newLayers] = unfoldDBNtoAE(dbnParams, dbn, outputSize) 2 | % unfoldDBNtoAE - Unfolds DBN to an autoencoder NN 3 | 4 | % INPUTS 5 | % dbnParams: structure containing the DBN params, see manual for more 6 | % details 7 | 8 | % dbn: structure which contains the weights (W), the hidden biases (hidbiases) and 9 | % the visible biases (visbiases) for each RBM layer 10 | 11 | % outputSize: size of output layer 12 | 13 | % OUTPUTS 14 | % weightsAE: 1xN cell array, where N is the number of layers (hidden + output 15 | % layer), each cell contains the weights of the corresponding layer 16 | 17 | % biasesAE: 1xN cell array, where N is the number of layers (hidden + output 18 | % layer), each cell contains the biases of the corresponding layer 19 | 20 | % newActivationFunctions: 1xN cell array, where N is the number of layers (hidden + output 21 | % layer), each cell contains the activation function of the corresponding layer 22 | 23 | % newLayers: 1xN vector, where N is the number of layers (hidden + output 24 | % layer), each entry contains the size of the corresponding layer 25 | 26 | noLayers = length(dbnParams.hiddenLayers); 27 | 28 | % create encoding layers 29 | weightsAE = dbn.W; 30 | biasesAE = dbn.hidbiases; 31 | inputSize = size(dbn.W{1},1); 32 | 33 | if inputSize ~= outputSize 34 | error('Input size is different that output size. 
In an AE they should have the same size') 35 | end 36 | 37 | ind = 1; 38 | % create decoding layers, where weights/biases are mirrored from the 39 | % encoding layer 40 | for i = noLayers + 1:2*noLayers 41 | 42 | index = i - ind; 43 | weightsAE{i} = dbn.W{index}'; 44 | biasesAE{i} = dbn.visbiases{index}; 45 | 46 | ind = ind + 2; 47 | 48 | end 49 | 50 | % create new activation functions (activFcn from encoding layer + same 51 | % activFcn flipped for decoding layer + outputActivFcn same as inputActivFcn 52 | newActivationFunctions = [dbnParams.hiddenActivationFunctions fliplr(dbnParams.hiddenActivationFunctions(1:end-1)) dbnParams.inputActivationFunction]; 53 | % same as above for hidden layers 54 | newLayers = [dbnParams.hiddenLayers fliplr(dbnParams.hiddenLayers(1:end-1)) outputSize]; 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /modelzoo/autoencoder.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | import scipy.io as sio 9 | 10 | 11 | def load_dbn(path='models/avletters_ae.mat'): 12 | """ 13 | load a pretrained dbn from path 14 | :param path: path to the .mat dbn 15 | :return: pretrained deep belief network 16 | """ 17 | # create the network using weights from pretrain_nn.mat 18 | nn = sio.loadmat(path) 19 | w = [] 20 | b = [] 21 | w.append(nn['w1']) 22 | w.append(nn['w2']) 23 | w.append(nn['w3']) 24 | w.append(nn['w4']) 25 | w.append(nn['w5']) 26 | w.append(nn['w6']) 27 | w.append(nn['w7']) 28 | w.append(nn['w8']) 29 | b.append(nn['b1'][0]) 30 | b.append(nn['b2'][0]) 31 | b.append(nn['b3'][0]) 32 | b.append(nn['b4'][0]) 33 | b.append(nn['b5'][0]) 34 | b.append(nn['b6'][0]) 35 | b.append(nn['b7'][0]) 36 | b.append(nn['b8'][0]) 37 | return w, b 38 | 39 | 40 | def create_model(incoming, weights, biases, activations, layersizes): 41 | """ 42 | Create an autoencoder given pretrained weights and activations 43 | :param: incoming: incoming layer (input layer) 44 | :param weights: layer weights 45 | :param biases: layer biases 46 | :param activations: activation functions for each layer 47 | :param layersizes: num hidden units for each layer 48 | :return: autoencoder model 49 | """ 50 | for i, w in enumerate(weights): 51 | incoming = DenseLayer(incoming, layersizes[i], w, biases[i], activations[i], name='fc{}'.format(i + 1)) 52 | return incoming 53 | 54 | 55 | def create_pretrained_encoder(incoming, weights, biases, activations, layersizes): 56 | l_1 = DenseLayer(incoming, layersizes[0], W=weights[0], b=biases[0], nonlinearity=activations[0], name='fc1') 57 | l_2 = DenseLayer(l_1, layersizes[1], W=weights[1], b=biases[1], nonlinearity=activations[1], name='fc2') 58 | l_3 = DenseLayer(l_2, layersizes[2], W=weights[2], b=biases[2], nonlinearity=activations[2], name='fc3') 59 | l_4 = DenseLayer(l_3, layersizes[3], W=weights[3], b=biases[3], nonlinearity=activations[3], name='bottleneck') 60 | return l_4 -------------------------------------------------------------------------------- /dbn/zigzag.m: -------------------------------------------------------------------------------- 1 | function out=zigzag(in) 2 | % Zig-zag scanning 3 | % This function is used to rearrange a matrix of any size into a 1-D array 4 | % by 
implementing the ZIG-ZAG SCANNING procedure. 5 | % IN specifies the input matrix of any size 6 | % OUT is the resulting zig-zag scanned (1-D) vector 7 | % having length equal to the total number of elements in the 2-D input matrix 8 | % 9 | % As an example, 10 | % IN = [1 2 6 7 11 | % 3 5 8 11 12 | % 4 9 10 12]; 13 | % OUT = ZIGZAG(IN) 14 | % OUT= 15 | % 1 2 3 4 5 6 7 8 9 10 11 12 16 | 17 | % 18 | % 19 | % Oluwadamilola (Damie) Martins Ogunbiyi 20 | % University of Maryland, College Park 21 | % Department of Electrical and Computer Engineering 22 | % Communications and Signal Processing 23 | % 22-March-2010 24 | % Copyright 2009-2010 Black Ace of Diamonds. 25 | 26 | [num_rows num_cols]=size(in); 27 | 28 | % Initialise the output vector 29 | out=zeros(1,num_rows*num_cols); 30 | 31 | cur_row=1; cur_col=1; cur_index=1; 32 | 33 | % First element 34 | %out(1)=in(1,1); 35 | 36 | while cur_row<=num_rows & cur_col<=num_cols 37 | if cur_row==1 & mod(cur_row+cur_col,2)==0 & cur_col~=num_cols 38 | out(cur_index)=in(cur_row,cur_col); 39 | cur_col=cur_col+1; %move right at the top 40 | cur_index=cur_index+1; 41 | 42 | elseif cur_row==num_rows & mod(cur_row+cur_col,2)~=0 & cur_col~=num_cols 43 | out(cur_index)=in(cur_row,cur_col); 44 | cur_col=cur_col+1; %move right at the bottom 45 | cur_index=cur_index+1; 46 | 47 | elseif cur_col==1 & mod(cur_row+cur_col,2)~=0 & cur_row~=num_rows 48 | out(cur_index)=in(cur_row,cur_col); 49 | cur_row=cur_row+1; %move down at the left 50 | cur_index=cur_index+1; 51 | 52 | elseif cur_col==num_cols & mod(cur_row+cur_col,2)==0 & cur_row~=num_rows 53 | out(cur_index)=in(cur_row,cur_col); 54 | cur_row=cur_row+1; %move down at the right 55 | cur_index=cur_index+1; 56 | 57 | elseif cur_col~=1 & cur_row~=num_rows & mod(cur_row+cur_col,2)~=0 58 | out(cur_index)=in(cur_row,cur_col); 59 | cur_row=cur_row+1; cur_col=cur_col-1; %move diagonally left down 60 | cur_index=cur_index+1; 61 | 62 | elseif cur_row~=1 & cur_col~=num_cols & mod(cur_row+cur_col,2)==0 63 | out(cur_index)=in(cur_row,cur_col); 64 | cur_row=cur_row-1; cur_col=cur_col+1; %move diagonally right up 65 | cur_index=cur_index+1; 66 | 67 | elseif cur_row==num_rows & cur_col==num_cols %obtain the bottom right element 68 | out(end)=in(end); %end of the operation 69 | break %terminate the operation 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /oulu/preprocess_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | preprocess the images 3 | """ 4 | import sys 5 | sys.path.append('../') 6 | import argparse 7 | from utils.io import load_mat_file, save_mat 8 | from utils.preprocessing import normalize_input 9 | from utils.preprocessing import sequencewise_mean_image_subtraction, compute_diff_images 10 | from utils.plotting_utils import reshape_images_order 11 | 12 | 13 | def reorder_images(data, shape): 14 | data = reshape_images_order(data, shape) 15 | return data 16 | 17 | 18 | def samplewise_normalize(data): 19 | data = normalize_input(data) 20 | return data 21 | 22 | 23 | def remove_mean(data, vidlens): 24 | data = sequencewise_mean_image_subtraction(data, vidlens) 25 | return data 26 | 27 | 28 | def diff_image(data, vidlens): 29 | data = compute_diff_images(data, vidlens) 30 | return data 31 | 32 | 33 | def parse_options(): 34 | options = dict() 35 | options['remove_mean'] = False 36 | options['diff_image'] = False 37 | options['samplewise_norm'] = False 38 | options['no_reorder'] = False 39 | options['output'] = None 
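    # Example invocation (file names illustrative):
    #   python preprocess_images.py --samplewise_norm --remove_mean \
    #       --output data/prepped.mat data/allData_mouthROIs.mat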
40 | parser = argparse.ArgumentParser() 41 | parser.add_argument('--remove_mean', action='store_true', help='remove mean image') 42 | parser.add_argument('--diff_image', action='store_true', help='compute difference of image') 43 | parser.add_argument('--samplewise_norm', action='store_true', help='samplewise normalize') 44 | parser.add_argument('--no_reorder', action='store_true', help='disable data reordering from f to c') 45 | parser.add_argument('--output', help='write output to .mat file') 46 | parser.add_argument('input', nargs='+', help='input ouluvs2 .mat file to preprocess') 47 | args = parser.parse_args() 48 | if args.remove_mean: 49 | options['remove_mean'] = args.remove_mean 50 | if args.diff_image: 51 | options['diff_image'] = args.diff_image 52 | if args.samplewise_norm: 53 | options['samplewise_norm'] = args.samplewise_norm 54 | if args.no_reorder: 55 | options['no_reorder'] = args.no_reorder 56 | if args.output: 57 | options['output'] = args.output 58 | if args.input: 59 | options['input'] = args.input[0] 60 | return options 61 | 62 | 63 | def main(): 64 | options = parse_options() 65 | data = load_mat_file(options['input']) 66 | dataMatrix = data['dataMatrix'].astype('float32') 67 | vidlens = data['videoLengthVec'].reshape((-1,)) 68 | 69 | if not options['no_reorder']: 70 | dataMatrix = reorder_images(dataMatrix, (26, 44)) 71 | if options['samplewise_norm']: 72 | dataMatrix = samplewise_normalize(dataMatrix) 73 | if options['remove_mean']: 74 | dataMatrix = remove_mean(dataMatrix, vidlens) 75 | if options['diff_image']: 76 | dataMatrix = diff_image(dataMatrix, vidlens) 77 | 78 | data['dataMatrix'] = dataMatrix 79 | if options['output']: 80 | save_mat(data, options['output']) 81 | print('data prepared!') 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /avletters/preprocess_images.py: -------------------------------------------------------------------------------- 1 | """ 2 | realign images to c format from f format 3 | """ 4 | import sys 5 | sys.path.append('../') 6 | import argparse 7 | from utils.io import load_mat_file, save_mat 8 | from utils.preprocessing import resize_images, normalize_input, sequencewise_mean_image_subtraction, reorder_data 9 | from utils.preprocessing import compute_dct_features, concat_first_second_deltas 10 | from utils.preprocessing import compute_diff_images, apply_zca_whitening 11 | from utils.plotting_utils import visualize_images 12 | 13 | 14 | def resize(data): 15 | X = data['dataMatrix'] 16 | vidlens = data['videoLengthVec'].reshape((-1,)) 17 | X = resize_images(X) 18 | # X = apply_zca_whitening(X) 19 | visualize_images(X[800:864]) 20 | dct_feats = compute_dct_features(X, (30, 40), 30, method='zigzag') 21 | dct_feats = concat_first_second_deltas(dct_feats, vidlens) 22 | X = normalize_input(X) 23 | data['dataMatrix'] = X 24 | save_mat(data, 'data/resized.mat') 25 | d = dict() 26 | d['dctFeatures'] = dct_feats 27 | save_mat(d, 'data/dctFeat_AVLetters.mat') 28 | 29 | 30 | def remove_mean(data): 31 | X = data['dataMatrix'].astype('float32') 32 | vidlens = data['videoLengthVec'].reshape((-1,)) 33 | X = resize_images(X) 34 | X = sequencewise_mean_image_subtraction(X, vidlens) 35 | # X = apply_zca_whitening(X) 36 | X_fortran = reorder_data(X, (30, 40), 'c', 'f') 37 | dct_feats = compute_dct_features(X, (30, 40), 30, method='zigzag') 38 | dct_feats = concat_first_second_deltas(dct_feats, vidlens) 39 | d = dict() 40 | d['dctFeatures'] = dct_feats 41 | 
save_mat(d, 'data/dctFeat_mean_removed_AVLetters.mat') 42 | visualize_images(X[800:864]) 43 | # samplewise normalize 44 | X = normalize_input(X, centralize=True) 45 | data['dataMatrix'] = X 46 | data['dataMatrixF'] = X_fortran 47 | save_mat(data, 'data/resized_mean_removed.mat') 48 | 49 | 50 | def diff_image(data): 51 | X = data['dataMatrix'].astype('float32') 52 | vidlens = data['videoLengthVec'].reshape((-1,)) 53 | X = resize_images(X) 54 | X = apply_zca_whitening(X) 55 | # X = normalize_input(X) 56 | visualize_images(X[2000:2081]) 57 | X = compute_diff_images(X, vidlens) 58 | X = apply_zca_whitening(X) 59 | X = normalize_input(X) 60 | visualize_images(X[2000:2081]) 61 | data['dataMatrix'] = X 62 | save_mat(data, 'data/resized_diff_image_AVLetters.mat') 63 | 64 | 65 | def parse_options(): 66 | options = dict() 67 | options['operation'] = None 68 | parser = argparse.ArgumentParser() 69 | parser.add_argument('--operation', help='remove_mean, diff_image, resize') 70 | args = parser.parse_args() 71 | if args.operation: 72 | options['operation'] = args.operation 73 | return options 74 | 75 | 76 | def main(): 77 | options = parse_options() 78 | data = load_mat_file('data/allData_mouthROIs.mat') 79 | if options['operation'] == 'remove_mean': 80 | remove_mean(data) 81 | elif options['operation'] == 'diff_image': 82 | diff_image(data) 83 | elif options['operation'] == 'resize': 84 | resize(data) 85 | else: 86 | print('unknown operation') 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /test/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from utils.io import * 3 | from utils.preprocessing import * 4 | 5 | 6 | class TestPreprocessingMethods(unittest.TestCase): 7 | def test_forcealign(self): 8 | stream1 = load_mat_file('../oulu/data/allMouthROIsResized_frontal.mat') 9 | stream2 = load_mat_file('../oulu/data/mfcc_w3s3.mat') 10 | 11 | s1_data_matrix = stream1['dataMatrix'].astype('float32') 12 | s1_targets = stream1['targetsVec'].reshape((-1,)) 13 | s1_vidlens = stream1['videoLengthVec'].reshape((-1,)) 14 | s1_subjects = stream1['subjectsVec'].reshape((-1,)) 15 | 16 | s2_data_matrix = stream2['dataMatrix'].astype('float32') 17 | s2_targets = stream2['targetsVec'].reshape((-1,)) 18 | s2_vidlens = stream2['videoLengthVec'].reshape((-1,)) 19 | s2_subjects = stream2['subjectsVec'].reshape((-1,)) 20 | 21 | s1, s2 = force_align((s1_data_matrix, s1_targets, s1_vidlens), 22 | (s2_data_matrix, s2_targets, s2_vidlens)) 23 | 24 | s1_data_matrix, s1_targets, s1_vidlens = s1 25 | s2_data_matrix, s2_targets, s2_vidlens = s2 26 | 27 | assert len(s1_data_matrix) == len(s2_data_matrix) 28 | assert len(s1_targets) == len(s2_targets) 29 | assert np.sum(s1_vidlens) == np.sum(s2_vidlens) 30 | 31 | def test_multistream_forcealign(self): 32 | 33 | stream1 = load_mat_file('../oulu/data/allMouthROIsResized_frontal.mat') 34 | stream2 = load_mat_file('../oulu/data/allMouthROIsResized_frontal.mat') 35 | stream3 = load_mat_file('../oulu/data/dctFeats_meanrm_w2s1.mat') 36 | stream4 = load_mat_file('../oulu/data/mfcc_w3s3.mat') 37 | 38 | s1_data_matrix = stream1['dataMatrix'].astype('float32') 39 | s1_targets = stream1['targetsVec'].reshape((-1,)) 40 | s1_vidlens = stream1['videoLengthVec'].reshape((-1,)) 41 | s1_subjects = stream1['subjectsVec'].reshape((-1,)) 42 | 43 | s2_data_matrix = stream2['dataMatrix'].astype('float32') 44 | s2_targets = 
stream2['targetsVec'].reshape((-1,)) 45 | s2_vidlens = stream2['videoLengthVec'].reshape((-1,)) 46 | s2_subjects = stream2['subjectsVec'].reshape((-1,)) 47 | 48 | s3_data_matrix = stream3['dataMatrix'].astype('float32') 49 | s3_targets = stream3['targetsVec'].reshape((-1,)) 50 | s3_vidlens = stream3['videoLengthVec'].reshape((-1,)) 51 | s3_subjects = stream3['subjectsVec'].reshape((-1,)) 52 | 53 | s4_data_matrix = stream4['dataMatrix'].astype('float32') 54 | s4_targets = stream4['targetsVec'].reshape((-1,)) 55 | s4_vidlens = stream4['videoLengthVec'].reshape((-1,)) 56 | s4_subjects = stream4['subjectsVec'].reshape((-1,)) 57 | 58 | orig_streams = [ 59 | (s1_data_matrix, s1_targets, s1_vidlens), 60 | (s2_data_matrix, s2_targets, s2_vidlens), 61 | (s3_data_matrix, s3_targets, s3_vidlens), 62 | (s4_data_matrix, s4_targets, s4_vidlens) 63 | ] 64 | 65 | a = multistream_force_align(orig_streams) 66 | assert len(a[0][0]) == len(a[1][0]) == len(a[2][0]) == len(a[3][0]) 67 | assert len(a[0][1]) == len(a[1][1]) == len(a[2][1]) == len(a[3][1]) 68 | assert len(a[0][2]) == len(a[1][2]) == len(a[2][2]) == len(a[3][2]) 69 | 70 | if __name__ == '__main__': 71 | unittest.main() 72 | -------------------------------------------------------------------------------- /modelzoo/lstm_classifier_baseline.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | 9 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 10 | if cell_parameters is None: 11 | cell_parameters = Gate() 12 | if gate_parameters is None: 13 | gate_parameters = Gate() 14 | 15 | l_lstm = LSTMLayer( 16 | l_incoming, hidden_units, 17 | # We need to specify a separate input for masks 18 | mask_input=l_mask, 19 | # Here, we supply the gate parameters for each gate 20 | ingate=gate_parameters, forgetgate=gate_parameters, 21 | cell=cell_parameters, outgate=gate_parameters, 22 | # We'll learn the initialization and use gradient clipping 23 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 24 | 25 | return l_lstm 26 | 27 | 28 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 29 | 30 | if cell_parameters is None: 31 | cell_parameters = Gate() 32 | if gate_parameters is None: 33 | gate_parameters = Gate() 34 | 35 | l_lstm = LSTMLayer( 36 | l_incoming, hidden_units, 37 | # We need to specify a separate input for masks 38 | mask_input=l_mask, 39 | # Here, we supply the gate parameters for each gate 40 | ingate=gate_parameters, forgetgate=gate_parameters, 41 | cell=cell_parameters, outgate=gate_parameters, 42 | # We'll learn the initialization and use gradient clipping 43 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 44 | 45 | # The "backwards" layer is the same as the first, 46 | # except that the backwards argument is set to True. 
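    # The backward layer receives the same mask, so padded frames are ignored
    # in both directions; downstream (see create_model below) the two
    # directions are fused by element-wise summation rather than
    # concatenation, keeping the fused feature size at hidden_units.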
47 |     l_lstm_back = LSTMLayer(
48 |         l_incoming, hidden_units, ingate=gate_parameters,
49 |         mask_input=l_mask, forgetgate=gate_parameters,
50 |         cell=cell_parameters, outgate=gate_parameters,
51 |         learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name))
52 | 
53 |     return l_lstm, l_lstm_back
54 | 
55 | 
56 | def create_model(input_shape, input_var, mask_shape, mask_var, lstm_size=250, output_classes=26,
57 |                  w_init=las.init.Orthogonal()):
58 |     gate_parameters = Gate(
59 |         W_in=w_init, W_hid=w_init,
60 |         b=las.init.Constant(0.))
61 |     cell_parameters = Gate(
62 |         W_in=w_init, W_hid=w_init,
63 |         # Setting W_cell to None denotes that no cell connection will be used.
64 |         W_cell=None, b=las.init.Constant(0.),
65 |         # By convention, the cell nonlinearity is tanh in an LSTM.
66 |         nonlinearity=tanh)
67 | 
68 |     l_in = InputLayer(input_shape, input_var, 'input')
69 |     l_mask = InputLayer(mask_shape, mask_var, 'mask')
70 | 
71 |     f_lstm, b_lstm = create_blstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm')
72 | 
73 |     l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum')
74 |     l_forward_slice1 = SliceLayer(l_sum, -1, 1, name='slice1')
75 | 
76 |     # Now, we can apply feed-forward layers as usual.
77 |     # We want the network to predict a classification for the sequence,
78 |     # so we'll use the number of classes as the output size.
79 |     l_out = DenseLayer(
80 |         l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output')
81 | 
82 |     return l_out
83 | 
--------------------------------------------------------------------------------
/runners/extract_encoder_from_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../')
4 | import numpy as np
5 | import theano.tensor as T
6 | import argparse
7 | from modelzoo import deltanet_majority_vote
8 | from utils.io import save_mat
9 | from custom.nonlinearities import select_nonlinearity
10 | 
11 | 
12 | def parse_options():
13 |     options = dict()
14 |     options['config'] = '../cuave/config/1stream.ini'
15 |     options['shape'] = '2000,1000,500,50'
16 |     options['nonlinearities'] = 'rectify,rectify,rectify,linear'
17 |     options['input_dim'] = 1200
18 |     options['lstm_size'] = 250
19 |     options['output_classes'] = 26
20 |     options['use_blstm'] = False
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument('--shape', help='shape of encoder. Default: 2000,1000,500,50')
23 |     parser.add_argument('--input_dim', help='input dimension. Default: 1200')
24 |     parser.add_argument('--nonlinearities', help='nonlinearities used by encoder. '
25 |                         'Default: rectify,rectify,rectify,linear')
26 |     parser.add_argument('--output', help='output file to write results')
27 |     parser.add_argument('--lstm_size', help='lstm layer size. Default: 250')
28 |     parser.add_argument('--output_classes', help='number of output classes. Default: 26')
29 |     parser.add_argument('--use_blstm', help='use blstm')
30 |     parser.add_argument('input', help='input model.pkl file')
31 | 
32 |     args = parser.parse_args()
33 |     options['input'] = args.input
34 |     if args.shape:
35 |         options['shape'] = args.shape
36 |     if args.input_dim:
37 |         options['input_dim'] = int(args.input_dim)
38 |     if args.nonlinearities:
39 |         options['nonlinearities'] = args.nonlinearities
40 |     if args.lstm_size:
41 |         options['lstm_size'] = int(args.lstm_size)
42 |     if args.output_classes:
43 |         options['output_classes'] = int(args.output_classes)
44 |     if args.output:
45 |         options['output'] = args.output
46 |     if args.use_blstm:
47 |         options['use_blstm'] = True
48 |     return options
49 | 
50 | 
51 | def main():
52 |     options = parse_options()
53 |     print(options)
54 |     window = T.iscalar('theta')
55 |     inputs1 = T.tensor3('inputs1', dtype='float32')
56 |     mask = T.matrix('mask', dtype='uint8')
57 |     shape = [int(i) for i in options['shape'].split(',')]
58 |     nonlinearities = [select_nonlinearity(s) for s in options['nonlinearities'].split(',')]
59 |     network = deltanet_majority_vote.load_saved_model(options['input'],
60 |                                                       (shape, nonlinearities),
61 |                                                       (None, None, options['input_dim']), inputs1, (None, None), mask,
62 |                                                       options['lstm_size'], window, options['output_classes'],
63 |                                                       use_blstm=options['use_blstm'])
64 |     d = deltanet_majority_vote.extract_encoder_weights(network, ['fc1', 'fc2', 'fc3', 'bottleneck'],
65 |                                                        [('w1', 'b1'), ('w2', 'b2'), ('w3', 'b3'), ('w4', 'b4')])
66 |     expected_keys = ['w1', 'w2', 'w3', 'w4', 'b1', 'b2', 'b3', 'b4']
67 |     keys = d.keys()
68 |     for k in keys:
69 |         assert k in expected_keys
70 |         assert type(d[k]) == np.ndarray
71 |     if 'output' in options:
72 |         print('save extracted weights to {}'.format(options['output']))
73 |         save_mat(d, options['output'])
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     main()
78 | 
--------------------------------------------------------------------------------
/avletters2/prepare_data.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../')
4 | import argparse
5 | from utils.preprocessing import *
6 | from utils.io import *
7 | from utils.plotting_utils import *
8 | 
9 | 
10 | def parse_options():
11 |     options = dict()
12 |     options['remove_mean'] = False
13 |     options['diff_image'] = False
14 |     options['samplewise_norm'] = False
15 |     options['merge_samples'] = False
16 |     options['output'] = None
17 |     options['mergesize'] = 3
18 |     parser = argparse.ArgumentParser()
19 |     parser.add_argument('--remove_mean', action='store_true', help='remove mean image')
20 |     parser.add_argument('--diff_image', action='store_true', help='compute difference of image')
21 |     parser.add_argument('--samplewise_norm', action='store_true', help='samplewise normalize')
22 |     parser.add_argument('--reorder_data', help='reorder data from F to C convention. eg: 30,50')
23 |     parser.add_argument('--concat_deltas', help='concat 1st and 2nd deltas, default delta window: 2')
24 |     parser.add_argument('--embed_temporal_info', help='embed temporal info to features [window],[step]. 
ie: 3,1') 25 | parser.add_argument('--output', help='write output to .mat file') 26 | parser.add_argument('input', nargs='+', help='input data .mat file to preprocess') 27 | args = parser.parse_args() 28 | if args.remove_mean: 29 | options['remove_mean'] = args.remove_mean 30 | if args.diff_image: 31 | options['diff_image'] = args.diff_image 32 | if args.samplewise_norm: 33 | options['samplewise_norm'] = args.samplewise_norm 34 | if args.embed_temporal_info: 35 | options['embed_temporal_info'] = args.embed_temporal_info 36 | if args.reorder_data: 37 | options['reorder_data'] = args.reorder_data 38 | if args.output: 39 | options['output'] = args.output 40 | if args.input: 41 | options['input'] = args.input[0] 42 | if args.concat_deltas: 43 | options['concat_deltas'] = int(args.concat_deltas) 44 | return options 45 | 46 | 47 | def main(): 48 | options = parse_options() 49 | data = load_mat_file(options['input']) 50 | data_matrix = data['dataMatrix'].astype('float32') 51 | vid_len_vec = data['videoLengthVec'].astype('int').reshape((-1,)) 52 | targets_vec = data['targetsVec'].reshape((-1,)) 53 | 54 | if 'reorder_data' in options: 55 | imagesize = tuple([int(d) for d in options['reorder_data'].split(',')]) 56 | data_matrix = reorder_data(data_matrix, imagesize) 57 | if options['samplewise_norm']: 58 | data_matrix = normalize_input(data_matrix) 59 | if options['remove_mean']: 60 | data_matrix = sequencewise_mean_image_subtraction(data_matrix, vid_len_vec) 61 | if options['diff_image']: 62 | data_matrix = compute_diff_images(data_matrix, vid_len_vec) 63 | if 'embed_temporal_info' in options: 64 | window, step = tuple([int(d) for d in options['embed_temporal_info'].split(',')]) 65 | data_matrix, targets_vec, vid_len_vec = factorize(data_matrix, targets_vec, vid_len_vec, step, 0) 66 | data_matrix, targets_vec, vid_len_vec = embed_temporal_info(data_matrix, targets_vec, vid_len_vec, window, step) 67 | if 'concat_deltas' in options: 68 | data_matrix = concat_first_second_deltas(data_matrix, vid_len_vec, options['concat_deltas']) 69 | 70 | data['dataMatrix'] = data_matrix 71 | 72 | if 'embed_temporal_info' in options: 73 | data['videoLengthVec'] = vid_len_vec 74 | data['targetsVec'] = targets_vec 75 | 76 | if 'output' in options: 77 | save_mat(data, options['output']) 78 | # print(data.keys()) 79 | print('data prepared!') 80 | 81 | 82 | if __name__ == '__main__': 83 | main() -------------------------------------------------------------------------------- /modelzoo/deltanet.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, DenseLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate 6 | from lasagne.nonlinearities import tanh, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, create_blstm 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_model_using_pretrained_encoder(weights, biases, input_shape, input_var, mask_shape, mask_var, 13 | lstm_size=250, win=T.iscalar('theta'), output_classes=26, 14 | w_init_fn=las.init.Orthogonal(), 15 | use_peepholes=False, nonlinearities=rectify): 16 | gate_parameters = Gate( 17 | W_in=w_init_fn, W_hid=w_init_fn, 18 | b=las.init.Constant(0.)) 19 | cell_parameters = Gate( 20 | W_in=w_init_fn, W_hid=w_init_fn, 21 | # Setting W_cell to None denotes that no cell connection will be used. 
22 |         W_cell=None, b=las.init.Constant(0.),
23 |         # By convention, the cell nonlinearity is tanh in an LSTM.
24 |         nonlinearity=tanh)
25 | 
26 |     l_in = InputLayer(input_shape, input_var, 'input')
27 |     l_mask = InputLayer(mask_shape, mask_var, 'mask')
28 | 
29 |     symbolic_batchsize = l_in.input_var.shape[0]
30 |     symbolic_seqlen = l_in.input_var.shape[1]
31 | 
32 |     l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
33 |     l_encoder = create_pretrained_encoder(l_reshape1, weights, biases,
34 |                                           [2000, 1000, 500, 50],
35 |                                           [nonlinearities, nonlinearities, nonlinearities, linear],
36 |                                           ['fc1', 'fc2', 'fc3', 'bottleneck'])
37 |     encoder_len = las.layers.get_output_shape(l_encoder)[-1]
38 |     l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
39 |     l_delta = DeltaLayer(l_reshape2, win, name='delta')
40 | 
41 |     l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'bstm1',
42 |                                        use_peepholes)
43 | 
44 |     # We'll combine the forward and backward layer output by summing.
45 |     # Merge layers take in lists of layers to merge as input.
46 |     l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')
47 | 
48 |     l_forward_slice1 = SliceLayer(l_sum1, -1, 1, name='slice1')
49 | 
50 |     # Now, we can apply feed-forward layers as usual.
51 |     # We want the network to predict a classification for the sequence,
52 |     # so we'll use the number of classes as the output size.
53 |     l_out = DenseLayer(
54 |         l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output')
55 | 
56 |     return l_out
57 | 
58 | 
59 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var,
60 |                  lstm_size=250, win=T.iscalar('theta'),
61 |                  output_classes=26):
62 | 
63 |     dbn_layers = dbn.get_all_layers()
64 |     weights = []
65 |     biases = []
66 |     weights.append(dbn_layers[1].W.astype('float32'))
67 |     weights.append(dbn_layers[2].W.astype('float32'))
68 |     weights.append(dbn_layers[3].W.astype('float32'))
69 |     weights.append(dbn_layers[4].W.astype('float32'))
70 |     biases.append(dbn_layers[1].b.astype('float32'))
71 |     biases.append(dbn_layers[2].b.astype('float32'))
72 |     biases.append(dbn_layers[3].b.astype('float32'))
73 |     biases.append(dbn_layers[4].b.astype('float32'))
74 | 
75 |     return create_model_using_pretrained_encoder(weights, biases, input_shape, input_var, mask_shape, mask_var,
76 |                                                  lstm_size, win, output_classes)
77 | 
--------------------------------------------------------------------------------
/custom/updates.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | 
3 | import numpy as np
4 | import theano
5 | import theano.tensor as T
6 | import lasagne
7 | from lasagne import utils
8 | 
9 | 
10 | def generate_lr_map(params, lr_config, default):
11 |     """
12 |     generate a layerwise learning rate map.
13 |     To change the value of the learning rate at different epochs (e.g. for learning rate decay),
14 |     use a theano.shared variable: call shared.set_value() to set the value of the variable and
15 |     shared.get_value() to read it back.
16 |     Ensure the dtype of the shared learning rate variables is the same as the dtype of the
17 |     model weights.
18 |     Typically you can call lasagne.utils.floatX(0.001) to ensure this.
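    A minimal usage sketch (illustrative only; 'fc1' is a hypothetical layer name):

        lr_fc1 = theano.shared(utils.floatX(0.01))
        lr_map = generate_lr_map(params, {'fc1': lr_fc1}, utils.floatX(0.001))
        updates = adam_vlr(loss, params, lr_map)
        # ... later, decay that layer's learning rate in place:
        lr_fc1.set_value(utils.floatX(0.005))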
19 | 20 | :param params: model parameters 21 | :param lr_config: learning rate configuration map 22 | :param default: default value of learning rate if key not found for layer 23 | :return: learning rate map 24 | """ 25 | lr_map = {} 26 | for param in params: 27 | layer_name = param.name[:param.name.rfind('.')] 28 | if layer_name in lr_config: 29 | lr_map[param] = lr_config[layer_name] 30 | else: 31 | lr_map[param] = default 32 | return lr_map 33 | 34 | 35 | def adam_vlr(loss_or_grads, params, lr_map, beta1=0.9, 36 | beta2=0.999, epsilon=1e-8): 37 | """Adam updates with Variable Learning Rates 38 | 39 | Adam updates implemented as in [1]_. 40 | 41 | Parameters 42 | ---------- 43 | loss_or_grads : symbolic expression or list of expressions 44 | A scalar loss expression, or a list of gradient expressions 45 | params : list of shared variables 46 | The variables to generate update expressions for 47 | lr_map : dictionary of floats 48 | Learning rate map containing layer name and associated learning rate 49 | beta1 : float 50 | Exponential decay rate for the first moment estimates. 51 | beta2 : float 52 | Exponential decay rate for the second moment estimates. 53 | epsilon : float 54 | Constant for numerical stability. 55 | 56 | Returns 57 | ------- 58 | OrderedDict 59 | A dictionary mapping each parameter to its update expression 60 | 61 | Notes 62 | ----- 63 | The paper [1]_ includes an additional hyperparameter lambda. This is only 64 | needed to prove convergence of the algorithm and has no practical use 65 | (personal communication with the authors), it is therefore omitted here. 66 | 67 | References 68 | ---------- 69 | .. [1] Kingma, Diederik, and Jimmy Ba (2014): 70 | Adam: A Method for Stochastic Optimization. 71 | arXiv preprint arXiv:1412.6980. 72 | """ 73 | all_grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params) 74 | t_prev = theano.shared(utils.floatX(0.)) 75 | updates = OrderedDict() 76 | 77 | # Using theano constant to prevent upcasting of float32 78 | one = T.constant(1) 79 | 80 | t = t_prev + 1 81 | 82 | for param, g_t in zip(params, all_grads): 83 | a_t = lr_map[param]*T.sqrt(one-beta2**t)/(one-beta1**t) 84 | value = param.get_value(borrow=True) 85 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 86 | broadcastable=param.broadcastable) 87 | v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), 88 | broadcastable=param.broadcastable) 89 | 90 | m_t = beta1*m_prev + (one-beta1)*g_t 91 | v_t = beta2*v_prev + (one-beta2)*g_t**2 92 | step = a_t*m_t/(T.sqrt(v_t) + epsilon) 93 | 94 | updates[m_prev] = m_t 95 | updates[v_prev] = v_t 96 | updates[param] = param - step 97 | 98 | updates[t_prev] = t 99 | return updates 100 | -------------------------------------------------------------------------------- /modelzoo/avletters_convae.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, Deconv2DLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | 6 | 7 | def create_scaled_tanh(scale_in=0.5, scale_out=2.4): 8 | """ 9 | create a scaled hyperbolic tangent to avoid saturation given input range 10 | of {-1, 1}. Refer to 11 | :param scale_in: 12 | :param scale_out: 13 | :return: scaled hyperbolic tangent callable 14 | 15 | References 16 | ---------- 17 | .. 
[1] LeCun, Yann A., et al. (1998): 18 | Efficient BackProp, 19 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 20 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 21 | .. [2] Masci, Jonathan, et al. (2011): 22 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 23 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 24 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 25 | """ 26 | return ScaledTanh(scale_in, scale_out) 27 | 28 | 29 | def extract_encoder(network): 30 | pass 31 | 32 | 33 | def create_model(incoming, options): 34 | conv_num_filters1 = 100 35 | conv_num_filters2 = 150 36 | conv_num_filters3 = 200 37 | filter_size1 = 5 38 | filter_size2 = 5 39 | filter_size3 = 3 40 | pool_size = 2 41 | encode_size = options['BOTTLENECK'] 42 | dense_mid_size = options['DENSE'] 43 | pad_in = 'valid' 44 | pad_out = 'full' 45 | scaled_tanh = create_scaled_tanh() 46 | 47 | conv2d1 = Conv2DLayer(incoming, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 48 | maxpool2d2 = MaxPool2DLayer(conv2d1, pool_size=pool_size, name='maxpool2d2') 49 | conv2d3 = Conv2DLayer(maxpool2d2, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d3', nonlinearity=scaled_tanh) 50 | maxpool2d4 = MaxPool2DLayer(conv2d3, pool_size=pool_size, name='maxpool2d4', pad=(1,0)) 51 | conv2d5 = Conv2DLayer(maxpool2d4, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d5', nonlinearity=scaled_tanh) 52 | reshape6 = ReshapeLayer(conv2d5, shape=([0], -1), name='reshape6') # 3000 53 | reshape6_output = reshape6.output_shape[1] 54 | dense7 = DenseLayer(reshape6, num_units=dense_mid_size, name='dense7', nonlinearity=scaled_tanh) 55 | bottleneck = DenseLayer(dense7, num_units=encode_size, name='bottleneck', nonlinearity=linear) 56 | # print_network(bottleneck) 57 | dense8 = DenseLayer(bottleneck, num_units=dense_mid_size, W=bottleneck.W.T, name='dense8', nonlinearity=linear) 58 | dense9 = DenseLayer(dense8, num_units=reshape6_output, W=dense7.W.T, nonlinearity=scaled_tanh, name='dense9') 59 | reshape10 = ReshapeLayer(dense9, shape=([0], conv_num_filters3, 3, 5), name='reshape10') # 32 x 4 x 7 60 | deconv2d11 = Deconv2DLayer(reshape10, conv2d5.input_shape[1], conv2d5.filter_size, stride=conv2d5.stride, 61 | W=conv2d5.W, flip_filters=not conv2d5.flip_filters, name='deconv2d11', nonlinearity=scaled_tanh) 62 | upscale2d12 = Upscale2DLayer(deconv2d11, scale_factor=pool_size, name='upscale2d12') 63 | deconv2d13 = Deconv2DLayer(upscale2d12, conv2d3.input_shape[1], conv2d3.filter_size, stride=conv2d3.stride, 64 | W=conv2d3.W, flip_filters=not conv2d3.flip_filters, name='deconv2d13', nonlinearity=scaled_tanh) 65 | upscale2d14 = Upscale2DLayer(deconv2d13, scale_factor=pool_size, name='upscale2d14') 66 | deconv2d15 = Deconv2DLayer(upscale2d14, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride, 67 | crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh) 68 | reshape16 = ReshapeLayer(deconv2d15, ([0], -1), name='reshape16') 69 | return reshape16, bottleneck 70 | -------------------------------------------------------------------------------- /modelzoo/avletters_convae_bn.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import 
rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, Deconv2DLayer, DropoutLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | from utils.plotting_utils import print_network 6 | 7 | 8 | def create_scaled_tanh(scale_in=0.5, scale_out=2.4): 9 | """ 10 | create a scaled hyperbolic tangent to avoid saturation given input range 11 | of {-1, 1}. Refer to 12 | :param scale_in: 13 | :param scale_out: 14 | :return: scaled hyperbolic tangent callable 15 | 16 | References 17 | ---------- 18 | .. [1] LeCun, Yann A., et al. (1998): 19 | Efficient BackProp, 20 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 21 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 22 | .. [2] Masci, Jonathan, et al. (2011): 23 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 24 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 25 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 26 | """ 27 | return ScaledTanh(scale_in, scale_out) 28 | 29 | 30 | def extract_encoder(network): 31 | pass 32 | 33 | 34 | def create_model(incoming, options): 35 | conv_num_filters1 = 100 36 | conv_num_filters2 = 150 37 | conv_num_filters3 = 200 38 | filter_size1 = 5 39 | filter_size2 = 5 40 | filter_size3 = 3 41 | pool_size = 2 42 | encode_size = options['BOTTLENECK'] 43 | dense_mid_size = options['DENSE'] 44 | pad_in = 'valid' 45 | pad_out = 'full' 46 | scaled_tanh = create_scaled_tanh() 47 | 48 | conv2d1 = Conv2DLayer(incoming, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 49 | maxpool2d3 = MaxPool2DLayer(conv2d1, pool_size=pool_size, name='maxpool2d3') 50 | bn2 = BatchNormLayer(maxpool2d3, name='batchnorm2') 51 | conv2d4 = Conv2DLayer(bn2, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d4', nonlinearity=scaled_tanh) 52 | maxpool2d6 = MaxPool2DLayer(conv2d4, pool_size=pool_size, name='maxpool2d6', pad=(1,0)) 53 | bn3 = BatchNormLayer(maxpool2d6, name='batchnorm3') 54 | conv2d7 = Conv2DLayer(bn3, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d7', nonlinearity=scaled_tanh) 55 | reshape9 = ReshapeLayer(conv2d7, shape=([0], -1), name='reshape9') # 3000 56 | reshape9_output = reshape9.output_shape[1] 57 | bn8 = BatchNormLayer(reshape9, name='batchnorm8') 58 | dense10 = DenseLayer(bn8, num_units=dense_mid_size, name='dense10', nonlinearity=scaled_tanh) 59 | bn11 = BatchNormLayer(dense10, name='batchnorm11') 60 | bottleneck = DenseLayer(bn11, num_units=encode_size, name='bottleneck', nonlinearity=linear) 61 | # print_network(bottleneck) 62 | dense12 = DenseLayer(bottleneck, num_units=dense_mid_size, W=bottleneck.W.T, name='dense12', nonlinearity=linear) 63 | dense13 = DenseLayer(dense12, num_units=reshape9_output, W=dense10.W.T, nonlinearity=scaled_tanh, name='dense13') 64 | reshape14 = ReshapeLayer(dense13, shape=([0], conv_num_filters3, 3, 5), name='reshape14') # 32 x 4 x 7 65 | deconv2d19 = Deconv2DLayer(reshape14, conv2d7.input_shape[1], conv2d7.filter_size, stride=conv2d7.stride, 66 | W=conv2d7.W, flip_filters=not conv2d7.flip_filters, name='deconv2d19', nonlinearity=scaled_tanh) 67 | upscale2d16 = Upscale2DLayer(deconv2d19, scale_factor=pool_size, name='upscale2d16') 68 | deconv2d17 = Deconv2DLayer(upscale2d16, conv2d4.input_shape[1], conv2d4.filter_size, stride=conv2d4.stride, 69 | W=conv2d4.W, flip_filters=not conv2d4.flip_filters, name='deconv2d17', nonlinearity=scaled_tanh) 70 | upscale2d18 = 
Upscale2DLayer(deconv2d17, scale_factor=pool_size, name='upscale2d18')
71 |     deconv2d19 = Deconv2DLayer(upscale2d18, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride,
72 |                                crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh)
73 |     reshape20 = ReshapeLayer(deconv2d19, ([0], -1), name='reshape20')
74 |     return reshape20, bottleneck
75 | 
--------------------------------------------------------------------------------
/runners/extract_lstm_from_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../')
4 | import numpy as np
5 | import theano.tensor as T
6 | import argparse
7 | from modelzoo import deltanet_majority_vote
8 | from utils.io import save_mat
9 | from custom.nonlinearities import select_nonlinearity
10 | 
11 | 
12 | def parse_options():
13 |     options = dict()
14 |     options['shape'] = '2000,1000,500,50'
15 |     options['nonlinearities'] = 'rectify,rectify,rectify,linear'
16 |     options['input_dim'] = 1200
17 |     options['lstm_size'] = 250
18 |     options['output_classes'] = 26
19 |     options['layer_names'] = 'f_lstm,b_lstm'
20 |     options['use_blstm'] = False
21 |     parser = argparse.ArgumentParser()
22 |     parser.add_argument('--shape', help='shape of encoder. Default: 2000,1000,500,50')
23 |     parser.add_argument('--input_dim', help='input dimension. Default: 1200')
24 |     parser.add_argument('--nonlinearities', help='nonlinearities used by encoder. '
25 |                         'Default: rectify,rectify,rectify,linear')
26 |     parser.add_argument('--output', help='output file to write results')
27 |     parser.add_argument('--lstm_size', help='lstm layer size. Default: 250')
28 |     parser.add_argument('--output_classes', help='number of output classes. Default: 26')
29 |     parser.add_argument('--layer_names', help='names of lstm layers to extract')
30 |     parser.add_argument('--use_blstm', help='use blstm')
31 |     parser.add_argument('input', help='input model.pkl file')
32 | 
33 |     args = parser.parse_args()
34 |     options['input'] = args.input
35 |     if args.shape:
36 |         options['shape'] = args.shape
37 |     if args.input_dim:
38 |         options['input_dim'] = int(args.input_dim)
39 |     if args.nonlinearities:
40 |         options['nonlinearities'] = args.nonlinearities
41 |     if args.lstm_size:
42 |         options['lstm_size'] = int(args.lstm_size)
43 |     if args.output_classes:
44 |         options['output_classes'] = int(args.output_classes)
45 |     if args.output:
46 |         options['output'] = args.output
47 |     if args.layer_names:
48 |         options['layer_names'] = args.layer_names
49 |     if args.use_blstm:
50 |         options['use_blstm'] = True
51 |     return options
52 | 
53 | 
54 | def main():
55 |     options = parse_options()
56 |     print(options)
57 |     window = T.iscalar('theta')
58 |     inputs1 = T.tensor3('inputs1', dtype='float32')
59 |     mask = T.matrix('mask', dtype='uint8')
60 |     shape = [int(i) for i in options['shape'].split(',')]
61 |     nonlinearities = [select_nonlinearity(s) for s in options['nonlinearities'].split(',')]
62 |     layer_names = options['layer_names'].split(',')
63 |     network = deltanet_majority_vote.load_saved_model(options['input'],
64 |                                                       (shape, nonlinearities),
65 |                                                       (None, None, options['input_dim']), inputs1, (None, None), mask,
66 |                                                       options['lstm_size'], window, options['output_classes'],
67 |                                                       use_blstm=options['use_blstm'])
68 |     d = deltanet_majority_vote.extract_lstm_weights(network, layer_names, ['f_lstm', 'b_lstm'])
69 |     expected_keys = ['f_lstm_w_hid_to_cell', 'f_lstm_w_hid_to_forgetgate', 'f_lstm_w_hid_to_ingate',
70 |                      'f_lstm_w_hid_to_outgate', 'f_lstm_w_in_to_cell', 'f_lstm_w_in_to_forgetgate',
71 |                      'f_lstm_w_in_to_ingate', 'f_lstm_w_in_to_outgate', 'f_lstm_b_cell', 'f_lstm_b_forgetgate',
72 |                      'f_lstm_b_ingate', 'f_lstm_b_outgate',
73 |                      'b_lstm_w_hid_to_cell', 'b_lstm_w_hid_to_forgetgate',
74 |                      'b_lstm_w_hid_to_ingate', 'b_lstm_w_hid_to_outgate', 'b_lstm_w_in_to_cell', 'b_lstm_w_in_to_forgetgate',
75 |                      'b_lstm_w_in_to_ingate', 'b_lstm_w_in_to_outgate', 'b_lstm_b_cell', 'b_lstm_b_forgetgate',
76 |                      'b_lstm_b_ingate', 'b_lstm_b_outgate']
77 |     keys = d.keys()
78 |     for k in keys:
79 |         assert k in expected_keys
80 |         assert type(d[k]) == np.ndarray
81 |     if 'output' in options:
82 |         print('save extracted weights to {}'.format(options['output']))
83 |         save_mat(d, options['output'])
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     main()
88 | 
--------------------------------------------------------------------------------
/test/test_gen_batch_from_file.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | import numpy as np
4 | from utils.io import load_mat_file
5 | from utils.datagen import gen_batch_from_file
6 | 
7 | 
8 | class TestGenBatchFromFile(unittest.TestCase):
9 |     def test_large_batch(self):
10 |         """
11 |         test large batch where train data is larger than batch size
12 |         :return: batch with data equal to batch size
13 |         """
14 |         # load the test file, and preprocess the path separator and dimensions
15 |         data = load_mat_file('../5words/data/'
16 |                              'data5Words_mouthROIs_basedOnMouthCenter_1pointAndMouthEyesCenter_filenames.mat')
17 |         filenames = data['filenamePaths'].flatten()
18 |         vidlens = data['videoLengthVec'].flatten()
19 |         targets = (data['targetsPerVideoVec'].flatten())
20 |         train_idxs = 
data['subjectsVec'].flatten() == 1 21 | val_idxs = data['subjectsVec'].flatten() == 2 22 | test_idxs = data['subjectsVec'].flatten() == 3 23 | train_vidlens = vidlens[train_idxs] 24 | val_vidlens = vidlens[val_idxs] 25 | test_vidlens = vidlens[test_idxs] 26 | train_targets = targets[train_idxs] - 1 27 | val_targets = targets[val_idxs] - 1 28 | test_targets = targets[test_idxs] - 1 29 | 30 | # change the file path format and add path prefix to locate file 31 | def prepare_filepaths(f): 32 | return os.path.join('../5words/data', str(f[0].replace('\\', '/'))) 33 | 34 | # apply to all entries in file lists 35 | vfunc = np.vectorize(prepare_filepaths) 36 | filenames = vfunc(filenames) 37 | 38 | # generate splits 39 | training_files = filenames[train_idxs] 40 | val_files = filenames[val_idxs] 41 | test_files = filenames[test_idxs] 42 | datagen = gen_batch_from_file(training_files, train_targets, train_vidlens, 5551) 43 | 44 | for i in range(165): 45 | X_batch, y_batch, mask, idx = next(datagen) 46 | assert X_batch.shape == (30, 29, 5551) 47 | assert y_batch.shape == (30,) 48 | assert mask.shape == (30, 29) 49 | assert idx.shape == (30,) 50 | # remainder 4959 % 30 51 | remainder_batchsize = 4959 % 30 52 | X_batch, y_batch, mask, idx = next(datagen) 53 | assert X_batch.shape == (remainder_batchsize, 29, 5551) 54 | assert y_batch.shape == (remainder_batchsize,) 55 | assert mask.shape == (remainder_batchsize, 29) 56 | assert idx.shape == (remainder_batchsize,) 57 | 58 | def test_small_batch(self): 59 | """ 60 | test when training data is smaller than batch size 61 | :return: batch of length equal to train data len 62 | """ 63 | # load the test file, and preprocess the path separator and dimensions 64 | data = load_mat_file('../5words/data/' 65 | 'data5Words_mouthROIs_basedOnMouthCenter_1pointAndMouthEyesCenter_filenames.mat') 66 | filenames = data['filenamePaths'].flatten() 67 | vidlens = data['videoLengthVec'].flatten() 68 | targets = (data['targetsPerVideoVec'].flatten()) 69 | train_idxs = data['subjectsVec'].flatten() == 1 70 | val_idxs = data['subjectsVec'].flatten() == 2 71 | test_idxs = data['subjectsVec'].flatten() == 3 72 | train_vidlens = vidlens[train_idxs] 73 | val_vidlens = vidlens[val_idxs] 74 | test_vidlens = vidlens[test_idxs] 75 | train_targets = targets[train_idxs] - 1 76 | val_targets = targets[val_idxs] - 1 77 | test_targets = targets[test_idxs] - 1 78 | 79 | def prepare_filepaths(f): 80 | return os.path.join('../5words/data', str(f[0].replace('\\', '/'))) 81 | 82 | vfunc = np.vectorize(prepare_filepaths) 83 | filenames = vfunc(filenames) 84 | training_files = filenames[train_idxs] 85 | val_files = filenames[val_idxs] 86 | test_files = filenames[test_idxs] 87 | datagen = gen_batch_from_file(training_files[:10], train_targets[:10], train_vidlens[:10], 5551) 88 | X_batch, y_batch, mask, idx = next(datagen) 89 | 90 | assert X_batch.shape == (10, 29, 5551) 91 | assert y_batch.shape == (10,) 92 | assert mask.shape == (10, 29) 93 | assert idx.shape == (10,) 94 | 95 | 96 | if __name__ == '__main__': 97 | unittest.main() -------------------------------------------------------------------------------- /modelzoo/avletters_convae_drop.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, 
Deconv2DLayer, DropoutLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | 6 | 7 | def create_scaled_tanh(scale_in=0.5, scale_out=2.4): 8 | """ 9 | create a scaled hyperbolic tangent to avoid saturation given input range 10 | of {-1, 1}. Refer to 11 | :param scale_in: 12 | :param scale_out: 13 | :return: scaled hyperbolic tangent callable 14 | 15 | References 16 | ---------- 17 | .. [1] LeCun, Yann A., et al. (1998): 18 | Efficient BackProp, 19 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 20 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 21 | .. [2] Masci, Jonathan, et al. (2011): 22 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 23 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 24 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 25 | """ 26 | return ScaledTanh(scale_in, scale_out) 27 | 28 | 29 | def extract_encoder(network): 30 | pass 31 | 32 | 33 | def create_model(incoming, options): 34 | input_p = 0.2 35 | hidden_p = 0.5 36 | conv_num_filters1 = int(100 / (1.0 - input_p)) 37 | conv_num_filters2 = int(150 / (1.0 - hidden_p)) 38 | conv_num_filters3 = int(200 / (1.0 - hidden_p)) 39 | filter_size1 = 5 40 | filter_size2 = 5 41 | filter_size3 = 3 42 | pool_size = 2 43 | encode_size = int(options['BOTTLENECK'] / 0.5) 44 | dense_mid_size = int(options['DENSE'] / 0.5) 45 | pad_in = 'valid' 46 | pad_out = 'full' 47 | scaled_tanh = create_scaled_tanh() 48 | dropout0 = DropoutLayer(incoming, p=0.2, name='dropout0') 49 | conv2d1 = Conv2DLayer(dropout0, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 50 | maxpool2d2 = MaxPool2DLayer(conv2d1, pool_size=pool_size, name='maxpool2d2') 51 | dropout1 = DropoutLayer(maxpool2d2, name='dropout1') 52 | conv2d3 = Conv2DLayer(dropout1, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d3', nonlinearity=scaled_tanh) 53 | maxpool2d4 = MaxPool2DLayer(conv2d3, pool_size=pool_size, name='maxpool2d4', pad=(1,0)) 54 | dropout2 = DropoutLayer(maxpool2d4, name='dropout2') 55 | conv2d5 = Conv2DLayer(dropout2, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d5', nonlinearity=scaled_tanh) 56 | reshape6 = ReshapeLayer(conv2d5, shape=([0], -1), name='reshape6') # 3000 57 | reshape6_output = reshape6.output_shape[1] 58 | dropout3 = DropoutLayer(reshape6, name='dropout3') 59 | dense7 = DenseLayer(dropout3, num_units=dense_mid_size, name='dense7', nonlinearity=scaled_tanh) 60 | dropout4 = DropoutLayer(dense7, name='dropout4') 61 | bottleneck = DenseLayer(dropout4, num_units=encode_size, name='bottleneck', nonlinearity=linear) 62 | # print_network(bottleneck) 63 | dense8 = DenseLayer(bottleneck, num_units=dense_mid_size, W=bottleneck.W.T, name='dense8', nonlinearity=linear) 64 | dense9 = DenseLayer(dense8, num_units=reshape6_output, W=dense7.W.T, nonlinearity=scaled_tanh, name='dense9') 65 | reshape10 = ReshapeLayer(dense9, shape=([0], conv_num_filters3, 3, 5), name='reshape10') # 32 x 4 x 7 66 | deconv2d11 = Deconv2DLayer(reshape10, conv2d5.input_shape[1], conv2d5.filter_size, stride=conv2d5.stride, 67 | W=conv2d5.W, flip_filters=not conv2d5.flip_filters, name='deconv2d11', nonlinearity=scaled_tanh) 68 | upscale2d12 = Upscale2DLayer(deconv2d11, scale_factor=pool_size, name='upscale2d12') 69 | deconv2d13 = Deconv2DLayer(upscale2d12, conv2d3.input_shape[1], conv2d3.filter_size, stride=conv2d3.stride, 70 | W=conv2d3.W, flip_filters=not conv2d3.flip_filters, name='deconv2d13', 
nonlinearity=scaled_tanh) 71 | upscale2d14 = Upscale2DLayer(deconv2d13, scale_factor=pool_size, name='upscale2d14') 72 | deconv2d15 = Deconv2DLayer(upscale2d14, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride, 73 | crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh) 74 | reshape16 = ReshapeLayer(deconv2d15, ([0], -1), name='reshape16') 75 | return reshape16, bottleneck 76 | -------------------------------------------------------------------------------- /utils/signal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | from lasagne.utils import unroll_scan 5 | 6 | 7 | def delta_theta(theta, curr_delta, t, THETA, Y): 8 | """ 9 | compute a delta theta component at delta time step t 10 | :param theta: current time step theta component 11 | :param curr_delta: current accumulated delta_t 12 | :param t: current delta_t to be computed 13 | :param THETA: window size 14 | :param Y: input sequence 15 | :return: delta theta component for time step t 16 | """ 17 | # accumulator is shaped (1, no_features), transpose to perform column wise element operations 18 | temp = curr_delta.T 19 | d_theta = theta * (Y[:, THETA + t + theta] - Y[:, THETA + t - theta]) / (2 * theta * theta) 20 | temp += d_theta 21 | temp = temp.astype('float32') 22 | curr_delta = temp.T 23 | return curr_delta 24 | 25 | 26 | def delta_t(t, THETA, Y): 27 | """ 28 | compute delta at time step t 29 | :param t: time step 30 | :param THETA: window size 31 | :param Y: sequence in shape (number_of_features, time_step) 32 | :return: delta coefficient at time step t 33 | """ 34 | theta = T.arange(1, THETA + 1, dtype='int32') 35 | results, _ = theano.scan(delta_theta, outputs_info=T.zeros_like(Y), 36 | sequences=theta, non_sequences=[t, THETA, Y]) 37 | # only interested in the final results, discard the intermediate values 38 | final_results = results[-1] 39 | return final_results 40 | 41 | 42 | def delta_coeff(A, theta): 43 | """ 44 | compute delta coefficients given a sequence. 45 | :param A: input sequence in shape (time_step, number_of_features) 46 | :param theta: window size 47 | :return: delta coefficients for the input sequence 48 | """ 49 | # transpose and repeat 50 | X = A.T 51 | Y = T.concatenate([T.extra_ops.repeat(X[:, 0], theta).reshape((X.shape[0], theta)), 52 | X, T.extra_ops.repeat(X[:, -1], theta).reshape((X.shape[0], theta))], axis=1) 53 | delta, _ = theano.scan(delta_t, sequences=[T.arange(0, X.shape[1], dtype='int32')], non_sequences=[theta, Y]) 54 | # transpose the results back to shape (time_step, number_of_features) 55 | delta = delta[:, :, -1].reshape(A.shape) 56 | return delta 57 | 58 | 59 | def append_delta_coeff(A, theta): 60 | """ 61 | append delta + acceleration coefficients given a sequence. 
62 | :param A: input sequence in shape (time_step, number_of_features) 63 | :param theta: window size 64 | :return: delta + acceleration coefficients for the input sequence 65 | """ 66 | # transpose and repeat 67 | X = A.T 68 | Y = T.concatenate([T.extra_ops.repeat(X[:, 0], theta).reshape((X.shape[0], theta)), 69 | X, T.extra_ops.repeat(X[:, -1], theta).reshape((X.shape[0], theta))], axis=1) 70 | delta, _ = theano.scan(delta_t, sequences=[T.arange(0, X.shape[1], dtype='int32')], non_sequences=[theta, Y]) 71 | # transpose the results back to shape (time_step, number_of_features) 72 | delta = delta[:, :, -1].reshape(A.shape) 73 | 74 | X = delta.T 75 | Y = T.concatenate([T.extra_ops.repeat(X[:, 0], theta).reshape((X.shape[0], theta)), 76 | X, T.extra_ops.repeat(X[:, -1], theta).reshape((X.shape[0], theta))], axis=1) 77 | acc, _ = theano.scan(delta_t, sequences=[T.arange(0, X.shape[1], dtype='int32')], non_sequences=[theta, Y]) 78 | acc = acc[:, :, -1].reshape(A.shape) 79 | res = T.concatenate([A, delta, acc], axis=1) 80 | return res 81 | 82 | 83 | def main(): 84 | """ 85 | test runner, computes delta for an array of sequences 86 | :return: None 87 | """ 88 | A = T.tensor3('A', dtype='float32') 89 | theta = T.iscalar('theta') 90 | 91 | # compute delta coefficients for multiple sequences 92 | results, updates = theano.scan(append_delta_coeff, sequences=A, non_sequences=theta) 93 | compute_deltas = theano.function([A, theta], outputs=results, updates=updates) 94 | 95 | seqs = np.array([[[1, 2, 3, 4, 5], 96 | [10, 12, 13, 14, 15], 97 | [300, 1, 23, 56, 22]], 98 | [[1, 1, 1, 1, 1], 99 | [1, 1, 100, 1, 1], 100 | [1, 1, 1, 1, 1]]], dtype='float32') 101 | res = compute_deltas(seqs, 1) 102 | print(res) 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer, create_blstm 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 13 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta)'), 14 | output_classes=26, fusiontype='sum', w_init_fn=las.init.GlorotUniform(), 15 | use_peepholes=False, nonlinearities=rectify): 16 | 17 | weights, biases, shapes, nonlinearities = dbn 18 | names = ['fc1', 'fc2', 'fc3', 'bottleneck'] 19 | 20 | gate_parameters = Gate( 21 | W_in=w_init_fn, W_hid=w_init_fn, 22 | b=las.init.Constant(0.)) 23 | cell_parameters = Gate( 24 | W_in=w_init_fn, W_hid=w_init_fn, 25 | # Setting W_cell to None denotes that no cell connection will be used. 26 | W_cell=None, b=las.init.Constant(0.), 27 | # By convention, the cell nonlinearity is tanh in an LSTM. 
28 |         nonlinearity=tanh)
29 | 
30 |     l_in = InputLayer(input_shape, input_var, 'input')
31 |     l_mask = InputLayer(mask_shape, mask_var, 'mask')
32 |     l_dct = InputLayer(dct_shape, dct_var, 'dct')
33 | 
34 |     symbolic_batchsize = l_in.input_var.shape[0]
35 |     symbolic_seqlen = l_in.input_var.shape[1]
36 | 
37 |     l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
38 |     l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes, nonlinearities, names)
39 |     encoder_len = las.layers.get_output_shape(l_encoder)[-1]
40 |     l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
41 |     l_delta = DeltaLayer(l_reshape2, win, name='delta')
42 | 
43 |     l_delta_dct = DeltaLayer(l_dct, win, name='delta_dct')
44 | 
45 |     l_lstm_bn = LSTMLayer(
46 |         l_delta, lstm_size, peepholes=use_peepholes,
47 |         # We need to specify a separate input for masks
48 |         mask_input=l_mask,
49 |         # Here, we supply the gate parameters for each gate
50 |         ingate=gate_parameters, forgetgate=gate_parameters,
51 |         cell=cell_parameters, outgate=gate_parameters,
52 |         # We'll learn the initialization and use gradient clipping
53 |         learn_init=True, grad_clipping=5., name='lstm_bn')
54 | 
55 |     l_lstm_dct = LSTMLayer(
56 |         l_delta_dct, lstm_size, peepholes=use_peepholes,
57 |         # We need to specify a separate input for masks
58 |         mask_input=l_mask,
59 |         # Here, we supply the gate parameters for each gate
60 |         ingate=gate_parameters, forgetgate=gate_parameters,
61 |         cell=cell_parameters, outgate=gate_parameters,
62 |         # We'll learn the initialization and use gradient clipping
63 |         learn_init=True, grad_clipping=5., name='lstm_dct')
64 | 
65 |     # We'll combine the outputs of the two stream LSTMs using the selected fusion method.
66 |     # Merge layers take in lists of layers to merge as input.
67 | 
68 |     if fusiontype == 'sum':
69 |         l_fuse = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1')
70 |     elif fusiontype == 'adasum':
71 |         l_fuse = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum')
72 |     elif fusiontype == 'concat':
73 |         l_fuse = ConcatLayer([l_lstm_bn, l_lstm_dct], axis=2, name='concat')
74 |     else:
75 |         raise ValueError('Unsupported Fusion Type used!')
76 | 
77 |     f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg')
78 | 
79 |     l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')
80 | 
81 |     # reshape to (num_examples * seq_len, lstm_size)
82 |     l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3')
83 | 
84 |     # l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1')
85 | 
86 |     # Now, we can apply feed-forward layers as usual.
87 |     # We want the network to predict a classification for the sequence,
88 |     # so we'll use the number of classes as the output size.
89 | l_softmax = DenseLayer( 90 | l_reshape3, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 91 | 92 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 93 | 94 | return l_out, l_fuse 95 | -------------------------------------------------------------------------------- /modelzoo/avletters_convae_bndrop.py: -------------------------------------------------------------------------------- 1 | from lasagne.layers import get_output, InputLayer, DenseLayer, Upscale2DLayer, ReshapeLayer, BatchNormLayer, batch_norm 2 | from lasagne.nonlinearities import rectify, leaky_rectify, tanh, linear, sigmoid, ScaledTanh 3 | from lasagne.layers import Conv2DLayer, Deconv2DLayer, DropoutLayer 4 | from lasagne.layers import MaxPool2DLayer 5 | 6 | 7 | def create_scaled_tanh(scale_in=2./3, scale_out=1.7159): 8 | """ 9 | create a scaled hyperbolic tangent to avoid saturation given input range 10 | of {-1, 1}. Refer to 11 | :param scale_in: 12 | :param scale_out: 13 | :return: scaled hyperbolic tangent callable 14 | 15 | References 16 | ---------- 17 | .. [1] LeCun, Yann A., et al. (1998): 18 | Efficient BackProp, 19 | http://link.springer.com/chapter/10.1007/3-540-49430-8_2, 20 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 21 | .. [2] Masci, Jonathan, et al. (2011): 22 | Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, 23 | http://link.springer.com/chapter/10.1007/978-3-642-21735-7_7, 24 | http://people.idsia.ch/~ciresan/data/icann2011.pdf 25 | """ 26 | return ScaledTanh(scale_in, scale_out) 27 | 28 | 29 | def extract_encoder(network): 30 | pass 31 | 32 | 33 | def create_model(incoming, options): 34 | conv_num_filters1 = 100 35 | conv_num_filters2 = 150 36 | conv_num_filters3 = 200 37 | filter_size1 = 5 38 | filter_size2 = 5 39 | filter_size3 = 3 40 | pool_size = 2 41 | encode_size = options['BOTTLENECK'] 42 | dense_mid_size = options['DENSE'] 43 | pad_in = 'valid' 44 | pad_out = 'full' 45 | scaled_tanh = create_scaled_tanh() 46 | dropout0 = DropoutLayer(incoming, p=0.2, name='dropout0') 47 | conv2d1 = Conv2DLayer(dropout0, num_filters=conv_num_filters1, filter_size=filter_size1, pad=pad_in, name='conv2d1', nonlinearity=scaled_tanh) 48 | bn1 = BatchNormLayer(conv2d1, name='batchnorm1') 49 | maxpool2d2 = MaxPool2DLayer(bn1, pool_size=pool_size, name='maxpool2d2') 50 | dropout1 = DropoutLayer(maxpool2d2, name='dropout1') 51 | conv2d3 = Conv2DLayer(dropout1, num_filters=conv_num_filters2, filter_size=filter_size2, pad=pad_in, name='conv2d3', nonlinearity=scaled_tanh) 52 | bn2 = BatchNormLayer(conv2d3, name='batchnorm2') 53 | maxpool2d4 = MaxPool2DLayer(bn2, pool_size=pool_size, name='maxpool2d4', pad=(1,0)) 54 | dropout2 = DropoutLayer(maxpool2d4, name='dropout2') 55 | conv2d5 = Conv2DLayer(dropout2, num_filters=conv_num_filters3, filter_size=filter_size3, pad=pad_in, name='conv2d5', nonlinearity=scaled_tanh) 56 | bn3 = BatchNormLayer(conv2d5, name='batchnorm3') 57 | reshape6 = ReshapeLayer(bn3, shape=([0], -1), name='reshape6') # 3000 58 | reshape6_output = reshape6.output_shape[1] 59 | dropout3 = DropoutLayer(reshape6, name='dropout3') 60 | dense7 = DenseLayer(dropout3, num_units=dense_mid_size, name='dense7', nonlinearity=scaled_tanh) 61 | bn4 = BatchNormLayer(dense7, name='batchnorm4') 62 | dropout4 = DropoutLayer(bn4, name='dropout4') 63 | bottleneck = DenseLayer(dropout4, num_units=encode_size, name='bottleneck', nonlinearity=linear) 64 | # print_network(bottleneck) 65 | dense8 = DenseLayer(bottleneck, 
num_units=dense_mid_size, W=bottleneck.W.T, name='dense8', nonlinearity=linear)
66 |     dense9 = DenseLayer(dense8, num_units=reshape6_output, W=dense7.W.T, nonlinearity=scaled_tanh, name='dense9')
67 |     reshape10 = ReshapeLayer(dense9, shape=([0], conv_num_filters3, 3, 5), name='reshape10')  # 32 x 4 x 7
68 |     deconv2d11 = Deconv2DLayer(reshape10, conv2d5.input_shape[1], conv2d5.filter_size, stride=conv2d5.stride,
69 |                                W=conv2d5.W, flip_filters=not conv2d5.flip_filters, name='deconv2d11', nonlinearity=scaled_tanh)
70 |     upscale2d12 = Upscale2DLayer(deconv2d11, scale_factor=pool_size, name='upscale2d12')
71 |     deconv2d13 = Deconv2DLayer(upscale2d12, conv2d3.input_shape[1], conv2d3.filter_size, stride=conv2d3.stride,
72 |                                W=conv2d3.W, flip_filters=not conv2d3.flip_filters, name='deconv2d13', nonlinearity=scaled_tanh)
73 |     upscale2d14 = Upscale2DLayer(deconv2d13, scale_factor=pool_size, name='upscale2d14')
74 |     deconv2d15 = Deconv2DLayer(upscale2d14, conv2d1.input_shape[1], conv2d1.filter_size, stride=conv2d1.stride,
75 |                                crop=(1, 0), W=conv2d1.W, flip_filters=not conv2d1.flip_filters, name='deconv2d14', nonlinearity=scaled_tanh)
76 |     reshape16 = ReshapeLayer(deconv2d15, ([0], -1), name='reshape16')
77 |     return reshape16, bottleneck
78 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Speech Recognition (AdeNet)
2 | This page provides instructions to install the necessary packages to run the
3 | experiments described in the project on Visual Speech Recognition using Deep Learning.
4 | 
5 | ## Installing
6 | To run the codes, the following dependencies are required:
7 | 
8 | 1. miniconda2
9 | 2. matplotlib
10 | 3. pydotplus
11 | 4. tabulate
12 | 5. scikit-learn
13 | 6. ipython
14 | 7. pillow
15 | 8. theano
16 | 9. lasagne
17 | 10. nolearn
18 | 
19 | It is suggested that you use miniconda to set up a virtual environment before running the codes,
20 | to prevent the packages from interfering with your current python environment.
21 | Miniconda can be downloaded from http://conda.pydata.org/miniconda.html.
22 | To install the necessary dependencies you can use the following bash script:
23 | 
24 | ```
25 | #!/bin/bash
26 | ./Miniconda2-latest-Linux-x86_64.sh
27 | conda create -n ip-avsr python
28 | source activate ip-avsr
29 | 
30 | pip install matplotlib pydotplus tabulate scikit-learn ipython pillow
31 | pip install --upgrade https://github.com/Theano/Theano/archive/master.zip
32 | pip install --upgrade https://github.com/Lasagne/Lasagne/archive/master.zip
33 | pip install git+https://github.com/dnouri/nolearn.git@master#egg=nolearn==0.7.git
34 | ```
35 | 
36 | which creates a virtual environment ip-avsr, activates the virtual environment and installs all
37 | the necessary python packages into this virtual environment.
38 | 
39 | ## Code Structure
40 | The source codes for the different datasets are separated into individual folders named after the
41 | dataset (`avletters, ouluvs, cuave`). All learning models can be found in the folder `modelzoo` and
42 | can be imported into code as a python package. Custom neural network layers can be found in the
43 | package `custom` and the `utils` package contains utility functions such as plotting,
44 | drawing network layers and image preprocessing functions for normalization and computing delta coefficients.
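As a minimal usage sketch, a model can be imported and built from `modelzoo` as follows (the 1200-dimensional input and the symbolic variable names here are illustrative, following the `create_model` signature in `modelzoo/lstm_classifier_baseline.py`):

```
import theano.tensor as T
from modelzoo import lstm_classifier_baseline

inputs = T.tensor3('inputs', dtype='float32')  # (batch, seq_len, feature_dim)
mask = T.matrix('mask', dtype='uint8')         # (batch, seq_len)
network = lstm_classifier_baseline.create_model((None, None, 1200), inputs,
                                                (None, None), mask,
                                                lstm_size=250, output_classes=26)
```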
45 | 
46 | ## Datasets
47 | Within each dataset folder, the codes are further grouped into 3 folders. The data folder contains
48 | all the mouth ROIs, DCT features and Image Differences extracted for the individual dataset.
49 | The format used is MatLab’s `.mat` format to allow interchangeability between MatLab and python, as the
50 | pretraining stage requires the use of MatLab DBN code.
51 | The model folder contains all pretrained, finetuned and trained networks so they can be easily reloaded
52 | in future without the need to retrain them from scratch. The config folder contains a list of `.ini` config files
53 | that are used for different models (**DeltaNet, AdeNet v1, AdeNet v2**). A list of options is provided below.
54 | The training programs are called unimodal.py, bimodal.py and trimodal.py for single stream, double stream
55 | and triple stream input sources respectively.
56 | All training codes accept a config file via the option `--config`. Type `python trimodal.py -h` to see usage options.
57 | 
58 | ```
59 | usage: trimodal.py [-h] [--config CONFIG] [--write_results WRITE_RESULTS]
60 | optional arguments:
61 |   -h, --help            show this help message and exit
62 |   --config CONFIG       config file to use, default=config/trimodal.ini
63 |   --write_results WRITE_RESULTS   write results to file
64 | ```
65 | 
66 | ## Config File Options
67 | Under the `data` section:
68 | - images: raw image ROIs used to extract DBNFs.
69 | - dct: DCT features with delta coefficients appended.
70 | - diff: diff image ROIs used for the difference of images input source.
71 | 
72 | Under the `models` section:
73 | - pretrained: pretrained DBNF extractor DBN network for raw images.
74 | - finetuned: finetuned DBNF extractor DBN network for raw images.
75 | - pretrained_diff: pretrained DBNF extractor DBN network for difference of images.
76 | - finetuned_diff: finetuned DBNF extractor DBN network for difference of images.
77 | - fusiontype: the fusion method to use to combine the different input sources.
78 | 
79 | Under the `training` section:
80 | - learning_rate: learning rate used to train the model.
81 | - decay_rate: learning rate decay applied at each epoch after decay_start.
82 | - decay_start: epoch at which to start learning rate decay.
83 | - do_finetune: whether to perform finetuning of the DBNF extractor.
84 | - save_finetune: save the finetuned model of the raw image DBNF extractor.
85 | - load_finetune: load the finetuned model of the raw image DBNF extractor.
86 | - load_finetune_diff: load the finetuned model of the image differences DBNF extractor.
87 | - output_units: number of output classes.
88 | - lstm_units: number of hidden units used in the LSTM classifiers.
--------------------------------------------------------------------------------
/utils/lcn.py:
--------------------------------------------------------------------------------
1 | import theano
2 | import theano.tensor as T
3 | import numpy as np
4 | from theano.tensor.nnet import conv
5 | import matplotlib.pyplot as plt
6 | import pylab
7 | 
8 | 
9 | def gaussian_filter(kernel_shape):
10 |     x = np.zeros((kernel_shape, kernel_shape), dtype='float32')
11 | 
12 |     def gauss(x, y, sigma=2.0):
13 |         Z = 2 * np.pi * sigma ** 2
14 |         return 1. / Z * np.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))
15 | 
16 |     mid = np.floor(kernel_shape / 2.)
17 | for i in xrange(0, kernel_shape): 18 | for j in xrange(0, kernel_shape): 19 | x[i, j] = gauss(i - mid, j - mid) 20 | 21 | return x / np.sum(x) 22 | 23 | 24 | def lecun_lcn(input, img_shape, kernel_shape, threshold=1e-4): 25 | input = input.reshape(input.shape[0], 1, img_shape[0], img_shape[1]) 26 | X = T.matrix(dtype=theano.config.floatX) 27 | X = X.reshape(input.shape) 28 | 29 | filter_shape = (1, 1, kernel_shape, kernel_shape) 30 | filters = gaussian_filter(kernel_shape).reshape(filter_shape) 31 | 32 | convout = conv.conv2d(input=X, 33 | filters=filters, 34 | image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]), 35 | filter_shape=filter_shape, 36 | border_mode='full') 37 | 38 | # For each pixel, remove mean of 9x9 neighborhood 39 | mid = int(np.floor(kernel_shape / 2.)) 40 | centered_X = X - convout[:, :, mid:-mid, mid:-mid] 41 | centered_X = X - convout[:, :, mid:-mid, mid:-mid] 42 | 43 | # Scale down norm of 9x9 patch if norm is bigger than 1 44 | sum_sqr_XX = conv.conv2d(input=centered_X ** 2, 45 | filters=filters, 46 | image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]), 47 | filter_shape=filter_shape, 48 | border_mode='full') 49 | 50 | denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid]) 51 | per_img_mean = denom.mean(axis=[1, 2]) 52 | divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom) 53 | divisor = T.maximum(divisor, threshold) 54 | 55 | new_X = centered_X / divisor 56 | new_X = new_X.dimshuffle(0, 2, 3, 1) 57 | new_X = new_X.flatten(ndim=3) 58 | 59 | f = theano.function([X], new_X) 60 | return f 61 | # return f(input) 62 | 63 | 64 | def make_lecun_lcn(input_shape, img_shape, kernel_shape, threshold=1e-4): 65 | """ 66 | lecun local contrast normalization 67 | :param input_shape: (batch_size, stack_size, nb_row, nb_col) 68 | :param img_shape: (nb_row, nb_col) image dimensions 69 | :param kernel_shape: kernel shape of image eg: 9x9 70 | :param threshold: threshold to allow enhance of edges 71 | :return: theano function that computes the local contrast normalized image 72 | """ 73 | X = T.matrix(dtype=theano.config.floatX) 74 | X = X.reshape(input_shape) 75 | 76 | filter_shape = (1, 1, kernel_shape, kernel_shape) 77 | filters = gaussian_filter(kernel_shape).reshape(filter_shape) 78 | 79 | convout = conv.conv2d(input=X, 80 | filters=filters, 81 | image_shape=(input_shape[0], 1, img_shape[0], img_shape[1]), 82 | filter_shape=filter_shape, 83 | border_mode='full') 84 | 85 | # For each pixel, remove mean of 9x9 neighborhood 86 | mid = int(np.floor(kernel_shape / 2.)) 87 | centered_X = X - convout[:, :, mid:-mid, mid:-mid] 88 | 89 | # Scale down norm of 9x9 patch if norm is bigger than 1 90 | sum_sqr_XX = conv.conv2d(input=centered_X ** 2, 91 | filters=filters, 92 | image_shape=(input_shape[0], 1, img_shape[0], img_shape[1]), 93 | filter_shape=filter_shape, 94 | border_mode='full') 95 | 96 | denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid]) 97 | per_img_mean = denom.mean(axis=[1, 2]) 98 | divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom) 99 | divisor = T.maximum(divisor, threshold) 100 | 101 | new_X = centered_X / divisor 102 | new_X = new_X.dimshuffle(0, 2, 3, 1) 103 | new_X = new_X.flatten(ndim=3) 104 | 105 | f = theano.function([X], new_X) 106 | return f 107 | 108 | 109 | if __name__ == '__main__': 110 | theano.config.floatX = 'float32' 111 | x_img = plt.imread("../avletters/data/diff.png") # change as needed 112 | 113 | # x_img = x_img.reshape(1, x_img.shape[0], x_img.shape[1], x_img.shape[2]).astype('float32') 114 | x_img = 
x_img.reshape(1, x_img.shape[0], x_img.shape[1], x_img.shape[2]).astype('float32') 115 | lcn = make_lecun_lcn((1, 1, x_img.shape[1], x_img.shape[2]), (x_img.shape[1], x_img.shape[2]), 7, threshold=10) 116 | for d in range(3): 117 | x_img[:, :, :, d] = lcn(x_img[:, :, :, d].reshape((1,1,x_img.shape[1], x_img.shape[2]))) 118 | x_img = x_img[0] 119 | # plt.imshow(x_img, cmap='gray') 120 | # plt.show() 121 | 122 | pylab.gray() 123 | pylab.axis('off') 124 | pylab.imshow(x_img) 125 | pylab.show() -------------------------------------------------------------------------------- /oulu/landmarking.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import warnings 3 | import numpy as np 4 | import os, errno, glob 5 | import csv 6 | 7 | import menpo.io as mio 8 | from menpo.visualize import print_progress 9 | from menpo.feature import igo, fast_dsift 10 | from menpo.landmark import labeller, face_ibug_68_to_face_ibug_68 11 | from menpodetect.dlib import load_dlib_frontal_face_detector 12 | from menpofit.dlib import DlibWrapper 13 | from menpofit.aam import HolisticAAM, LucasKanadeAAMFitter, ModifiedAlternatingInverseCompositional 14 | from menpowidgets import visualize_images 15 | 16 | # constants, change according to system 17 | OULU_DIR = '/Volumes/Alienware 5/Thesis/ouluvs2-missing-vid/' 18 | FACE_MODEL_PATH = '../config/shape_predictor_68_face_landmarks.dat' 19 | EXT = ['.mp4', '.mov', '.mpg'] 20 | 21 | 22 | def find_all_videos(dir, ext=EXT, relpath=False): 23 | videofiles = [] 24 | find_all_videos_impl(dir, videofiles, ext) 25 | if relpath: 26 | for i, f in enumerate(videofiles): 27 | videofiles[i] = f[len(dir) + 1:] 28 | return videofiles 29 | 30 | 31 | def find_all_videos_impl(dir, videofiles, ext): 32 | files = os.listdir(dir) 33 | for f in files: 34 | path = os.path.join(dir, f) 35 | if os.path.isdir(path): 36 | find_all_videos_impl(path, videofiles, ext) 37 | elif os.path.splitext(f)[1] in ext: 38 | videofiles.append(path) 39 | 40 | 41 | def is_video(file, ext=EXT): 42 | return os.path.splitext(file)[1] in ext 43 | 44 | 45 | def fit_image(image): 46 | # Face detection 47 | bboxes = fit_image.detect(image, image_diagonal=1000) 48 | 49 | # Check if at least one face was detected, otherwise throw a warning 50 | if len(bboxes) > 0: 51 | # Use the first bounding box (the most probable to represent a face) to initialise 52 | fitting_result = fit_image.fitter.fit_from_bb(image, bboxes[0]) 53 | 54 | # Assign shape on the image 55 | image.landmarks['final_shape'] = fitting_result.final_shape 56 | else: 57 | # Throw warning if no face was detected 58 | warnings.warn('No face detected') 59 | 60 | # Return the image 61 | return image 62 | 63 | 64 | def create_dir(dir): 65 | if not os.path.exists(dir): 66 | try: 67 | os.makedirs(dir) 68 | except OSError as exc: # Guard against race condition 69 | if exc.errno != errno.EEXIST: 70 | raise 71 | 72 | 73 | def fill_row(outwriter, frame_no, row): 74 | outwriter.writerow([frame_no] + row) 75 | 76 | 77 | def process_video(file, dest): 78 | if is_video(file): 79 | create_dir(os.path.dirname(dest)) 80 | frames = mio.import_video(file, normalise=False) 81 | print('{} contains {} frames'.format(file, len(frames))) 82 | print('writing landmarks to {}...'.format(dest)) 83 | frames = frames.map(fit_image) 84 | with open(dest, 'w') as outputfile: 85 | outwriter = csv.writer(outputfile) 86 | try: 87 | for i, frame in enumerate(print_progress(frames)): 88 | if 'final_shape' not in 
frame.landmarks:
89 |                     warnings.warn('no face detected in frame {}, '
90 |                                   'initializing landmarks to -1s...'.format(i))
91 |                     # dlib does not support fitting from a previous initial shape, so
92 |                     # leave the entire row as -1s
93 |                     # initial_shape = frames[i - 1].landmarks['final_shape'].lms
94 |                     # fitting_result = fit_image.fitter.fit_from_shape(frame, initial_shape)
95 |                     # frame.landmarks['final_shape'] = fitting_result.final_shape
96 |                     landmarks = [-1] * 136
97 |                 else:
98 |                     lmg = frame.landmarks['final_shape']
99 |                     landmarks = lmg['all'].points.reshape((136,)).tolist()  # flatten 68 (x, y) points to 136 values
100 |                 fill_row(outwriter, i, landmarks)
101 |             except Exception as e:
102 |                 warnings.warn('Runtime Error at frame {}'.format(i))
103 |                 print('initializing landmarks to -1s...')
104 |                 fill_row(outwriter, i, [-1] * 136)
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     print('Generating Landmarks for OULU Dataset...')
109 |     # use a file list instead to control which files to process
110 |     # process only the frontal faces v1
111 |     files = glob.glob(os.path.join(OULU_DIR, 'orig', 's[0-9]*_v1_u[0-9]*.mp4'))
112 |     files.sort()
113 |     fit_image.detect = load_dlib_frontal_face_detector()
114 |     fit_image.fitter = DlibWrapper(FACE_MODEL_PATH)
115 |     # files = files[3200:]  # modify to adjust what to process
116 |     for i, video in enumerate(files):
117 |         print('[{}/{}] - '.format(i + 1, len(files)), end='')
118 |         basename = os.path.basename(video)
119 |         landmarkfile = os.path.splitext(basename)[0] + '.csv'
120 |         process_video(video,
121 |                       os.path.join(OULU_DIR, 'landmarks', landmarkfile))
122 |     print('All Done!')
123 | 
--------------------------------------------------------------------------------
/modelzoo/adenet_v1.py:
--------------------------------------------------------------------------------
1 | import theano.tensor as T
2 | 
3 | import lasagne as las
4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer
5 | from lasagne.layers import Gate, DropoutLayer, BatchNormLayer
6 | from lasagne.nonlinearities import tanh, sigmoid, linear
7 | 
8 | from custom.layers import DeltaLayer
9 | 
10 | 
11 | def create_pretrained_encoder(weights, biases, incoming):
12 |     l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1')
13 |     l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2')
14 |     l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3')
15 |     l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck')
16 |     return l_4
17 | 
18 | 
19 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name):
20 | 
21 |     if cell_parameters is None:
22 |         cell_parameters = Gate()
23 |     if gate_parameters is None:
24 |         gate_parameters = Gate()
25 | 
26 |     l_lstm = LSTMLayer(
27 |         l_incoming, hidden_units,
28 |         # We need to specify a separate input for masks
29 |         mask_input=l_mask,
30 |         # Here, we supply the gate parameters for each gate
31 |         ingate=gate_parameters, forgetgate=gate_parameters,
32 |         cell=cell_parameters, outgate=gate_parameters,
33 |         # We'll learn the initialization and use gradient clipping
34 |         learn_init=True, grad_clipping=5., name='f_{}'.format(name))
35 | 
36 |     # The "backwards" layer is the same as the first,
37 |     # except that the backwards argument is set to True.
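    # (callers merge the two directions afterwards, e.g. with an ElemwiseSumLayer)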
38 | l_lstm_back = LSTMLayer( 39 | l_incoming, hidden_units, ingate=gate_parameters, 40 | mask_input=l_mask, forgetgate=gate_parameters, 41 | cell=cell_parameters, outgate=gate_parameters, 42 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 43 | 44 | return l_lstm, l_lstm_back 45 | 46 | 47 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 48 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta)'), 49 | output_classes=26): 50 | 51 | dbn_layers = dbn.get_all_layers() 52 | weights = [] 53 | biases = [] 54 | weights.append(dbn_layers[1].W.astype('float32')) 55 | weights.append(dbn_layers[2].W.astype('float32')) 56 | weights.append(dbn_layers[3].W.astype('float32')) 57 | weights.append(dbn_layers[4].W.astype('float32')) 58 | biases.append(dbn_layers[1].b.astype('float32')) 59 | biases.append(dbn_layers[2].b.astype('float32')) 60 | biases.append(dbn_layers[3].b.astype('float32')) 61 | biases.append(dbn_layers[4].b.astype('float32')) 62 | 63 | gate_parameters = Gate( 64 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 65 | b=las.init.Constant(0.)) 66 | cell_parameters = Gate( 67 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 68 | # Setting W_cell to None denotes that no cell connection will be used. 69 | W_cell=None, b=las.init.Constant(0.), 70 | # By convention, the cell nonlinearity is tanh in an LSTM. 71 | nonlinearity=tanh) 72 | 73 | l_in = InputLayer(input_shape, input_var, 'input') 74 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 75 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 76 | 77 | symbolic_batchsize = l_in.input_var.shape[0] 78 | symbolic_seqlen = l_in.input_var.shape[1] 79 | 80 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 81 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 82 | l_encoder_bn = BatchNormLayer(l_encoder, name='batchnorm1') 83 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 84 | l_reshape2 = ReshapeLayer(l_encoder_bn, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 85 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 86 | 87 | l_concat = ConcatLayer([l_delta, l_dct], axis=2, name='concat') 88 | 89 | l_lstm, l_lstm_back = create_blstm(l_concat, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1') 90 | 91 | # We'll combine the forward and backward layer output by summing. 92 | # Merge layers take in lists of layers to merge as input. 93 | l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1') 94 | 95 | l_lstm2, l_lstm2_back = create_blstm(l_sum1, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm2') 96 | 97 | # We'll combine the forward and backward layer output by summing. 98 | # Merge layers take in lists of layers to merge as input. 99 | l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back]) 100 | 101 | l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1') 102 | 103 | # Now, we can apply feed-forward layers as usual. 104 | # We want the network to predict a classification for the sequence, 105 | # so we'll use a the number of classes. 
106 | l_out = DenseLayer( 107 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 108 | 109 | return l_out, l_concat 110 | -------------------------------------------------------------------------------- /modelzoo/adenet_v1_1.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, BatchNormLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | from custom.layers import DeltaLayer 9 | 10 | 11 | def create_pretrained_encoder(weights, biases, incoming): 12 | l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1') 13 | l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2') 14 | l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3') 15 | l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck') 16 | return l_4 17 | 18 | 19 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 20 | 21 | if cell_parameters is None: 22 | cell_parameters = Gate() 23 | if gate_parameters is None: 24 | gate_parameters = Gate() 25 | 26 | l_lstm = LSTMLayer( 27 | l_incoming, hidden_units, 28 | # We need to specify a separate input for masks 29 | mask_input=l_mask, 30 | # Here, we supply the gate parameters for each gate 31 | ingate=gate_parameters, forgetgate=gate_parameters, 32 | cell=cell_parameters, outgate=gate_parameters, 33 | # We'll learn the initialization and use gradient clipping 34 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 35 | 36 | # The "backwards" layer is the same as the first, 37 | # except that the backwards argument is set to True. 38 | l_lstm_back = LSTMLayer( 39 | l_incoming, hidden_units, ingate=gate_parameters, 40 | mask_input=l_mask, forgetgate=gate_parameters, 41 | cell=cell_parameters, outgate=gate_parameters, 42 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 43 | 44 | return l_lstm, l_lstm_back 45 | 46 | 47 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 48 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta)'), 49 | output_classes=26): 50 | 51 | dbn_layers = dbn.get_all_layers() 52 | weights = [] 53 | biases = [] 54 | weights.append(dbn_layers[1].W.astype('float32')) 55 | weights.append(dbn_layers[2].W.astype('float32')) 56 | weights.append(dbn_layers[3].W.astype('float32')) 57 | weights.append(dbn_layers[4].W.astype('float32')) 58 | biases.append(dbn_layers[1].b.astype('float32')) 59 | biases.append(dbn_layers[2].b.astype('float32')) 60 | biases.append(dbn_layers[3].b.astype('float32')) 61 | biases.append(dbn_layers[4].b.astype('float32')) 62 | 63 | gate_parameters = Gate( 64 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 65 | b=las.init.Constant(0.)) 66 | cell_parameters = Gate( 67 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 68 | # Setting W_cell to None denotes that no cell connection will be used. 69 | W_cell=None, b=las.init.Constant(0.), 70 | # By convention, the cell nonlinearity is tanh in an LSTM. 
71 | nonlinearity=tanh) 72 | 73 | l_in = InputLayer(input_shape, input_var, 'input') 74 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 75 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 76 | 77 | symbolic_batchsize = l_in.input_var.shape[0] 78 | symbolic_seqlen = l_in.input_var.shape[1] 79 | 80 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 81 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 82 | l_encoder_bn = BatchNormLayer(l_encoder, name='batchnorm1') 83 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 84 | l_reshape2 = ReshapeLayer(l_encoder_bn, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 85 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 86 | 87 | l_concat = ConcatLayer([l_delta, l_dct], axis=2, name='concat') 88 | 89 | l_dropout1 = DropoutLayer(l_concat, name='dropout1') 90 | 91 | l_lstm, l_lstm_back = create_blstm(l_dropout1, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm1') 92 | 93 | # We'll combine the forward and backward layer output by summing. 94 | # Merge layers take in lists of layers to merge as input. 95 | l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1') 96 | 97 | # implement drop-out regularization 98 | l_dropout2 = DropoutLayer(l_sum1, name='dropout2') 99 | 100 | l_lstm2, l_lstm2_back = create_blstm(l_dropout2, l_mask, lstm_size * 2, cell_parameters, gate_parameters, 'lstm2') 101 | 102 | # We'll combine the forward and backward layer output by summing. 103 | # Merge layers take in lists of layers to merge as input. 104 | l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back]) 105 | 106 | l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1') 107 | 108 | # Now, we can apply feed-forward layers as usual. 109 | # We want the network to predict a classification for the sequence, 110 | # so we'll use a the number of classes. 
111 | l_out = DenseLayer( 112 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 113 | 114 | return l_out 115 | -------------------------------------------------------------------------------- /modelzoo/baseline_end2end.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | 8 | 9 | def create_pretrained_encoder(weights, biases, incoming): 10 | l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1') 11 | l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2') 12 | l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3') 13 | l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck') 14 | return l_4 15 | 16 | 17 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 18 | if cell_parameters is None: 19 | cell_parameters = Gate() 20 | if gate_parameters is None: 21 | gate_parameters = Gate() 22 | 23 | l_lstm = LSTMLayer( 24 | l_incoming, hidden_units, 25 | # We need to specify a separate input for masks 26 | mask_input=l_mask, 27 | # Here, we supply the gate parameters for each gate 28 | ingate=gate_parameters, forgetgate=gate_parameters, 29 | cell=cell_parameters, outgate=gate_parameters, 30 | # We'll learn the initialization and use gradient clipping 31 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 32 | 33 | return l_lstm 34 | 35 | 36 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 37 | 38 | if cell_parameters is None: 39 | cell_parameters = Gate() 40 | if gate_parameters is None: 41 | gate_parameters = Gate() 42 | 43 | l_lstm = LSTMLayer( 44 | l_incoming, hidden_units, 45 | # We need to specify a separate input for masks 46 | mask_input=l_mask, 47 | # Here, we supply the gate parameters for each gate 48 | ingate=gate_parameters, forgetgate=gate_parameters, 49 | cell=cell_parameters, outgate=gate_parameters, 50 | # We'll learn the initialization and use gradient clipping 51 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 52 | 53 | # The "backwards" layer is the same as the first, 54 | # except that the backwards argument is set to True. 
55 | l_lstm_back = LSTMLayer( 56 | l_incoming, hidden_units, ingate=gate_parameters, 57 | mask_input=l_mask, forgetgate=gate_parameters, 58 | cell=cell_parameters, outgate=gate_parameters, 59 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 60 | 61 | return l_lstm, l_lstm_back 62 | 63 | 64 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 65 | lstm_size=250, output_classes=26): 66 | 67 | dbn_layers = dbn.get_all_layers() 68 | weights = [] 69 | biases = [] 70 | weights.append(dbn_layers[1].W.astype('float32')) 71 | weights.append(dbn_layers[2].W.astype('float32')) 72 | weights.append(dbn_layers[3].W.astype('float32')) 73 | weights.append(dbn_layers[4].W.astype('float32')) 74 | biases.append(dbn_layers[1].b.astype('float32')) 75 | biases.append(dbn_layers[2].b.astype('float32')) 76 | biases.append(dbn_layers[3].b.astype('float32')) 77 | biases.append(dbn_layers[4].b.astype('float32')) 78 | 79 | gate_parameters = Gate( 80 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 81 | b=las.init.Constant(0.)) 82 | cell_parameters = Gate( 83 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 84 | # Setting W_cell to None denotes that no cell connection will be used. 85 | W_cell=None, b=las.init.Constant(0.), 86 | # By convention, the cell nonlinearity is tanh in an LSTM. 87 | nonlinearity=tanh) 88 | 89 | l_in = InputLayer(input_shape, input_var, 'input') 90 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 91 | 92 | symbolic_batchsize = l_in.input_var.shape[0] 93 | symbolic_seqlen = l_in.input_var.shape[1] 94 | 95 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 96 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 97 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 98 | l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 99 | # l_delta = DeltaLayer(l_reshape2, win, name='delta') 100 | 101 | # l_lstm = create_lstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1') 102 | l_lstm, l_lstm_back = create_blstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1') 103 | 104 | # We'll combine the forward and backward layer output by summing. 105 | # Merge layers take in lists of layers to merge as input. 106 | l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1') 107 | 108 | l_forward_slice1 = SliceLayer(l_sum1, -1, 1, name='slice1') 109 | 110 | # Now, we can apply feed-forward layers as usual. 111 | # We want the network to predict a classification for the sequence, 112 | # so we'll use a the number of classes. 
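    # (the SliceLayer above keeps only the final timestep, so this dense layer
    # produces a single prediction per sequence)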
113 | l_out = DenseLayer( 114 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 115 | 116 | return l_out 117 | -------------------------------------------------------------------------------- /modelzoo/avnet.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer, create_blstm 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def extract_weights(ae): 13 | weights = [] 14 | biases = [] 15 | shapes = [2000, 1000, 500, 50] 16 | nonlinearities = [rectify, rectify, rectify, linear] 17 | ae_layers = ae.get_all_layers() 18 | weights.append(ae_layers[1].W.astype('float32')) 19 | weights.append(ae_layers[2].W.astype('float32')) 20 | weights.append(ae_layers[3].W.astype('float32')) 21 | weights.append(ae_layers[4].W.astype('float32')) 22 | biases.append(ae_layers[1].b.astype('float32')) 23 | biases.append(ae_layers[2].b.astype('float32')) 24 | biases.append(ae_layers[3].b.astype('float32')) 25 | biases.append(ae_layers[4].b.astype('float32')) 26 | 27 | return weights, biases, shapes, nonlinearities 28 | 29 | 30 | def create_pretrained_substream(weights, biases, input_shape, input_var, mask_shape, mask_var, name, 31 | lstm_size=250, win=T.iscalar('theta'), nonlinearity=rectify, 32 | w_init_fn=las.init.Orthogonal(), use_peepholes=True): 33 | gate_parameters = Gate( 34 | W_in=w_init_fn, W_hid=w_init_fn, 35 | b=las.init.Constant(0.)) 36 | cell_parameters = Gate( 37 | W_in=w_init_fn, W_hid=w_init_fn, 38 | # Setting W_cell to None denotes that no cell connection will be used. 39 | W_cell=None, b=las.init.Constant(0.), 40 | # By convention, the cell nonlinearity is tanh in an LSTM. 
41 | nonlinearity=tanh) 42 | 43 | l_input = InputLayer(input_shape, input_var, 'input_'+name) 44 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 45 | 46 | symbolic_batchsize_raw = l_input.input_var.shape[0] 47 | symbolic_seqlen_raw = l_input.input_var.shape[1] 48 | 49 | l_reshape1_raw = ReshapeLayer(l_input, (-1, input_shape[-1]), name='reshape1_'+name) 50 | l_encoder_raw = create_pretrained_encoder(l_reshape1_raw, weights, biases, 51 | [2000, 1000, 500, 50], 52 | [nonlinearity, nonlinearity, nonlinearity, linear], 53 | ['fc1_'+name, 'fc2_'+name, 'fc3_'+name, 'bottleneck_'+name]) 54 | input_len = las.layers.get_output_shape(l_encoder_raw)[-1] 55 | 56 | l_reshape2 = ReshapeLayer(l_encoder_raw, 57 | (symbolic_batchsize_raw, symbolic_seqlen_raw, input_len), 58 | name='reshape2_'+name) 59 | l_delta = DeltaLayer(l_reshape2, win, name='delta_'+name) 60 | 61 | l_lstm = LSTMLayer( 62 | l_delta, int(lstm_size), peepholes=use_peepholes, 63 | # We need to specify a separate input for masks 64 | mask_input=l_mask, 65 | # Here, we supply the gate parameters for each gate 66 | ingate=gate_parameters, forgetgate=gate_parameters, 67 | cell=cell_parameters, outgate=gate_parameters, 68 | # We'll learn the initialization and use gradient clipping 69 | learn_init=True, grad_clipping=5., name='lstm_'+name) 70 | 71 | return l_lstm 72 | 73 | 74 | def create_model(substreams, mask_shape, mask_var, lstm_size=250, output_classes=26, 75 | fusiontype='concat', w_init_fn=las.init.Orthogonal(), use_peepholes=True): 76 | 77 | gate_parameters = Gate( 78 | W_in=w_init_fn, W_hid=w_init_fn, 79 | b=las.init.Constant(0.)) 80 | cell_parameters = Gate( 81 | W_in=w_init_fn, W_hid=w_init_fn, 82 | # Setting W_cell to None denotes that no cell connection will be used. 83 | W_cell=None, b=las.init.Constant(0.), 84 | # By convention, the cell nonlinearity is tanh in an LSTM. 85 | nonlinearity=tanh) 86 | 87 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 88 | symbolic_seqlen_raw = l_mask.input_var.shape[1] 89 | 90 | # We'll combine the forward and backward layer output by summing. 91 | # Merge layers take in lists of layers to merge as input. 92 | if fusiontype == 'adasum': 93 | l_fuse = AdaptiveElemwiseSumLayer(substreams, name='adasum1') 94 | elif fusiontype == 'sum': 95 | l_fuse = ElemwiseSumLayer(substreams, name='sum1') 96 | elif fusiontype == 'concat': 97 | l_fuse = ConcatLayer(substreams, axis=-1, name='concat') 98 | 99 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 100 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 101 | 102 | # reshape to (num_examples * seq_len, lstm_size) 103 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 104 | 105 | # Now, we can apply feed-forward layers as usual. 106 | # We want the network to predict a classification for the sequence, 107 | # so we'll use a the number of classes. 
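    # Note: unlike the single-prediction models, this head emits a softmax at
    # every timestep; the ReshapeLayer below restores the
    # (batch, seq_len, classes) layout.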
108 | l_softmax = DenseLayer( 109 | l_reshape3, num_units=output_classes, 110 | nonlinearity=las.nonlinearities.softmax, name='softmax') 111 | 112 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_raw, output_classes), name='output') 113 | 114 | return l_out, l_fuse 115 | -------------------------------------------------------------------------------- /oulu/ae_finetuner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../') 4 | import os 5 | import time 6 | import pickle 7 | import ConfigParser 8 | 9 | import theano.tensor as T 10 | import theano 11 | 12 | import matplotlib 13 | # matplotlib.use('Agg') # Change matplotlib backend, in case we have no X server running.. 14 | 15 | from utils.preprocessing import * 16 | from utils.plotting_utils import * 17 | from utils.io import * 18 | 19 | import numpy as np 20 | from lasagne.layers import InputLayer, DenseLayer 21 | from lasagne.nonlinearities import tanh, linear, sigmoid, rectify, leaky_rectify 22 | from lasagne.updates import nesterov_momentum, adadelta, sgd, norm_constraint 23 | from lasagne.objectives import squared_error 24 | from nolearn.lasagne import NeuralNet 25 | 26 | 27 | def configure_theano(): 28 | theano.config.floatX = 'float32' 29 | sys.setrecursionlimit(10000) 30 | 31 | 32 | def load_ae(path, train_params, nonlinearity=sigmoid): 33 | """ 34 | load a pretrained dbn from path 35 | :param path: path to the .mat dbn 36 | :return: pretrained deep belief network 37 | """ 38 | # create the network using weights from pretrain_nn.mat 39 | nn = sio.loadmat(path) 40 | w1 = nn['w1'] 41 | w2 = nn['w2'] 42 | w3 = nn['w3'] 43 | w4 = nn['w4'] 44 | w5 = nn['w5'] 45 | w6 = nn['w6'] 46 | w7 = nn['w7'] 47 | w8 = nn['w8'] 48 | b1 = nn['b1'][0] 49 | b2 = nn['b2'][0] 50 | b3 = nn['b3'][0] 51 | b4 = nn['b4'][0] 52 | b5 = nn['b5'][0] 53 | b6 = nn['b6'][0] 54 | b7 = nn['b7'][0] 55 | b8 = nn['b8'][0] 56 | 57 | layers = [ 58 | (InputLayer, {'name': 'input', 'shape': (None, 1144)}), 59 | (DenseLayer, {'name': 'l1', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w1, 'b': b1}), 60 | (DenseLayer, {'name': 'l2', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w2, 'b': b2}), 61 | (DenseLayer, {'name': 'l3', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w3, 'b': b3}), 62 | (DenseLayer, {'name': 'l4', 'num_units': 50, 'nonlinearity': linear, 'W': w4, 'b': b4}), 63 | (DenseLayer, {'name': 'l5', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w5, 'b': b5}), 64 | (DenseLayer, {'name': 'l6', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w6, 'b': b6}), 65 | (DenseLayer, {'name': 'l7', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w7, 'b': b7}), 66 | (DenseLayer, {'name': 'output', 'num_units': 1144, 'nonlinearity': linear, 'W': w8, 'b': b8}), 67 | ] 68 | 69 | ''' 70 | dbn = NeuralNet( 71 | layers=layers, 72 | max_epochs=30, 73 | objective_loss_function=squared_error, 74 | update=nesterov_momentum, 75 | regression=True, 76 | verbose=1, 77 | update_learning_rate=0.001, 78 | update_momentum=0.05, 79 | objective_l2=0.005, 80 | ) 81 | ''' 82 | 83 | dbn = NeuralNet( 84 | layers=layers, 85 | max_epochs=10, 86 | objective_loss_function=squared_error, 87 | update=adadelta, 88 | regression=True, 89 | verbose=1, 90 | update_learning_rate=0.01, 91 | # update_learning_rate=0.001, 92 | # update_momentum=0.05, 93 | objective_l2=0.005, 94 | ) 95 | return dbn 96 | 97 | 98 | def main(): 99 | configure_theano() 100 | 
config_file = 'config/finetuner.ini' 101 | config = ConfigParser.ConfigParser() 102 | config.read(config_file) 103 | print('loading config file: {}'.format(config_file)) 104 | 105 | print('preprocessing dataset...') 106 | data = load_mat_file(config.get('data', 'images')) 107 | ae_pretrained = config.get('models', 'pretrained') 108 | ae_finetuned = config.get('models', 'finetuned') 109 | do_finetune = config.getboolean('training', 'do_finetune') 110 | save_finetune = config.getboolean('training', 'save_finetune') 111 | load_finetune = config.getboolean('training', 'load_finetune') 112 | train_params = dict() 113 | train_params['max_epochs'] = config.getint('training', 'max_epochs') 114 | train_params['learning_rate'] = config.getfloat('training', 'learning_rate') 115 | train_params['objective_l2'] = config.getfloat('training', 'objective_l2') 116 | 117 | # create the necessary variable mappings 118 | data_matrix = data['dataMatrix'].astype('float32') 119 | data_matrix_len = data_matrix.shape[0] 120 | vid_len_vec = data['videoLengthVec'] 121 | iter_vec = data['iterVec'] 122 | 123 | indexes = create_split_index(data_matrix_len, vid_len_vec, iter_vec) 124 | train_vidlen_vec, test_vidlen_vec = split_videolen(vid_len_vec, iter_vec) 125 | 126 | data_matrix = normalize_input(data_matrix) 127 | 128 | # split the data 129 | train_data = data_matrix[indexes == True] 130 | test_data = data_matrix[indexes == False] 131 | 132 | if do_finetune: 133 | print('performing finetuning...') 134 | ae = load_ae(ae_pretrained, train_params, nonlinearity=rectify) 135 | ae.initialize() 136 | # ae.fit(train_data, train_data) 137 | res = ae.predict(test_data) 138 | # print(res.shape) 139 | visualize_reconstruction(test_data[300:336], res[300:336], shape=(26, 44)) 140 | 141 | if save_finetune: 142 | print('saving finetuned encoder: {}...'.format(ae_finetuned)) 143 | pickle.dump(ae, open(ae_finetuned, 'wb')) 144 | 145 | if load_finetune: 146 | print('loading finetuned encoder: {}'.format(ae_finetuned)) 147 | ae = load_ae(ae_pretrained, train_params) 148 | # ae = pickle.load(open(ae_finetuned, 'rb')) 149 | ae.initialize() 150 | print('performing prediction...') 151 | res = ae.predict(test_data) 152 | visualize_reconstruction(test_data[300:336], res[300:336]) 153 | print('done!') 154 | 155 | 156 | if __name__ == '__main__': 157 | main() 158 | -------------------------------------------------------------------------------- /avletters/ae_finetuner.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../') 4 | import os 5 | import time 6 | import pickle 7 | import ConfigParser 8 | 9 | import theano.tensor as T 10 | import theano 11 | 12 | import matplotlib 13 | # matplotlib.use('Agg') # Change matplotlib backend, in case we have no X server running.. 
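# NOTE: this script mirrors oulu/ae_finetuner.py; the chief difference is the
# ROI dimensionality (1200 inputs for AVLetters vs 1144 for OuluVS).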
14 | 15 | from utils.preprocessing import * 16 | from utils.plotting_utils import * 17 | from utils.io import * 18 | 19 | import numpy as np 20 | from lasagne.layers import InputLayer, DenseLayer 21 | from lasagne.nonlinearities import tanh, linear, sigmoid, rectify, leaky_rectify 22 | from lasagne.updates import nesterov_momentum, adadelta, sgd, norm_constraint 23 | from lasagne.objectives import squared_error 24 | from nolearn.lasagne import NeuralNet 25 | 26 | 27 | def configure_theano(): 28 | theano.config.floatX = 'float32' 29 | sys.setrecursionlimit(10000) 30 | 31 | 32 | def load_ae(path, train_params, nonlinearity=sigmoid): 33 | """ 34 | load a pretrained dbn from path 35 | :param path: path to the .mat dbn 36 | :return: pretrained deep belief network 37 | """ 38 | # create the network using weights from pretrain_nn.mat 39 | nn = sio.loadmat(path) 40 | w1 = nn['w1'] 41 | w2 = nn['w2'] 42 | w3 = nn['w3'] 43 | w4 = nn['w4'] 44 | w5 = nn['w5'] 45 | w6 = nn['w6'] 46 | w7 = nn['w7'] 47 | w8 = nn['w8'] 48 | b1 = nn['b1'][0] 49 | b2 = nn['b2'][0] 50 | b3 = nn['b3'][0] 51 | b4 = nn['b4'][0] 52 | b5 = nn['b5'][0] 53 | b6 = nn['b6'][0] 54 | b7 = nn['b7'][0] 55 | b8 = nn['b8'][0] 56 | 57 | layers = [ 58 | (InputLayer, {'name': 'input', 'shape': (None, 1200)}), 59 | (DenseLayer, {'name': 'l1', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w1, 'b': b1}), 60 | (DenseLayer, {'name': 'l2', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w2, 'b': b2}), 61 | (DenseLayer, {'name': 'l3', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w3, 'b': b3}), 62 | (DenseLayer, {'name': 'l4', 'num_units': 50, 'nonlinearity': linear, 'W': w4, 'b': b4}), 63 | (DenseLayer, {'name': 'l5', 'num_units': 500, 'nonlinearity': nonlinearity, 'W': w5, 'b': b5}), 64 | (DenseLayer, {'name': 'l6', 'num_units': 1000, 'nonlinearity': nonlinearity, 'W': w6, 'b': b6}), 65 | (DenseLayer, {'name': 'l7', 'num_units': 2000, 'nonlinearity': nonlinearity, 'W': w7, 'b': b7}), 66 | (DenseLayer, {'name': 'output', 'num_units': 1200, 'nonlinearity': linear, 'W': w8, 'b': b8}), 67 | ] 68 | 69 | ''' 70 | dbn = NeuralNet( 71 | layers=layers, 72 | max_epochs=30, 73 | objective_loss_function=squared_error, 74 | update=nesterov_momentum, 75 | regression=True, 76 | verbose=1, 77 | update_learning_rate=0.001, 78 | update_momentum=0.05, 79 | objective_l2=0.005, 80 | ) 81 | ''' 82 | 83 | dbn = NeuralNet( 84 | layers=layers, 85 | max_epochs=10, 86 | objective_loss_function=squared_error, 87 | update=adadelta, 88 | regression=True, 89 | verbose=1, 90 | update_learning_rate=0.01, 91 | # update_learning_rate=0.001, 92 | # update_momentum=0.05, 93 | objective_l2=0.005, 94 | ) 95 | return dbn 96 | 97 | 98 | def main(): 99 | configure_theano() 100 | config_file = 'config/finetuner.ini' 101 | config = ConfigParser.ConfigParser() 102 | config.read(config_file) 103 | print('loading config file: {}'.format(config_file)) 104 | 105 | print('preprocessing dataset...') 106 | data = load_mat_file(config.get('data', 'images')) 107 | ae_pretrained = config.get('models', 'pretrained') 108 | ae_finetuned = config.get('models', 'finetuned') 109 | do_finetune = config.getboolean('training', 'do_finetune') 110 | save_finetune = config.getboolean('training', 'save_finetune') 111 | load_finetune = config.getboolean('training', 'load_finetune') 112 | train_params = dict() 113 | train_params['max_epochs'] = config.getint('training', 'max_epochs') 114 | train_params['learning_rate'] = config.getfloat('training', 'learning_rate') 115 | 
train_params['objective_l2'] = config.getfloat('training', 'objective_l2') 116 | 117 | # create the necessary variable mappings 118 | data_matrix = data['dataMatrix'].astype('float32') 119 | data_matrix_len = data_matrix.shape[0] 120 | vid_len_vec = data['videoLengthVec'] 121 | iter_vec = data['iterVec'] 122 | 123 | indexes = create_split_index(data_matrix_len, vid_len_vec, iter_vec) 124 | train_vidlen_vec, test_vidlen_vec = split_videolen(vid_len_vec, iter_vec) 125 | assert len(train_vidlen_vec) == 520 126 | assert len(test_vidlen_vec) == 260 127 | assert np.sum(vid_len_vec) == data_matrix_len 128 | 129 | data_matrix = normalize_input(data_matrix) 130 | 131 | # split the data 132 | train_data = data_matrix[indexes == True] 133 | test_data = data_matrix[indexes == False] 134 | 135 | if do_finetune: 136 | print('performing finetuning...') 137 | ae = load_ae(ae_pretrained, train_params, rectify) 138 | ae.initialize() 139 | #ae.fit(train_data, train_data) 140 | res = ae.predict(test_data) 141 | print(res.shape) 142 | visualize_reconstruction(test_data[300:336], res[300:336]) 143 | 144 | if save_finetune: 145 | print('saving finetuned encoder: {}...'.format(ae_finetuned)) 146 | pickle.dump(ae, open(ae_finetuned, 'wb')) 147 | 148 | if load_finetune: 149 | print('loading finetuned encoder: {}'.format(ae_finetuned)) 150 | ae = load_ae(ae_pretrained, train_params) 151 | # ae = pickle.load(open(ae_finetuned, 'rb')) 152 | ae.initialize() 153 | print('performing prediction...') 154 | res = ae.predict(test_data) 155 | visualize_reconstruction(test_data[300:336], res[300:336]) 156 | print('done!') 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /utils/draw_net.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions to create network diagrams from a list of Layers. 3 | 4 | Examples: 5 | 6 | Draw a minimal diagram to a pdf file: 7 | layers = lasagne.layers.get_all_layers(output_layer) 8 | draw_to_file(layers, 'network.pdf', output_shape=False) 9 | 10 | Draw a verbose diagram in an IPython notebook: 11 | from IPython.display import Image #needed to render in notebook 12 | 13 | layers = lasagne.layers.get_all_layers(output_layer) 14 | dot = get_pydot_graph(layers, verbose=True) 15 | return Image(dot.create_png()) 16 | """ 17 | 18 | import pydot 19 | import lasagne.layers 20 | 21 | 22 | def get_hex_color(layer_type): 23 | """ 24 | Determines the hex color for a layer. Some classes are given 25 | default values, all others are calculated pseudorandomly 26 | from their name. 27 | :parameters: 28 | - layer_type : string 29 | Class name of the layer 30 | 31 | :returns: 32 | - color : string containing a hex color. 
33 | 
34 |     :usage:
35 |         >>> get_hex_color('MaxPool2DDNN')
36 |         '#9D9DD2'
37 |     """
38 | 
39 |     if 'Input' in layer_type:
40 |         return '#A2CECE'
41 |     if 'Conv' in layer_type:
42 |         return '#7C9ABB'
43 |     if 'Dense' in layer_type:
44 |         return '#6CCF8D'
45 |     if 'Pool' in layer_type:
46 |         return '#9D9DD2'
47 |     if 'Slice' in layer_type:
48 |         return '#f6f930'
49 |     if 'LSTM' in layer_type:
50 |         return '#e06b04'
51 |     if 'Reshape' in layer_type:
52 |         return '#e3b029'
53 |     if 'Dropout' in layer_type:
54 |         return '#ffb2ea'
55 |     if 'Delta' in layer_type:
56 |         return '#d7b8ff'
57 |     else:
58 |         return '#{0:x}'.format(hash(layer_type) % 2**24)
59 | 
60 | 
61 | def get_pydot_graph(layers, output_shape=True, verbose=False):
62 |     """
63 |     Creates a PyDot graph of the network defined by the given layers.
64 |     :parameters:
65 |         - layers : list
66 |             List of the layers, as obtained from lasagne.layers.get_all_layers
67 |         - output_shape: (default `True`)
68 |             If `True`, the output shape of each layer will be displayed.
69 |         - verbose: (default `False`)
70 |             If `True`, layer attributes like filter shape, stride, etc.
71 |             will be displayed.
72 | 
73 |     :returns:
74 |         - pydot_graph : PyDot object containing the graph
75 | 
76 |     """
77 |     pydot_graph = pydot.Dot('Network', graph_type='digraph')
78 |     pydot_nodes = {}
79 |     pydot_edges = []
80 |     for i, layer in enumerate(layers):
81 |         layer_type = '{0}: {1}'.format(layer.__class__.__name__, layer.name)
82 |         key = repr(layer)
83 |         label = layer_type
84 |         color = get_hex_color(layer_type)
85 |         if verbose:
86 |             for attr in ['num_filters', 'num_units', 'ds',
87 |                          'filter_shape', 'stride', 'strides', 'p']:
88 |                 if hasattr(layer, attr):
89 |                     label += '\n' + \
90 |                         '{0}: {1}'.format(attr, getattr(layer, attr))
91 |             if hasattr(layer, 'nonlinearity'):
92 |                 try:
93 |                     nonlinearity = layer.nonlinearity.__name__
94 |                 except AttributeError:
95 |                     nonlinearity = layer.nonlinearity.__class__.__name__
96 |                 label += '\n' + 'nonlinearity: {0}'.format(nonlinearity)
97 | 
98 |         if output_shape:
99 |             output_shape = lasagne.layers.get_output_shape(layer)
100 |             if len(output_shape) == 3:
101 |                 output_shape_str = '(Batch Size, Seq Len, {})'.format(output_shape[-1])
102 |             if len(output_shape) == 2:
103 |                 output_shape_str = '(Batch Size x Seq Len, {})'.format(output_shape[-1])
104 |             if layer.name == 'mask':
105 |                 output_shape_str = '(Batch Size, Seq Len)'
106 |             label += '\n' + \
107 |                 'Output shape: {0}'.format(output_shape_str)
108 |         pydot_nodes[key] = pydot.Node(key,
109 |                                       label=label,
110 |                                       shape='record',
111 |                                       fillcolor=color,
112 |                                       style='filled',
113 |                                       )
114 | 
115 |         if hasattr(layer, 'input_layers'):
116 |             for input_layer in layer.input_layers:
117 |                 pydot_edges.append([repr(input_layer), key])
118 | 
119 |         if hasattr(layer, 'input_layer'):
120 |             pydot_edges.append([repr(layer.input_layer), key])
121 | 
122 |     for node in pydot_nodes.values():
123 |         pydot_graph.add_node(node)
124 |     for edge in pydot_edges:
125 |         pydot_graph.add_edge(
126 |             pydot.Edge(pydot_nodes[edge[0]], pydot_nodes[edge[1]]))
127 |     return pydot_graph
128 | 
129 | 
130 | def draw_to_file(layers, filename, **kwargs):
131 |     """
132 |     Draws a network diagram to a file
133 |     :parameters:
134 |         - layers : list
135 |             List of the layers, as obtained from lasagne.layers.get_all_layers
136 |         - filename: string
137 |             The filename to save output to.
138 |         - **kwargs: see docstring of get_pydot_graph for other options
139 |     """
140 |     dot = get_pydot_graph(layers, **kwargs)
141 | 
142 |     ext = filename[filename.rfind('.') + 1:]
143 |     with open(filename, 'w') as fid:
144 |         fid.write(dot.create(format=ext))
145 | 
146 | 
147 | def draw_to_notebook(layers, **kwargs):
148 |     """
149 |     Draws a network diagram in an IPython notebook
150 |     :parameters:
151 |         - layers : list
152 |             List of the layers, as obtained from lasagne.layers.get_all_layers
153 |         - **kwargs: see docstring of get_pydot_graph for other options
154 |     """
155 |     from IPython.display import Image  # needed to render in notebook
156 | 
157 |     dot = get_pydot_graph(layers, **kwargs)
158 |     return Image(dot.create_png())
--------------------------------------------------------------------------------
/modelzoo/adenet_3stream_dct.py:
--------------------------------------------------------------------------------
1 | import theano.tensor as T
2 | 
3 | import lasagne as las
4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, ReshapeLayer, ElemwiseSumLayer
5 | from lasagne.layers import Gate
6 | from lasagne.nonlinearities import tanh
7 | 
8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer, create_blstm
9 | from modelzoo.pretrained_encoder import create_pretrained_encoder
10 | 
11 | 
12 | def create_model(s1_ae, s2_ae, s1_shape, s1_var,
13 |                  s2_shape, s2_var,
14 |                  s3_shape, s3_var,
15 |                  mask_shape, mask_var,
16 |                  lstm_size=250, win=T.iscalar('theta'),
17 |                  output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(),
18 |                  use_peepholes=True):
19 | 
20 |     s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities = s1_ae
21 |     s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae
22 | 
23 |     gate_parameters = Gate(
24 |         W_in=w_init_fn, W_hid=w_init_fn,
25 |         b=las.init.Constant(0.))
26 |     cell_parameters = Gate(
27 |         W_in=w_init_fn, W_hid=w_init_fn,
28 |         # Setting W_cell to None denotes that no cell connection will be used.
29 |         W_cell=None, b=las.init.Constant(0.),
30 |         # By convention, the cell nonlinearity is tanh in an LSTM.
31 | nonlinearity=tanh) 32 | 33 | l_s1 = InputLayer(s1_shape, s1_var, 's1_im') 34 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 35 | l_s2 = InputLayer(s2_shape, s2_var, 's2_im') 36 | l_s3 = InputLayer(s3_shape, s3_var, 's3_im') 37 | 38 | symbolic_batchsize_s1 = l_s1.input_var.shape[0] 39 | symbolic_seqlen_s1 = l_s1.input_var.shape[1] 40 | symbolic_batchsize_s2 = l_s2.input_var.shape[0] 41 | symbolic_seqlen_s2 = l_s2.input_var.shape[1] 42 | 43 | l_reshape1_s1 = ReshapeLayer(l_s1, (-1, s1_shape[-1]), name='reshape1_s1') 44 | l_encoder_s1 = create_pretrained_encoder(l_reshape1_s1, s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities, 45 | ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) 46 | s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] 47 | 48 | l_reshape2_s1 = ReshapeLayer(l_encoder_s1, 49 | (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), 50 | name='reshape2_s1') 51 | l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1') 52 | 53 | # s2 images 54 | l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') 55 | l_encoder_s2 = create_pretrained_encoder(l_reshape1_s2, s2_weights, s2_biases, s2_shapes, 56 | s2_nonlinearities, 57 | ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) 58 | s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] 59 | l_reshape2_s2 = ReshapeLayer(l_encoder_s2, 60 | (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), 61 | name='reshape2_s2') 62 | l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2') 63 | 64 | # s3 images 65 | l_delta_s3 = DeltaLayer(l_s3, win, name='delta_s3') 66 | 67 | l_lstm_s1 = LSTMLayer( 68 | l_delta_s1, int(lstm_size), peepholes=use_peepholes, 69 | # We need to specify a separate input for masks 70 | mask_input=l_mask, 71 | # Here, we supply the gate parameters for each gate 72 | ingate=gate_parameters, forgetgate=gate_parameters, 73 | cell=cell_parameters, outgate=gate_parameters, 74 | # We'll learn the initialization and use gradient clipping 75 | learn_init=True, grad_clipping=5., name='lstm_s1') 76 | 77 | l_lstm_s2 = LSTMLayer( 78 | l_delta_s2, lstm_size, peepholes=use_peepholes, 79 | # We need to specify a separate input for masks 80 | mask_input=l_mask, 81 | # Here, we supply the gate parameters for each gate 82 | ingate=gate_parameters, forgetgate=gate_parameters, 83 | cell=cell_parameters, outgate=gate_parameters, 84 | # We'll learn the initialization and use gradient clipping 85 | learn_init=True, grad_clipping=5., name='lstm_s2') 86 | 87 | l_lstm_s3 = LSTMLayer( 88 | l_delta_s3, lstm_size, peepholes=use_peepholes, 89 | # We need to specify a separate input for masks 90 | mask_input=l_mask, 91 | # Here, we supply the gate parameters for each gate 92 | ingate=gate_parameters, forgetgate=gate_parameters, 93 | cell=cell_parameters, outgate=gate_parameters, 94 | # We'll learn the initialization and use gradient clipping 95 | learn_init=True, grad_clipping=5., name='lstm_s3') 96 | 97 | # We'll combine the forward and backward layer output by summing. 98 | # Merge layers take in lists of layers to merge as input. 
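    # fusiontype selects how the three per-stream LSTM outputs are merged:
    # 'adasum' uses the custom AdaptiveElemwiseSumLayer (not shown here; by its
    # name, a learnable weighted elementwise sum), 'sum' is a plain elementwise
    # sum (all streams must share lstm_size), and 'concat' stacks the streams
    # along the feature axis.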
99 | if fusiontype == 'adasum': 100 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], name='adasum1') 101 | elif fusiontype == 'sum': 102 | l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], name='sum1') 103 | elif fusiontype == 'concat': 104 | l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3], axis=-1, name='concat') 105 | 106 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 107 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 108 | 109 | # reshape to (num_examples * seq_len, lstm_size) 110 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 111 | 112 | # Now, we can apply feed-forward layers as usual. 113 | # We want the network to predict a classification for the sequence, 114 | # so we'll use a the number of classes. 115 | l_softmax = DenseLayer( 116 | l_reshape3, num_units=output_classes, 117 | nonlinearity=las.nonlinearities.softmax, name='softmax') 118 | 119 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') 120 | 121 | return l_out, l_fuse 122 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_4.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 13 | 14 | if cell_parameters is None: 15 | cell_parameters = Gate() 16 | if gate_parameters is None: 17 | gate_parameters = Gate() 18 | 19 | l_lstm = LSTMLayer( 20 | l_incoming, hidden_units, 21 | # We need to specify a separate input for masks 22 | mask_input=l_mask, peepholes=use_peepholes, 23 | # Here, we supply the gate parameters for each gate 24 | ingate=gate_parameters, forgetgate=gate_parameters, 25 | cell=cell_parameters, outgate=gate_parameters, 26 | # We'll learn the initialization and use gradient clipping 27 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 28 | 29 | return l_lstm 30 | 31 | 32 | def create_model(ae, diff_ae, input_shape, input_var, mask_shape, mask_var, 33 | diff_shape, diff_var, lstm_size=250, win=T.iscalar('theta)'), 34 | output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), 35 | use_peepholes=True): 36 | 37 | bn_weights, bn_biases, bn_shapes, bn_nonlinearities = ae 38 | diff_weights, diff_biases, diff_shapes, diff_nonlinearities = diff_ae 39 | 40 | gate_parameters = Gate( 41 | W_in=w_init_fn, W_hid=w_init_fn, 42 | b=las.init.Constant(0.)) 43 | cell_parameters = Gate( 44 | W_in=w_init_fn, W_hid=w_init_fn, 45 | # Setting W_cell to None denotes that no cell connection will be used. 46 | W_cell=None, b=las.init.Constant(0.), 47 | # By convention, the cell nonlinearity is tanh in an LSTM. 
48 | nonlinearity=tanh) 49 | 50 | l_raw = InputLayer(input_shape, input_var, 'raw_im') 51 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 52 | l_diff = InputLayer(diff_shape, diff_var, 'diff_im') 53 | 54 | symbolic_batchsize_raw = l_raw.input_var.shape[0] 55 | symbolic_seqlen_raw = l_raw.input_var.shape[1] 56 | symbolic_batchsize_diff = l_diff.input_var.shape[0] 57 | symbolic_seqlen_diff = l_diff.input_var.shape[1] 58 | 59 | l_reshape1_raw = ReshapeLayer(l_raw, (-1, input_shape[-1]), name='reshape1_raw') 60 | l_encoder_raw = create_pretrained_encoder(l_reshape1_raw, bn_weights, bn_biases, bn_shapes, bn_nonlinearities, 61 | ['fc1_raw', 'fc2_raw', 'fc3_raw', 'bottleneck_raw']) 62 | raw_len = las.layers.get_output_shape(l_encoder_raw)[-1] 63 | 64 | l_reshape2_raw = ReshapeLayer(l_encoder_raw, 65 | (symbolic_batchsize_raw, symbolic_seqlen_raw, raw_len), 66 | name='reshape2_raw') 67 | l_delta_raw = DeltaLayer(l_reshape2_raw, win, name='delta_raw') 68 | 69 | # diff images 70 | l_reshape1_diff = ReshapeLayer(l_diff, (-1, diff_shape[-1]), name='reshape1_diff') 71 | l_encoder_diff = create_pretrained_encoder(l_reshape1_diff, diff_weights, diff_biases, diff_shapes, 72 | diff_nonlinearities, 73 | ['fc1_diff', 'fc2_diff', 'fc3_diff', 'bottleneck_diff']) 74 | diff_len = las.layers.get_output_shape(l_encoder_diff)[-1] 75 | l_reshape2_diff = ReshapeLayer(l_encoder_diff, 76 | (symbolic_batchsize_diff, symbolic_seqlen_diff, diff_len), 77 | name='reshape2_diff') 78 | l_delta_diff = DeltaLayer(l_reshape2_diff, win, name='delta_diff') 79 | 80 | l_lstm_raw = LSTMLayer( 81 | l_delta_raw, int(lstm_size), peepholes=use_peepholes, 82 | # We need to specify a separate input for masks 83 | mask_input=l_mask, 84 | # Here, we supply the gate parameters for each gate 85 | ingate=gate_parameters, forgetgate=gate_parameters, 86 | cell=cell_parameters, outgate=gate_parameters, 87 | # We'll learn the initialization and use gradient clipping 88 | learn_init=True, grad_clipping=5., name='lstm_raw') 89 | 90 | l_lstm_diff = LSTMLayer( 91 | l_delta_diff, lstm_size, peepholes=use_peepholes, 92 | # We need to specify a separate input for masks 93 | mask_input=l_mask, 94 | # Here, we supply the gate parameters for each gate 95 | ingate=gate_parameters, forgetgate=gate_parameters, 96 | cell=cell_parameters, outgate=gate_parameters, 97 | # We'll learn the initialization and use gradient clipping 98 | learn_init=True, grad_clipping=5., name='lstm_diff') 99 | 100 | # We'll combine the forward and backward layer output by summing. 101 | # Merge layers take in lists of layers to merge as input. 102 | if fusiontype == 'adasum': 103 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_raw, l_lstm_diff], name='adasum1') 104 | elif fusiontype == 'sum': 105 | l_fuse = ElemwiseSumLayer([l_lstm_raw, l_lstm_diff], name='sum1') 106 | elif fusiontype == 'concat': 107 | l_fuse = ConcatLayer([l_lstm_raw, l_lstm_diff], axis=-1, name='concat') 108 | 109 | f_lstm_agg = create_lstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 110 | 111 | # reshape to (num_examples * seq_len, lstm_size) 112 | l_reshape3 = ReshapeLayer(f_lstm_agg, (-1, lstm_size)) 113 | 114 | # Now, we can apply feed-forward layers as usual. 115 | # We want the network to predict a classification for the sequence, 116 | # so we'll use a the number of classes. 
117 | l_softmax = DenseLayer( 118 | l_reshape3, num_units=output_classes, 119 | nonlinearity=las.nonlinearities.softmax, name='softmax') 120 | 121 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_raw, output_classes), name='output') 122 | 123 | return l_out, l_fuse 124 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_nodelta.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify, leaky_rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 13 | 14 | if cell_parameters is None: 15 | cell_parameters = Gate() 16 | if gate_parameters is None: 17 | gate_parameters = Gate() 18 | 19 | l_lstm = LSTMLayer( 20 | l_incoming, hidden_units, 21 | # We need to specify a separate input for masks 22 | mask_input=l_mask, peepholes=use_peepholes, 23 | # Here, we supply the gate parameters for each gate 24 | ingate=gate_parameters, forgetgate=gate_parameters, 25 | cell=cell_parameters, outgate=gate_parameters, 26 | # We'll learn the initialization and use gradient clipping 27 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 28 | 29 | # The "backwards" layer is the same as the first, 30 | # except that the backwards argument is set to True. 31 | l_lstm_back = LSTMLayer( 32 | l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes, 33 | mask_input=l_mask, forgetgate=gate_parameters, 34 | cell=cell_parameters, outgate=gate_parameters, 35 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 36 | 37 | return l_lstm, l_lstm_back 38 | 39 | 40 | def create_model(ae, s2_ae, input_shape, input_var, mask_shape, mask_var, 41 | s2_shape, s2_var, lstm_size=250, 42 | output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), 43 | use_peepholes=True): 44 | 45 | bn_weights, bn_biases, bn_shapes, bn_nonlinearities = ae 46 | s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae 47 | 48 | gate_parameters = Gate( 49 | W_in=w_init_fn, W_hid=w_init_fn, 50 | b=las.init.Constant(0.)) 51 | cell_parameters = Gate( 52 | W_in=w_init_fn, W_hid=w_init_fn, 53 | # Setting W_cell to None denotes that no cell connection will be used. 54 | W_cell=None, b=las.init.Constant(0.), 55 | # By convention, the cell nonlinearity is tanh in an LSTM. 
56 | nonlinearity=tanh) 57 | 58 | l_s1 = InputLayer(input_shape, input_var, 's1_im') 59 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 60 | l_s2 = InputLayer(s2_shape, s2_var, 's2_im') 61 | 62 | symbolic_batchsize_s1 = l_s1.input_var.shape[0] 63 | symbolic_seqlen_s1 = l_s1.input_var.shape[1] 64 | symbolic_batchsize_s2 = l_s2.input_var.shape[0] 65 | symbolic_seqlen_s2 = l_s2.input_var.shape[1] 66 | 67 | l_reshape1_s1 = ReshapeLayer(l_s1, (-1, input_shape[-1]), name='reshape1_s1') 68 | l_encoder_s1 = create_pretrained_encoder(l_reshape1_s1, bn_weights, bn_biases, bn_shapes, bn_nonlinearities, 69 | ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) 70 | s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] 71 | 72 | l_reshape2_s1 = ReshapeLayer(l_encoder_s1, 73 | (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), 74 | name='reshape2_s1') 75 | 76 | # s2 images 77 | l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') 78 | l_encoder_s2 = create_pretrained_encoder(l_reshape1_s2, s2_weights, s2_biases, s2_shapes, 79 | s2_nonlinearities, 80 | ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) 81 | s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] 82 | l_reshape2_s2 = ReshapeLayer(l_encoder_s2, 83 | (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), 84 | name='reshape2_s2') 85 | 86 | l_lstm_s1 = LSTMLayer( 87 | l_reshape2_s1, int(lstm_size), peepholes=use_peepholes, 88 | # We need to specify a separate input for masks 89 | mask_input=l_mask, 90 | # Here, we supply the gate parameters for each gate 91 | ingate=gate_parameters, forgetgate=gate_parameters, 92 | cell=cell_parameters, outgate=gate_parameters, 93 | # We'll learn the initialization and use gradient clipping 94 | learn_init=True, grad_clipping=5., name='lstm_s1') 95 | 96 | l_lstm_s2 = LSTMLayer( 97 | l_reshape2_s2, lstm_size, peepholes=use_peepholes, 98 | # We need to specify a separate input for masks 99 | mask_input=l_mask, 100 | # Here, we supply the gate parameters for each gate 101 | ingate=gate_parameters, forgetgate=gate_parameters, 102 | cell=cell_parameters, outgate=gate_parameters, 103 | # We'll learn the initialization and use gradient clipping 104 | learn_init=True, grad_clipping=5., name='lstm_s2') 105 | 106 | # We'll combine the forward and backward layer output by summing. 107 | # Merge layers take in lists of layers to merge as input. 108 | if fusiontype == 'adasum': 109 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='adasum1') 110 | elif fusiontype == 'sum': 111 | l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='sum1') 112 | elif fusiontype == 'concat': 113 | l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2], axis=-1, name='concat') 114 | 115 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 116 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 117 | 118 | # reshape to (num_examples * seq_len, lstm_size) 119 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 120 | 121 | # Now, we can apply feed-forward layers as usual. 122 | # We want the network to predict a classification for the sequence, 123 | # so we'll use a the number of classes. 
124 | l_softmax = DenseLayer( 125 | l_reshape3, num_units=output_classes, 126 | nonlinearity=las.nonlinearities.softmax, name='softmax') 127 | 128 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') 129 | 130 | return l_out, l_fuse 131 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_2.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer, GlobalPoolLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify, leaky_rectify 7 | 8 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 9 | from modelzoo.pretrained_encoder import create_pretrained_encoder 10 | 11 | 12 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 13 | 14 | if cell_parameters is None: 15 | cell_parameters = Gate() 16 | if gate_parameters is None: 17 | gate_parameters = Gate() 18 | 19 | l_lstm = LSTMLayer( 20 | l_incoming, hidden_units, 21 | # We need to specify a separate input for masks 22 | mask_input=l_mask, peepholes=use_peepholes, 23 | # Here, we supply the gate parameters for each gate 24 | ingate=gate_parameters, forgetgate=gate_parameters, 25 | cell=cell_parameters, outgate=gate_parameters, 26 | # We'll learn the initialization and use gradient clipping 27 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 28 | 29 | # The "backwards" layer is the same as the first, 30 | # except that the backwards argument is set to True. 31 | l_lstm_back = LSTMLayer( 32 | l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes, 33 | mask_input=l_mask, forgetgate=gate_parameters, 34 | cell=cell_parameters, outgate=gate_parameters, 35 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 36 | 37 | return l_lstm, l_lstm_back 38 | 39 | 40 | def create_model(ae, s2_ae, input_shape, input_var, mask_shape, mask_var, 41 | s2_shape, s2_var, lstm_size=250, win=T.iscalar('theta'), 42 | output_classes=26, fusiontype='concat', w_init_fn=las.init.Orthogonal(), 43 | use_peepholes=True): 44 | 45 | bn_weights, bn_biases, bn_shapes, bn_nonlinearities = ae 46 | s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae 47 | 48 | gate_parameters = Gate( 49 | W_in=w_init_fn, W_hid=w_init_fn, 50 | b=las.init.Constant(0.)) 51 | cell_parameters = Gate( 52 | W_in=w_init_fn, W_hid=w_init_fn, 53 | # Setting W_cell to None denotes that no cell connection will be used. 54 | W_cell=None, b=las.init.Constant(0.), 55 | # By convention, the cell nonlinearity is tanh in an LSTM.
56 | nonlinearity=tanh) 57 | 58 | l_s1 = InputLayer(input_shape, input_var, 's1_im') 59 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 60 | l_s2 = InputLayer(s2_shape, s2_var, 's2_im') 61 | 62 | symbolic_batchsize_s1 = l_s1.input_var.shape[0] 63 | symbolic_seqlen_s1 = l_s1.input_var.shape[1] 64 | symbolic_batchsize_s2 = l_s2.input_var.shape[0] 65 | symbolic_seqlen_s2 = l_s2.input_var.shape[1] 66 | 67 | l_reshape1_s1 = ReshapeLayer(l_s1, (-1, input_shape[-1]), name='reshape1_s1') 68 | l_encoder_s1 = create_pretrained_encoder(l_reshape1_s1, bn_weights, bn_biases, bn_shapes, bn_nonlinearities, 69 | ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1']) 70 | s1_len = las.layers.get_output_shape(l_encoder_s1)[-1] 71 | 72 | l_reshape2_s1 = ReshapeLayer(l_encoder_s1, 73 | (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len), 74 | name='reshape2_s1') 75 | l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1') 76 | 77 | # s2 images 78 | l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2') 79 | l_encoder_s2 = create_pretrained_encoder(l_reshape1_s2, s2_weights, s2_biases, s2_shapes, 80 | s2_nonlinearities, 81 | ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2']) 82 | s2_len = las.layers.get_output_shape(l_encoder_s2)[-1] 83 | l_reshape2_s2 = ReshapeLayer(l_encoder_s2, 84 | (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len), 85 | name='reshape2_s2') 86 | l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2') 87 | 88 | l_lstm_s1 = LSTMLayer( 89 | l_delta_s1, lstm_size, peepholes=use_peepholes, 90 | # We need to specify a separate input for masks 91 | mask_input=l_mask, 92 | # Here, we supply the gate parameters for each gate 93 | ingate=gate_parameters, forgetgate=gate_parameters, 94 | cell=cell_parameters, outgate=gate_parameters, 95 | # We'll learn the initialization and use gradient clipping 96 | learn_init=True, grad_clipping=5., name='lstm_s1') 97 | 98 | l_lstm_s2 = LSTMLayer( 99 | l_delta_s2, lstm_size, peepholes=use_peepholes, 100 | # We need to specify a separate input for masks 101 | mask_input=l_mask, 102 | # Here, we supply the gate parameters for each gate 103 | ingate=gate_parameters, forgetgate=gate_parameters, 104 | cell=cell_parameters, outgate=gate_parameters, 105 | # We'll learn the initialization and use gradient clipping 106 | learn_init=True, grad_clipping=5., name='lstm_s2') 107 | 108 | # We'll fuse the outputs of the two stream LSTMs. 109 | # Merge layers take in lists of layers to merge as input. 110 | if fusiontype == 'adasum': 111 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='adasum1') 112 | elif fusiontype == 'sum': 113 | l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2], name='sum1') 114 | elif fusiontype == 'concat': 115 | l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2], axis=-1, name='concat') 116 | 117 | f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 118 | l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 119 | 120 | # reshape to (num_examples * seq_len, lstm_size) 121 | l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3') 122 | 123 | # Now, we can apply feed-forward layers as usual. 124 | # We want the network to predict a classification for every time step, 125 | # so we'll use the number of classes as the output size.
126 | l_softmax = DenseLayer( 127 | l_reshape3, num_units=output_classes, 128 | nonlinearity=las.nonlinearities.softmax, name='softmax') 129 | 130 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes), name='output') 131 | 132 | return l_out, l_fuse 133 | -------------------------------------------------------------------------------- /landmarking/landmarker.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import os 3 | import errno 4 | import csv 5 | import argparse 6 | 7 | import menpo.io as mio 8 | from menpo.visualize import print_progress 9 | from menpodetect.dlib import load_dlib_frontal_face_detector 10 | from menpofit.dlib import DlibWrapper 11 | 12 | # constants, change according to system 13 | FACE_MODEL_PATH = '../config/shape_predictor_68_face_landmarks.dat' 14 | EXT = ['.mp4', '.mov', '.mpg'] 15 | NO_LANDMARKS = 68 16 | 17 | 18 | def find_all_videos(dir, ext=EXT, relpath=False): 19 | # get the absolute path of the file 20 | abspath = os.path.abspath(dir) 21 | videofiles = [] 22 | find_all_videos_impl(abspath, videofiles, ext) 23 | if relpath: 24 | for i, f in enumerate(videofiles): 25 | videofiles[i] = f[len(dir) + 1:] 26 | return videofiles 27 | 28 | 29 | def find_all_videos_impl(dir, videofiles, ext): 30 | files = os.listdir(dir) 31 | for f in files: 32 | path = os.path.join(dir, f) 33 | if os.path.isdir(path): 34 | find_all_videos_impl(path, videofiles, ext) 35 | elif os.path.splitext(f)[1] in ext: 36 | videofiles.append(path) 37 | 38 | 39 | def is_video(file, ext=EXT): 40 | return os.path.splitext(file)[1] in ext 41 | 42 | 43 | def fit_image(image): 44 | # Face detection 45 | bboxes = fit_image.detect(image, image_diagonal=1000) 46 | 47 | # Check if at least one face was detected, otherwise throw a warning 48 | if len(bboxes) > 0: 49 | # Use the first bounding box (the one most likely to represent a face) to initialise 50 | fitting_result = fit_image.fitter.fit_from_bb(image, bboxes[0]) 51 | 52 | # Assign shape on the image 53 | image.landmarks['final_shape'] = fitting_result.final_shape 54 | else: 55 | # Throw warning if no face was detected 56 | warnings.warn('No face detected') 57 | 58 | # Return the image 59 | return image 60 | 61 | 62 | def create_dir(dir): 63 | if not os.path.exists(dir): 64 | try: 65 | os.makedirs(dir) 66 | except OSError as exc: # Guard against race condition 67 | if exc.errno != errno.EEXIST: 68 | raise 69 | 70 | 71 | def fill_row(outwriter, frame_no, row): 72 | outwriter.writerow([frame_no] + row) 73 | 74 | 75 | def process_video(file, dest): 76 | if is_video(file): 77 | try: 78 | frames = mio.import_video(file, normalise=False) 79 | except IOError: 80 | warnings.warn('IO error reading video file {}, the file may be corrupted or the video format is unsupported, skipping...'.format(file)) 81 | return 82 | except ValueError as e: 83 | warnings.warn('Value Error reading video file {}, '.format(file) + 84 | e.message) 85 | return 86 | # create the output directory if dest includes a directory component 87 | if os.path.dirname(dest): 88 | create_dir(os.path.dirname(dest)) 89 | print('{} contains {} frames'.format(file, len(frames))) 90 | print('writing landmarks to {}...'.format(dest)) 91 | frames = frames.map(fit_image) 92 | with open(dest, 'w') as outputfile: 93 | outwriter = csv.writer(outputfile) 94 | try: 95 | for i, frame in enumerate(print_progress(frames)): 96 | if 'final_shape' not in frame.landmarks: 97 | warnings.warn('no faces detected in the frame {}, ' 98 | 'initializing landmarks to
-1s...'.format(i)) 99 | # dlib does not support fitting from a previous initial shape, so 100 | # leave the entire row as -1s 101 | # initial_shape = frames[i - 1].landmarks['final_shape'].lms 102 | # fitting_result = fit_image.fitter.fit_from_shape(frame, initial_shape) 103 | # frame.landmarks['final_shape'] = fitting_result.final_shape 104 | landmarks = [-1] * NO_LANDMARKS*2 105 | else: 106 | lmg = frame.landmarks['final_shape'] 107 | landmarks = lmg['all'].points.reshape((NO_LANDMARKS*2,)).tolist() # flatten to 136 values (68 x/y pairs) 108 | fill_row(outwriter, i, landmarks) 109 | except Exception as e: 110 | warnings.warn('Runtime Error at frame {}: {}'.format(i, e)) 111 | print('initializing landmarks to -1s...') 112 | fill_row(outwriter, i, [-1] * NO_LANDMARKS*2) 113 | 114 | 115 | def parse_options(): 116 | options = dict() 117 | parser = argparse.ArgumentParser() 118 | options['model'] = FACE_MODEL_PATH 119 | parser.add_argument('--input_dir', help='directory to search for videos, supported formats [.mov, .mpg, .mp4]') 120 | parser.add_argument('--output_dir', help='output directory to store the landmarks') 121 | parser.add_argument('--model', help='location of landmark model file. ' 122 | 'Default: ../config/shape_predictor_68_face_landmarks.dat') 123 | parser.add_argument('--file', help='perform landmarking on a single file') 124 | parser.add_argument('--output', help='output landmark file name, if not specified ' 125 | 'creates landmark file in current directory') 126 | args = parser.parse_args() 127 | if args.input_dir: 128 | options['input_dir'] = args.input_dir 129 | if args.output_dir: 130 | options['output_dir'] = args.output_dir 131 | if args.model: 132 | options['model'] = args.model 133 | if args.file: 134 | options['file'] = args.file 135 | if args.output: 136 | options['output'] = args.output 137 | return options 138 | 139 | 140 | if __name__ == '__main__': 141 | options = parse_options() 142 | fit_image.detect = load_dlib_frontal_face_detector() 143 | fit_image.fitter = DlibWrapper(options['model']) 144 | 145 | if 'file' in options: 146 | video_file = options['file'] 147 | video_file_basename = os.path.basename(video_file) 148 | print('Generating Landmarks from {}'.format(video_file)) 149 | output = options['output'] if 'output' in options else os.path.splitext(video_file_basename)[0] + '.csv' 150 | process_video(video_file, output) 151 | exit() 152 | 153 | print('Generating Landmarks from {}'.format(options['input_dir'])) 154 | videofiles = find_all_videos(options['input_dir'], relpath=False) 155 | videofiles.sort() 156 | print('Found {} video(s)...'.format(len(videofiles))) 157 | input_dir = os.path.abspath(options['input_dir']) 158 | output_dir = os.path.abspath(options['output_dir']) 159 | for video in videofiles: 160 | relative_path = video[len(input_dir) + 1:] 161 | landmarkfile = os.path.join(output_dir, os.path.splitext(relative_path)[0] + '.csv') 162 | process_video(video, landmarkfile) 163 | print('All Done!') 164 | -------------------------------------------------------------------------------- /modelzoo/adenet_v2_3.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear, rectify 7 | from lasagne.layers import batch_norm, BatchNormLayer 8 | 9 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer
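# Project-local layers (defined in custom.layers): DeltaLayer is assumed to append
# temporal delta features computed over a `win`-frame window, and
# AdaptiveElemwiseSumLayer presumably learns per-stream scaling coefficients
# before summing the fused inputs.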
10 | from modelzoo.pretrained_encoder import create_pretrained_encoder 11 | 12 | 13 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 14 | 15 | if cell_parameters is None: 16 | cell_parameters = Gate() 17 | if gate_parameters is None: 18 | gate_parameters = Gate() 19 | 20 | l_lstm = LSTMLayer( 21 | l_incoming, hidden_units, peepholes=use_peepholes, 22 | # We need to specify a separate input for masks 23 | mask_input=l_mask, 24 | # Here, we supply the gate parameters for each gate 25 | ingate=gate_parameters, forgetgate=gate_parameters, 26 | cell=cell_parameters, outgate=gate_parameters, 27 | # We'll learn the initialization and use gradient clipping 28 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 29 | 30 | # The "backwards" layer is the same as the first, 31 | # except that the backwards argument is set to True. 32 | l_lstm_back = LSTMLayer( 33 | l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes, 34 | mask_input=l_mask, forgetgate=gate_parameters, 35 | cell=cell_parameters, outgate=gate_parameters, 36 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 37 | 38 | return l_lstm, l_lstm_back 39 | 40 | 41 | def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True): 42 | 43 | if cell_parameters is None: 44 | cell_parameters = Gate() 45 | if gate_parameters is None: 46 | gate_parameters = Gate() 47 | 48 | l_lstm = LSTMLayer( 49 | l_incoming, hidden_units, peepholes=use_peepholes, 50 | # We need to specify a separate input for masks 51 | mask_input=l_mask, 52 | # Here, we supply the gate parameters for each gate 53 | ingate=gate_parameters, forgetgate=gate_parameters, 54 | cell=cell_parameters, outgate=gate_parameters, 55 | # We'll learn the initialization and use gradient clipping 56 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 57 | 58 | return l_lstm 59 | 60 | 61 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 62 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'), 63 | output_classes=26, fusiontype='sum', w_init_fn=las.init.Orthogonal(), 64 | use_peepholes=True): 65 | 66 | dbn_layers = dbn.get_all_layers() 67 | weights = [] 68 | biases = [] 69 | shapes = [2000, 1000, 500, 50] 70 | nonlinearities = [rectify, rectify, rectify, linear] 71 | weights.append(dbn_layers[1].W.astype('float32')) 72 | weights.append(dbn_layers[2].W.astype('float32')) 73 | weights.append(dbn_layers[3].W.astype('float32')) 74 | weights.append(dbn_layers[4].W.astype('float32')) 75 | biases.append(dbn_layers[1].b.astype('float32')) 76 | biases.append(dbn_layers[2].b.astype('float32')) 77 | biases.append(dbn_layers[3].b.astype('float32')) 78 | biases.append(dbn_layers[4].b.astype('float32')) 79 | 80 | gate_parameters = Gate( 81 | W_in=w_init_fn, W_hid=w_init_fn, 82 | b=las.init.Constant(0.)) 83 | cell_parameters = Gate( 84 | W_in=w_init_fn, W_hid=w_init_fn, 85 | # Setting W_cell to None denotes that no cell connection will be used. 86 | W_cell=None, b=las.init.Constant(0.), 87 | # By convention, the cell nonlinearity is tanh in an LSTM.
88 | nonlinearity=tanh) 89 | 90 | l_in = InputLayer(input_shape, input_var, 'input') 91 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 92 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 93 | 94 | symbolic_batchsize = l_in.input_var.shape[0] 95 | symbolic_seqlen = l_in.input_var.shape[1] 96 | 97 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 98 | l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes, nonlinearities, 99 | ['fc1', 'fc2', 'fc3', 'bottleneck']) 100 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 101 | l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 102 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 103 | 104 | l_lstm_bn = LSTMLayer( 105 | l_delta, lstm_size, peepholes=use_peepholes, 106 | # We need to specify a separate input for masks 107 | mask_input=l_mask, 108 | # Here, we supply the gate parameters for each gate 109 | ingate=gate_parameters, forgetgate=gate_parameters, 110 | cell=cell_parameters, outgate=gate_parameters, 111 | # We'll learn the initialization and use gradient clipping 112 | learn_init=True, grad_clipping=5., name='lstm_bn') 113 | 114 | l_lstm_dct = LSTMLayer( 115 | l_dct, lstm_size, peepholes=use_peepholes, 116 | # We need to specify a separate input for masks 117 | mask_input=l_mask, 118 | # Here, we supply the gate parameters for each gate 119 | ingate=gate_parameters, forgetgate=gate_parameters, 120 | cell=cell_parameters, outgate=gate_parameters, 121 | # We'll learn the initialization and use gradient clipping 122 | learn_init=True, grad_clipping=5., name='lstm_dct') 123 | 124 | # We'll fuse the outputs of the bottleneck and DCT stream LSTMs. 125 | # Merge layers take in lists of layers to merge as input. 126 | 127 | if fusiontype == 'sum': 128 | l_fuse = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1') 129 | elif fusiontype == 'adasum': 130 | l_fuse = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum') 131 | elif fusiontype == 'concat': 132 | l_fuse = ConcatLayer([l_lstm_bn, l_lstm_dct], axis=2, name='concat') 133 | 134 | f_lstm_agg = create_lstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 135 | 136 | # reshape to (num_examples * seq_len, lstm_size) 137 | l_reshape3 = ReshapeLayer(f_lstm_agg, (-1, lstm_size), name='reshape3') 138 | 139 | # l_forward_slice1 = SliceLayer(l_sum2, -1, 1, name='slice1') 140 | 141 | # Now, we can apply feed-forward layers as usual. 142 | # We want the network to predict a classification for every time step, 143 | # so we'll use the number of classes as the output size.
144 | l_softmax = DenseLayer( 145 | l_reshape3, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='softmax') 146 | 147 | l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output') 148 | 149 | return l_out, l_fuse 150 | -------------------------------------------------------------------------------- /modelzoo/adenet_v4.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | import lasagne as las 4 | from lasagne.layers import InputLayer, LSTMLayer, DenseLayer, ConcatLayer, SliceLayer, ReshapeLayer, ElemwiseSumLayer 5 | from lasagne.layers import Gate, DropoutLayer 6 | from lasagne.nonlinearities import tanh, sigmoid, linear 7 | from lasagne.layers import batch_norm 8 | 9 | from custom.layers import DeltaLayer, AdaptiveElemwiseSumLayer 10 | 11 | 12 | def create_pretrained_encoder(weights, biases, incoming): 13 | l_1 = DenseLayer(incoming, 2000, W=weights[0], b=biases[0], nonlinearity=sigmoid, name='fc1') 14 | l_2 = DenseLayer(l_1, 1000, W=weights[1], b=biases[1], nonlinearity=sigmoid, name='fc2') 15 | l_3 = DenseLayer(l_2, 500, W=weights[2], b=biases[2], nonlinearity=sigmoid, name='fc3') 16 | l_4 = DenseLayer(l_3, 50, W=weights[3], b=biases[3], nonlinearity=linear, name='bottleneck') 17 | return l_4 18 | 19 | 20 | def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name): 21 | 22 | if cell_parameters is None: 23 | cell_parameters = Gate() 24 | if gate_parameters is None: 25 | gate_parameters = Gate() 26 | 27 | l_lstm = LSTMLayer( 28 | l_incoming, hidden_units, 29 | # We need to specify a separate input for masks 30 | mask_input=l_mask, 31 | # Here, we supply the gate parameters for each gate 32 | ingate=gate_parameters, forgetgate=gate_parameters, 33 | cell=cell_parameters, outgate=gate_parameters, 34 | # We'll learn the initialization and use gradient clipping 35 | learn_init=True, grad_clipping=5., name='f_{}'.format(name)) 36 | 37 | # The "backwards" layer is the same as the first, 38 | # except that the backwards argument is set to True. 39 | l_lstm_back = LSTMLayer( 40 | l_incoming, hidden_units, ingate=gate_parameters, 41 | mask_input=l_mask, forgetgate=gate_parameters, 42 | cell=cell_parameters, outgate=gate_parameters, 43 | learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name)) 44 | 45 | return l_lstm, l_lstm_back 46 | 47 | 48 | def create_model(dbn, input_shape, input_var, mask_shape, mask_var, 49 | dct_shape, dct_var, lstm_size=250, win=T.iscalar('theta'), 50 | output_classes=26): 51 | 52 | dbn_layers = dbn.get_all_layers() 53 | weights = [] 54 | biases = [] 55 | weights.append(dbn_layers[1].W.astype('float32')) 56 | weights.append(dbn_layers[2].W.astype('float32')) 57 | weights.append(dbn_layers[3].W.astype('float32')) 58 | weights.append(dbn_layers[4].W.astype('float32')) 59 | biases.append(dbn_layers[1].b.astype('float32')) 60 | biases.append(dbn_layers[2].b.astype('float32')) 61 | biases.append(dbn_layers[3].b.astype('float32')) 62 | biases.append(dbn_layers[4].b.astype('float32')) 63 | 64 | gate_parameters = Gate( 65 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 66 | b=las.init.Constant(0.)) 67 | cell_parameters = Gate( 68 | W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(), 69 | # Setting W_cell to None denotes that no cell connection will be used. 70 | W_cell=None, b=las.init.Constant(0.), 71 | # By convention, the cell nonlinearity is tanh in an LSTM.
72 | nonlinearity=tanh) 73 | 74 | l_in = InputLayer(input_shape, input_var, 'input') 75 | l_mask = InputLayer(mask_shape, mask_var, 'mask') 76 | l_dct = InputLayer(dct_shape, dct_var, 'dct') 77 | 78 | symbolic_batchsize = l_in.input_var.shape[0] 79 | symbolic_seqlen = l_in.input_var.shape[1] 80 | 81 | l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1') 82 | l_encoder = create_pretrained_encoder(weights, biases, l_reshape1) 83 | encoder_len = las.layers.get_output_shape(l_encoder)[-1] 84 | l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2') 85 | l_delta = DeltaLayer(l_reshape2, win, name='delta') 86 | l_delta_drop = DropoutLayer(l_delta, name='dropout_delta') 87 | l_dct_drop = DropoutLayer(l_dct, p=0.2, name='dropout_dct') 88 | 89 | l_lstm_bn = LSTMLayer( 90 | l_delta_drop, lstm_size * 2, 91 | # We need to specify a separate input for masks 92 | mask_input=l_mask, 93 | # Here, we supply the gate parameters for each gate 94 | ingate=gate_parameters, forgetgate=gate_parameters, 95 | cell=cell_parameters, outgate=gate_parameters, 96 | # We'll learn the initialization and use gradient clipping 97 | learn_init=True, grad_clipping=5., name='lstm_bn') 98 | 99 | l_lstm_dct = LSTMLayer( 100 | l_dct_drop, lstm_size * 2, 101 | # We need to specify a separate input for masks 102 | mask_input=l_mask, 103 | # Here, we supply the gate parameters for each gate 104 | ingate=gate_parameters, forgetgate=gate_parameters, 105 | cell=cell_parameters, outgate=gate_parameters, 106 | # We'll learn the initialization and use gradient clipping 107 | learn_init=True, grad_clipping=5., name='lstm_dct') 108 | 109 | # We'll fuse the bottleneck and DCT stream LSTM outputs by summing. 110 | # Merge layers take in lists of layers to merge as input. 111 | # l_sum1 = AdaptiveElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='adasum1') 112 | l_sum1 = ElemwiseSumLayer([l_lstm_bn, l_lstm_dct], name='sum1') 113 | l_sum1_drop = DropoutLayer(l_sum1, name='dropout_agg') 114 | # f_lstm_agg, b_lstm_agg = create_blstm(l_sum1, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg') 115 | 116 | l_lstm_agg = LSTMLayer( 117 | l_sum1_drop, lstm_size * 2, 118 | # We need to specify a separate input for masks 119 | mask_input=l_mask, 120 | # Here, we supply the gate parameters for each gate 121 | ingate=gate_parameters, forgetgate=gate_parameters, 122 | cell=cell_parameters, outgate=gate_parameters, 123 | # We'll learn the initialization and use gradient clipping 124 | learn_init=True, grad_clipping=5., name='lstm_agg') 125 | 126 | ''' 127 | # implement drop-out regularization 128 | l_dropout = DropoutLayer(l_sum1, p=0.4, name='dropout1') 129 | 130 | l_lstm2, l_lstm2_back = create_blstm(l_dropout, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm2') 131 | 132 | # We'll combine the forward and backward layer output by summing. 133 | # Merge layers take in lists of layers to merge as input. 134 | l_sum2 = ElemwiseSumLayer([l_lstm2, l_lstm2_back]) 135 | ''' 136 | 137 | # l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2') 138 | 139 | l_forward_slice1 = SliceLayer(l_lstm_agg, -1, 1, name='slice1') 140 | 141 | # Now, we can apply feed-forward layers as usual. 142 | # We want the network to predict a classification for the sequence, 143 | # so we'll use the number of classes as the output size.
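# Note: unlike the adenet_v2 variants above, slice1 keeps only the last time step
# of the aggregation LSTM, so the dense softmax below emits a single prediction
# per sequence rather than one per frame.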
144 | l_out = DenseLayer( 145 | l_forward_slice1, num_units=output_classes, nonlinearity=las.nonlinearities.softmax, name='output') 146 | 147 | return l_out, l_sum1 148 | -------------------------------------------------------------------------------- /utils/ffmpeg.py: -------------------------------------------------------------------------------- 1 | """ 2 | module containing functions to use ffprobe to parse video frame info 3 | """ 4 | from __future__ import print_function 5 | import subprocess 6 | import cStringIO 7 | 8 | 9 | class base_frame(object): 10 | """ 11 | Base Frame from FFProbe 12 | [FRAME] 13 | media_type=video 14 | stream_index=0 15 | key_frame=0 16 | pkt_pts=11745667 17 | pkt_pts_time=130.507411 18 | pkt_dts=11745667 19 | pkt_dts_time=130.507411 20 | best_effort_timestamp=11745667 21 | best_effort_timestamp_time=130.507411 22 | pkt_duration=3003 23 | pkt_duration_time=0.033367 24 | pkt_pos=86509020 25 | pkt_size=13294 26 | ... 27 | [/FRAME] 28 | """ 29 | def __init__(self, buf, parser): 30 | """ 31 | Constructs a base ffprobe frame 32 | :param buf: buffer containing frame info 33 | :param parser: ffprobe frame parser 34 | """ 35 | self.stream_index = parser.get_int(buf) 36 | self.key_frame = parser.get_int(buf) 37 | self.pkt_pts = parser.get_int(buf) 38 | self.pkt_pts_time = parser.get_float(buf) 39 | self.pkt_dts = parser.get_int(buf) 40 | self.pkt_dts_time = parser.get_float(buf) 41 | self.best_effort_timestamp = parser.get_int(buf) 42 | self.best_effort_timestamp_time = parser.get_float(buf) 43 | self.pkt_duration = parser.get_int(buf) 44 | self.pkt_duration_time = parser.get_float(buf) 45 | self.pkt_pos = parser.get_int(buf) 46 | self.pkt_size = parser.get_int(buf) 47 | 48 | 49 | class audio_frame(base_frame): 50 | """ 51 | Audio Frame from FFProbe 52 | [FRAME] 53 | ... 54 | sample_fmt=s16p 55 | nb_samples=1152 56 | channels=2 57 | channel_layout=stereo 58 | [/FRAME] 59 | """ 60 | def __init__(self, buf, parser): 61 | """ 62 | Constructs an Audio Frame from FFprobe 63 | :param buf: buffer containing ffprobe frame info 64 | :param parser: ffprobe frame parser 65 | """ 66 | super(audio_frame, self).__init__(buf, parser) 67 | self.media_type = 'audio' 68 | self.sample_fmt = parser.get_str(buf) 69 | self.nb_samples = parser.get_int(buf) 70 | self.channels = parser.get_int(buf) 71 | self.channel_layout = parser.get_str(buf) 72 | 73 | 74 | class video_frame(base_frame): 75 | """ 76 | Video Frame from FFProbe 77 | [FRAME] 78 | ... 
79 | width=720 80 | height=480 81 | pix_fmt=yuv420p 82 | sample_aspect_ratio=1:1 83 | pict_type=B 84 | coded_picture_number=3889 85 | display_picture_number=0 86 | interlaced_frame=0 87 | top_field_first=0 88 | repeat_pict=0 89 | [/FRAME] 90 | """ 91 | def __init__(self, buf, parser): 92 | """ 93 | Constructs a Video Frame from ffprobe 94 | :param buf: buffer containing ffprobe frame info 95 | :param parser: ffprobe frame parser 96 | """ 97 | super(video_frame, self).__init__(buf, parser) 98 | self.media_type = 'video' 99 | self.width = parser.get_int(buf) 100 | self.height = parser.get_int(buf) 101 | self.pix_fmt = parser.get_str(buf) 102 | self.sample_aspect_ratio = parser.get_str(buf) 103 | self.pict_type = parser.get_str(buf) 104 | self.coded_picture_number = parser.get_int(buf) 105 | self.display_picture_number = parser.get_int(buf) 106 | self.interlaced_frame = parser.get_int(buf) 107 | self.top_field_first = parser.get_int(buf) 108 | self.repeat_pict = parser.get_int(buf) 109 | 110 | 111 | class side_data(object): 112 | """ 113 | Side Data from FFProbe 114 | [SIDE_DATA] 115 | side_data_type=GOP timecode 116 | side_data_size=8 117 | timecode=00:00:00:00 118 | [/SIDE_DATA] 119 | """ 120 | def __init__(self, buf, parser): 121 | """ 122 | Constructs side data frame 123 | :param buf: buffer containing ffprobe frame info 124 | :param parser: ffprobe frame parser 125 | """ 126 | self.side_data_type = parser.get_str(buf) 127 | self.side_data_size = parser.get_int(buf) 128 | self.timecode = parser.get_str(buf) 129 | 130 | 131 | class ffprobe_frame_info_parser(object): 132 | """ 133 | ffprobe frame parser, reads ffprobe entries and extracts key, value pairs 134 | """ 135 | def get_str(self, buf, sep='='): 136 | _, value = buf.readline().split(sep) 137 | return value[:-1] 138 | 139 | def get_int(self, buf, sep='='): 140 | _, value = buf.readline().split(sep) 141 | value = value[:-1] 142 | if value == 'N/A': 143 | value = -1 144 | else: 145 | value = int(value) 146 | return value 147 | 148 | def get_float(self, buf, sep='='): 149 | _, value = buf.readline().split(sep) 150 | value = value[:-1] 151 | if value == 'N/A': 152 | value = float('nan') 153 | else: 154 | value = float(value) 155 | return value 156 | 157 | def get_entry(self, buf, sep='='): 158 | key, value = buf.readline().split(sep) 159 | value = value[:-1] 160 | return key, value 161 | 162 | 163 | def peek_line(buf): 164 | pos = buf.tell() 165 | line = buf.readline() 166 | buf.seek(pos) 167 | return line 168 | 169 | 170 | def ffprobe_video(filename): 171 | """ 172 | probes a video using ffprobe subprocess 173 | :param filename: video file to probe 174 | :return: list of audio, video frames 175 | """ 176 | command = ["ffprobe", "-show_frames", filename] 177 | p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 178 | out, err = p.communicate() 179 | video_frames = [] 180 | audio_frames = [] 181 | p = ffprobe_frame_info_parser() 182 | buf = cStringIO.StringIO(out) 183 | while True: 184 | line = buf.readline() 185 | if line == '': 186 | break 187 | else: 188 | info_type = line[:-1] 189 | if info_type == '[FRAME]': 190 | media_type = p.get_str(buf) 191 | if media_type == "video": 192 | frame = video_frame(buf, p) 193 | video_frames.append(frame) 194 | # check if [SIDE_DATA] exists 195 | line = peek_line(buf)[:-1] 196 | if line == '[SIDE_DATA]': 197 | _ = buf.readline() # read the header [SIDE_DATA] 198 | _ = side_data(buf, p) 199 | buf.readline() # read the end tag [/SIDE_DATA] 200 | else: 201 | frame = 
audio_frame(buf, p) 202 | audio_frames.append(frame) 203 | buf.readline() # read the end tag [/FRAME] 204 | return audio_frames, video_frames 205 | 206 | 207 | def main(): 208 | audio_frames, video_frames = ffprobe_video('s01.mpg') 209 | assert len(video_frames) == 3890 210 | 211 | 212 | if __name__ == '__main__': 213 | main() 214 | -------------------------------------------------------------------------------- /cuave/prepare_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../') 4 | import argparse 5 | import utils.ffmpeg 6 | from utils.preprocessing import * 7 | from utils.io import * 8 | from utils.plotting_utils import * 9 | 10 | 11 | def parse_htk_labels(filename): 12 | """ 13 | #Normal in 100ns 14 | 7800000 14480000 zero 15 | 17510000 22920000 one 16 | 26580000 32630000 two 17 | 36290000 40590000 three 18 | 46240000 49900000 four 19 | 55310000 59370000 five 20 | 63590000 69800000 six 21 | ... 22 | #Moving 23 | 24 | :param filename: path to an HTK-style CUAVE label file (format shown above) 25 | :return: list of (start, end, digit_word) string tuples 26 | """ 27 | labels = [] 28 | with open(filename, 'r') as f: 29 | line = f.readline()[:-1] 30 | if 'Normal' in line: 31 | while True: 32 | # iterate until #Moving 33 | line = f.readline() 34 | if '#Moving' in line: 35 | break 36 | else: 37 | start, end, number = line[:-2].split(' ') # remove \n\r 38 | labels.append((start, end, number)) 39 | return labels 40 | 41 | 42 | def to_100ns(time_in_sec): 43 | return int(time_in_sec * 10000000) 44 | 45 | 46 | def digit_to_int(digit): 47 | digit_map = {'zero': 0, 48 | 'one': 1, 49 | 'two': 2, 50 | 'three': 3, 51 | 'four': 4, 52 | 'five': 5, 53 | 'six': 6, 54 | 'seven': 7, 55 | 'eight': 8, 56 | 'nine': 9} 57 | return digit_map[digit] 58 | 59 | 60 | def segment_video(video_file, label_file): 61 | _, video_frames = utils.ffmpeg.ffprobe_video(video_file) 62 | htk_labels = parse_htk_labels(label_file) 63 | print('number of video frames: {}'.format(len(video_frames))) 64 | print('number of labels: {}'.format(len(htk_labels))) 65 | current_frame = 0 66 | idxes = [] 67 | seq_lens = [] 68 | labels = [] 69 | for start, end, label in htk_labels: 70 | start = int(start) 71 | end = int(end) 72 | number = digit_to_int(label) 73 | # print(start, end, number) 74 | seq_len = 0 75 | while True: 76 | f = video_frames[current_frame] 77 | pts_time = to_100ns(f.pkt_pts_time) 78 | # check if frame is within utterance window 79 | if pts_time > start and pts_time <= end: 80 | idxes.append(current_frame) 81 | labels.append(number) 82 | seq_len += 1 83 | current_frame += 1 84 | # TODO: extract/select mouth ROI of frame 85 | else: 86 | if pts_time > end: 87 | break 88 | current_frame += 1 # keep moving to the start of the next sequence 89 | seq_lens.append(seq_len) 90 | print(len(idxes)) 91 | print(len(labels)) 92 | print(seq_lens) 93 | 94 | 95 | def test_mergesamples(): 96 | s = np.array([[1],[2],[3],[4],[1],[2],[3],[4],[1],[2],[3],[4],[1],[2],[3],[4],[5]]) 97 | # s = np.array([1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,5]) 98 | l = [4,4,4,5] 99 | r = factorize(s, l, 3, 0) 100 | print(r) 101 | 102 | 103 | def test_embed_temporal_info(): 104 | s = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[1,1,1],[2,2,2],[3,3,3],[4,4,4],[1,1,1],[2,2,2],[3,3,3],[4,4,4], 105 | [1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]]) 106 | # s = np.array([1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,5]) 107 | l = np.array([4,4,4,5]) 108 | r, l = factorize(s, l, 3, 0) 109 | r, l = embed_temporal_info(r, l, 3, 3) 110 | print(r) 111 | 112 | 113 | def parse_options():
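"""Parse command-line options for CUAVE preprocessing into the dict consumed by main() below."""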
114 | options = dict() 115 | options['remove_mean'] = False 116 | options['diff_image'] = False 117 | options['samplewise_norm'] = False 118 | options['merge_samples'] = False 119 | options['output'] = None 120 | options['mergesize'] = 3 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument('--remove_mean', action='store_true', help='remove mean image') 123 | parser.add_argument('--diff_image', action='store_true', help='compute difference of image') 124 | parser.add_argument('--samplewise_norm', action='store_true', help='samplewise normalize') 125 | parser.add_argument('--reorder_data', help='reorder data from f to c convention. eg: 30,50') 126 | parser.add_argument('--concat_deltas', help='concat 1st and 2nd deltas, default delta window: 2') 127 | parser.add_argument('--embed_temporal_info', help='embed temporal info to features [window],[step]. ie: 3,1') 128 | parser.add_argument('--output', help='write output to .mat file') 129 | parser.add_argument('input', nargs='+', help='input cuave .mat file to preprocess') 130 | args = parser.parse_args() 131 | if args.remove_mean: 132 | options['remove_mean'] = args.remove_mean 133 | if args.diff_image: 134 | options['diff_image'] = args.diff_image 135 | if args.samplewise_norm: 136 | options['samplewise_norm'] = args.samplewise_norm 137 | if args.embed_temporal_info: 138 | options['embed_temporal_info'] = args.embed_temporal_info 139 | if args.reorder_data: 140 | options['reorder_data'] = args.reorder_data 141 | if args.output: 142 | options['output'] = args.output 143 | if args.input: 144 | options['input'] = args.input[0] 145 | if args.concat_deltas: 146 | options['concat_deltas'] = int(args.concat_deltas) 147 | return options 148 | 149 | 150 | def main(): 151 | options = parse_options() 152 | data = load_mat_file(options['input']) 153 | data_matrix = data['dataMatrix'].astype('float32') 154 | vid_len_vec = data['videoLengthVec'].astype('int').reshape((-1,)) 155 | targets_vec = data['targetsVec'].reshape((-1,)) 156 | 157 | if 'reorder_data' in options: 158 | imagesize = tuple([int(d) for d in options['reorder_data'].split(',')]) 159 | data_matrix = reorder_data(data_matrix, imagesize) 160 | if options['samplewise_norm']: 161 | data_matrix = normalize_input(data_matrix) 162 | if options['remove_mean']: 163 | data_matrix = sequencewise_mean_image_subtraction(data_matrix, vid_len_vec) 164 | if options['diff_image']: 165 | data_matrix = compute_diff_images(data_matrix, vid_len_vec) 166 | if 'embed_temporal_info' in options: 167 | window, step = tuple([int(d) for d in options['embed_temporal_info'].split(',')]) 168 | data_matrix, targets_vec, vid_len_vec = factorize(data_matrix, targets_vec, vid_len_vec, step, 0) 169 | data_matrix, targets_vec, vid_len_vec = embed_temporal_info(data_matrix, targets_vec, vid_len_vec, window, step) 170 | if 'concat_deltas' in options: 171 | data_matrix = concat_first_second_deltas(data_matrix, vid_len_vec, options['concat_deltas']) 172 | 173 | data['dataMatrix'] = data_matrix 174 | 175 | if 'embed_temporal_info' in options: 176 | data['videoLengthVec'] = vid_len_vec 177 | data['targetsVec'] = targets_vec 178 | 179 | if options['output']: 180 | save_mat(data, options['output']) 181 | # print(data.keys()) 182 | print('data prepared!') 183 | 184 | 185 | if __name__ == '__main__': 186 | main() --------------------------------------------------------------------------------
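For readers unfamiliar with the delta features referenced above: concat_first_second_deltas (imported from utils.preprocessing, whose source is outside this listing) appears to append first- and second-order temporal derivatives to each frame's feature vector, sequence by sequence. The snippet below is a minimal NumPy sketch of the standard regression-based delta computation; the function name, edge handling, and single-sequence scope are illustrative assumptions, not the repository's implementation.

import numpy as np

def delta_sketch(feats, window=2):
    # Regression-style delta (assumed formulation):
    #   d_t = sum_{k=1..window} k * (c_{t+k} - c_{t-k}) / (2 * sum_{k=1..window} k^2)
    # feats: (num_frames, num_dims); border frames are repeated for padding.
    n = len(feats)
    denom = 2.0 * sum(k ** 2 for k in range(1, window + 1))
    padded = np.pad(feats, ((window, window), (0, 0)), mode='edge')
    d = np.zeros(feats.shape)
    for k in range(1, window + 1):
        d += k * (padded[window + k:window + k + n] - padded[window - k:window - k + n])
    return d / denom

# Illustrative use for a single sequence, mirroring what concat_first_second_deltas
# likely produces: statics + deltas + delta-deltas, stacked column-wise.
seq = np.random.rand(75, 50).astype('float32')   # e.g. 75 frames, 50 bottleneck dims
d1 = delta_sketch(seq)
d2 = delta_sketch(d1)
augmented = np.hstack([seq, d1, d2])             # shape: (75, 150)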