├── .gitmodules ├── dicnn ├── visualize_approximate_dynamic_images.m ├── compute_approximate_dynamic_images.m ├── cnn_init_cafferef.m ├── cnn_video_rgb_get_batch.m ├── cnn_init_resnext.m ├── cnn_video_of_get_batch.m ├── cnn_single_of.m ├── cnn_single_rgb.m ├── cnn_dicnn_of.m ├── cnn_dicnn_rgb.m └── cnn_train_dicnn_dag.m ├── utils └── extract_frames.sh ├── Layers ├── L2Normalize.m ├── TemporalPooling.m ├── AppRankPooling.m ├── vl_nnpooltemporal.m ├── vl_nnarpooltemporal.m ├── vl_nnl2norm.m ├── ErrorMultiClass.m ├── LossNormalized.m └── BatchNormN.m ├── main_train.m ├── Datasets ├── cnn_hmdb51_setup_data.m ├── cnn_hmdb51_of_setup_data.m ├── cnn_ucf101_setup_data.m └── cnn_ucf101_of_setup_data.m └── README.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "matconvnet"] 2 | path = matconvnet 3 | url = https://github.com/vlfeat/matconvnet 4 | branch = master 5 | -------------------------------------------------------------------------------- /dicnn/visualize_approximate_dynamic_images.m: -------------------------------------------------------------------------------- 1 | function visualize_approximate_dynamic_images(images) 2 | % VISUALIZE_DYNAMIC_IMAGES 3 | 4 | di = compute_approximate_dynamic_images(images) ; 5 | 6 | di = di - min(di(:)) ; 7 | di = 255 * di ./ max(di(:)) ; 8 | image(uint8(di)) ; 9 | -------------------------------------------------------------------------------- /utils/extract_frames.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | 3 | # This script converts videos into frames 4 | # for different fps change (-r 1) 5 | 6 | for f in *.avi 7 | do g=`echo $f | sed 's/\.avi//'`; 8 | echo Processing $f; 9 | mkdir -p frames/$g/ ; 10 | ffmpeg -i $f frames/$g/image-%04d.jpeg ; 11 | done 12 | -------------------------------------------------------------------------------- /Layers/L2Normalize.m: -------------------------------------------------------------------------------- 1 | classdef L2Normalize < dagnn.ElementWise 2 | % author: Hakan Bilen 3 | % dagnn wrapper for l2 normalization 4 | 5 | properties 6 | scale = 1; 7 | clip = [-inf inf]; 8 | offset = 0; 9 | end 10 | 11 | methods 12 | function outputs = forward(obj, inputs, params) 13 | outputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset]); 14 | end 15 | 16 | function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) 17 | derInputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset],derOutputs{1}); 18 | derParams = {} ; 19 | end 20 | 21 | function obj = L2Normalize(varargin) 22 | obj.load(varargin) ; 23 | end 24 | 25 | end 26 | end 27 | 28 | -------------------------------------------------------------------------------- /Layers/TemporalPooling.m: -------------------------------------------------------------------------------- 1 | classdef TemporalPooling < dagnn.ElementWise 2 | % author: Hakan Bilen 3 | % dagnn wrapper for approximate rank pooling 4 | 5 | properties 6 | method = 'max'; 7 | end 8 | 9 | methods 10 | function outputs = forward(obj, inputs, params) 11 | outputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method); 12 | end 13 | 14 | function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) 15 | derInputs = cell(1,2); 16 | derInputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method,derOutputs{1}); 17 | derParams = {} ; 18 | end 19 | 20 | function obj = TemporalPooling(varargin) 21 | obj.load(varargin) ; 22 | end 23 | 24 | end 25 | end 26 | 27 | -------------------------------------------------------------------------------- /dicnn/compute_approximate_dynamic_images.m: -------------------------------------------------------------------------------- 1 | function di = compute_approximate_dynamic_images(images) 2 | % Computes approximate dynamic images for a given array of images 3 | % IMAGES must be a tensor of H x W x D x N dimensionality or 4 | % cell of image names 5 | 6 | % For the exact dynamic images, use the code 7 | % http://users.cecs.anu.edu.au/~basura/dynamic_images/code.zip 8 | % Explained here http://arxiv.org/abs/1512.01848 9 | 10 | if isempty(images) 11 | di = [] ; 12 | return ; 13 | end 14 | 15 | 16 | if iscell(images) 17 | imagesA = cell(1,numel(images)) ; 18 | for i=1:numel(images) 19 | if ~ischar(images{i}) 20 | error('images must be an array of images or cell of image names') ; 21 | end 22 | imagesA{i} = imread(images{i}) ; 23 | end 24 | images = cat(4,imagesA{:}) ; 25 | end 26 | 27 | N = size(images,4) ; 28 | di = vl_nnarpooltemporal(single(images),ones(1,N)) ; 29 | 30 | 31 | -------------------------------------------------------------------------------- /Layers/AppRankPooling.m: -------------------------------------------------------------------------------- 1 | classdef AppRankPooling < dagnn.ElementWise 2 | % author: Hakan Bilen 3 | % dagnn wrapper for approximate rank pooling 4 | 5 | properties 6 | scale = 1 7 | end 8 | 9 | methods 10 | function outputs = forward(obj, inputs, params) 11 | outputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2}) * obj.scale ; 12 | end 13 | 14 | function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) 15 | derInputs = cell(1,2); 16 | derInputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2},derOutputs{1}) * obj.scale; 17 | derParams = {} ; 18 | end 19 | 20 | function outputSizes = getOutputSizes(obj, inputSizes) 21 | % This is not correct, dim(4) depends on inputs{2} 22 | outputSizes{1} = inputSizes{1} ; 23 | end 24 | 25 | function obj = AppRankPooling(varargin) 26 | obj.load(varargin) ; 27 | end 28 | 29 | end 30 | end 31 | 32 | -------------------------------------------------------------------------------- /Layers/vl_nnpooltemporal.m: -------------------------------------------------------------------------------- 1 | function Y = vl_nnpooltemporal(X,ids,method,dzdy) 2 | % author: Hakan Bilen 3 | % temporal pooling along frames 4 | % ids indicates frame-video association 5 | % method 'max' or 'avg' 6 | 7 | sz = size(X); 8 | forward = logical(nargin<4); 9 | Xp = permute(X,[4,1,2,3]); 10 | 11 | if numel(ids)~=size(X,4) 12 | error('Error: ids dimension does not match with X!'); 13 | end 14 | 15 | nVideos = max(ids); 16 | 17 | if forward 18 | Yp = zeros([nVideos,sz(1:3)],'like',X); 19 | for v=1:nVideos 20 | % pool among frames 21 | indv = find(ids==v); 22 | Yp(v,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], ... 23 | 'pad', 0, 'stride', [numel(indv),1], 'method', method) ; 24 | end 25 | else 26 | dzdyp = permute(dzdy,[4,1,2,3]); 27 | Yp = zeros(size(Xp),'like',Xp); 28 | for v=1:nVideos 29 | % pool among frames 30 | indv = find(ids==v); 31 | Yp(indv,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], dzdyp(v,:,:,:), ... 32 | 'pad', 0, 'stride', [numel(indv),1], 'method', method) ; 33 | end 34 | 35 | end 36 | % permute back 37 | Y = permute(Yp,[2,3,4,1]); 38 | 39 | % if forward 40 | % fprintf(' fwd-ptemp %.2f ',sqrt(sum(Y(:).^2))); 41 | % else 42 | % fprintf(' back-ptemp %.2f ',sqrt(sum(Y(:).^2))); 43 | % end 44 | -------------------------------------------------------------------------------- /Layers/vl_nnarpooltemporal.m: -------------------------------------------------------------------------------- 1 | function Y = vl_nnarpooltemporal(X,ids,dzdy) 2 | % author: Hakan Bilen 3 | % approximate rank pooling 4 | % ids indicates frame-video association (must be in range [1-N]) 5 | 6 | sz = size(X); 7 | forward = logical(nargin<3); 8 | 9 | if numel(ids)~=size(X,4) 10 | error('Error: ids dimension does not match with X!'); 11 | end 12 | 13 | nVideos = max(ids); 14 | 15 | if forward 16 | Y = zeros([sz(1:3),nVideos],'like',X); 17 | else 18 | Y = zeros(size(X),'like',X); 19 | end 20 | 21 | for v=1:nVideos 22 | % pool among frames 23 | indv = find(ids==v); 24 | if isempty(indv) 25 | error('Error: No frames in video %d',v); 26 | end 27 | N = numel(indv); 28 | % magic numbers 29 | fw = zeros(1,N); 30 | if N==1 31 | fw = 1; 32 | else 33 | for i=1:N 34 | fw(i) = sum((2*(i:N)-N-1) ./ (i:N)); 35 | end 36 | end 37 | 38 | if forward 39 | Y(:,:,:,v) = sum(bsxfun(@times,X(:,:,:,indv),... 40 | reshape(single(fw),[1 1 1 numel(indv)])),4); 41 | else 42 | Y(:,:,:,indv) = (bsxfun(@times,repmat(dzdy(:,:,:,v),[1,1,1,numel(indv)]),... 43 | reshape(fw,[1 1 1 numel(indv)]))) ; 44 | end 45 | end 46 | % 47 | % if forward 48 | % fprintf(' fwd-arpool %.2f ',sqrt(sum(Y(:).^2))); 49 | % else 50 | % fprintf(' back-arpool %f ',sqrt(sum(Y(:).^2))); 51 | % end 52 | 53 | -------------------------------------------------------------------------------- /Layers/vl_nnl2norm.m: -------------------------------------------------------------------------------- 1 | function y = vl_nnl2norm(x,param,dzdy) 2 | % author: Hakan Bilen 3 | % l2 normalize whole feature map 4 | 5 | sc = param(1); 6 | clip = param(2:3); 7 | offset = param(4); 8 | 9 | if nargin == 3 10 | assert(all(size(x) == size(dzdy))); 11 | else 12 | dzdy = []; 13 | end 14 | 15 | x_sz = size(x); 16 | if ~all(x_sz([1 2]) == 1) 17 | % Create an array of size #channels x #samples 18 | x = reshape(x, prod(x_sz(1:3)), []); 19 | end 20 | 21 | 22 | x = x + offset; 23 | 24 | if isempty(dzdy) 25 | 26 | y = (bsxfun(@times, x, sc./(sqrt(sum(x .* x)) + single(1e-12)))); 27 | % clip max values 28 | if all(y(:)clip(2)) 29 | warning('Too small clipping interval'); 30 | fprintf('min %f max %f\n',min(y(:)),max(y(:))); 31 | end 32 | 33 | y(y(:)clip(2)) = clip(2); 35 | 36 | 37 | else 38 | if ~all(x_sz([1 2]) == 1) 39 | dzdy = reshape(dzdy, prod(x_sz(1:3)), []); 40 | end 41 | 42 | len_ = 1./sqrt(sum(x.*x)+single(1e-12)); 43 | dzdy_ = bsxfun(@times,dzdy,len_.^3); 44 | y = sc * (bsxfun(@times,dzdy,len_)-bsxfun(@times,x,sum(x.*dzdy_))); 45 | end 46 | 47 | if ~all(x_sz([1 2]) == 1) 48 | y = reshape(y, x_sz); 49 | end 50 | % 51 | % if isempty(dzdy) 52 | % fprintf(' fwd-l2 %.2f ',sqrt(sum(y(:).^2))); 53 | % else 54 | % fprintf(' back-l2 %f dzdy %f ',sqrt(sum(y(:).^2)),sqrt(sum(dzdy(:).^2))); 55 | % end 56 | -------------------------------------------------------------------------------- /Layers/ErrorMultiClass.m: -------------------------------------------------------------------------------- 1 | classdef ErrorMultiClass < dagnn.Loss 2 | % author: Hakan Bilen 3 | % computes multi-class accuracy 4 | % inputs{1}->scores 5 | % inputs{2}->gt labels 6 | properties 7 | nImgPerClass = [] 8 | nCorPred = [] 9 | accuracy = [] 10 | resetLayer = false 11 | end 12 | 13 | methods 14 | function outputs = forward(obj, inputs, params) 15 | 16 | if numel(inputs)~=2 17 | error('wrong number of inputs'); 18 | end 19 | 20 | nCls = size(inputs{1},3); 21 | 22 | if obj.resetLayer || isempty(obj.nImgPerClass) 23 | obj.nImgPerClass = zeros(1,size(inputs{1},3)); 24 | obj.nCorPred = zeros(1,size(inputs{1},3)); 25 | obj.accuracy = zeros(1,size(inputs{1},3)); 26 | 27 | if obj.resetLayer 28 | obj.resetLayer = false ; 29 | obj.average = 0 ; 30 | end 31 | end 32 | 33 | 34 | [~,predictions] = max(gather(squeeze(inputs{1})),[],1); 35 | 36 | for c=1:nCls 37 | obj.nImgPerClass(c) = obj.nImgPerClass(c) + sum(inputs{2}==c); 38 | obj.nCorPred(c) = obj.nCorPred(c) + sum(predictions==c & inputs{2}==c); 39 | end 40 | 41 | ni = obj.nImgPerClass; 42 | ni(ni==0) = 1; 43 | 44 | obj.accuracy = obj.nCorPred ./ ni; 45 | obj.average = (1-mean(obj.accuracy)); 46 | outputs{1} = obj.average; 47 | end 48 | 49 | function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) 50 | derInputs = cell(1,2); 51 | derParams = {} ; 52 | end 53 | 54 | function reset(obj) 55 | obj.resetLayer = true ; 56 | % obj.nImgPerClass = []; 57 | % obj.nCorPred = []; 58 | % obj.accuracy = []; 59 | % obj.average = 0; 60 | end 61 | 62 | 63 | function obj = ErrorMultiClass(varargin) 64 | obj.load(varargin) ; 65 | obj.loss = 'error_multi_class' ; 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /Layers/LossNormalized.m: -------------------------------------------------------------------------------- 1 | classdef LossNormalized < dagnn.Loss 2 | % properties 3 | % loss = 'softmaxlog' 4 | % ignoreAverage = false 5 | % opts = {} 6 | % end 7 | % properties (Transient) 8 | % average = 0 9 | % numAveraged = 0 10 | % end 11 | 12 | methods 13 | function outputs = forward(obj, inputs, params) 14 | outputs{1} = vl_nnloss(inputs{1}, inputs{2}, [], 'loss', obj.loss, obj.opts{:}) ; 15 | obj.accumulateAverage(inputs, outputs); 16 | if numel(size(inputs{1}))>3 17 | bs = size(inputs{1},4) ; 18 | else 19 | bs = 1 ; 20 | end 21 | outputs{1} = outputs{1} / bs ; 22 | end 23 | 24 | function accumulateAverage(obj, inputs, outputs) 25 | if obj.ignoreAverage, return; end; 26 | n = obj.numAveraged ; 27 | m = n + size(inputs{1}, 1) * size(inputs{1}, 2) * size(inputs{1}, 4); 28 | obj.average = bsxfun(@plus, n * obj.average, gather(outputs{1})) / m ; 29 | obj.numAveraged = m ; 30 | end 31 | 32 | function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) 33 | if numel(size(inputs{1}))>3 34 | bs = size(inputs{1},4) ; 35 | else 36 | bs = 1 ; 37 | end 38 | 39 | derInputs{1} = vl_nnloss(inputs{1}, inputs{2}, derOutputs{1}, 'loss', obj.loss, obj.opts{:}) / bs; 40 | derInputs{2} = [] ; 41 | derParams = {} ; 42 | end 43 | 44 | function reset(obj) 45 | obj.average = 0 ; 46 | obj.numAveraged = 0 ; 47 | end 48 | 49 | function outputSizes = getOutputSizes(obj, inputSizes, paramSizes) 50 | outputSizes{1} = [1 1 1 inputSizes{1}(4)] ; 51 | end 52 | 53 | function rfs = getReceptiveFields(obj) 54 | % the receptive field depends on the dimension of the variables 55 | % which is not known until the network is run 56 | rfs(1,1).size = [NaN NaN] ; 57 | rfs(1,1).stride = [NaN NaN] ; 58 | rfs(1,1).offset = [NaN NaN] ; 59 | rfs(2,1) = rfs(1,1) ; 60 | end 61 | 62 | function obj = LossNormalized(varargin) 63 | obj.load(varargin) ; 64 | end 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /Layers/BatchNormN.m: -------------------------------------------------------------------------------- 1 | classdef BatchNormN < dagnn.ElementWise 2 | properties 3 | numChannels 4 | epsilon = 1e-5 5 | opts = {'NoCuDNN'} % ours seems slightly faster 6 | end 7 | 8 | properties (Transient) 9 | moments 10 | end 11 | 12 | methods 13 | function outputs = forward(obj, inputs, params) 14 | if strcmp(obj.net.mode, 'test') 15 | outputs{1} = vl_nnbnorm(inputs{1}, params{1}, params{2}, ... 16 | 'moments', params{3}, ... 17 | 'epsilon', obj.epsilon, ... 18 | obj.opts{:}) ; 19 | else 20 | [outputs{1},obj.moments] = ... 21 | vl_nnbnorm(inputs{1}, params{1}, params{2}, ... 22 | 'epsilon', obj.epsilon, ... 23 | obj.opts{:}) ; 24 | end 25 | end 26 | 27 | function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) 28 | [derInputs{1}, derParams{1}, derParams{2}, derParams{3}] = ... 29 | vl_nnbnorm(inputs{1}, params{1}, params{2}, derOutputs{1}, ... 30 | 'epsilon', obj.epsilon, ... 31 | 'moments', obj.moments, ... 32 | obj.opts{:}) ; 33 | obj.moments = [] ; 34 | % multiply the moments update by the number of images in the batch 35 | % this is required to make the update additive for subbatches 36 | % and will eventually be normalized away 37 | % derParams{3} = derParams{3} * size(inputs{1},4) ; 38 | end 39 | 40 | % --------------------------------------------------------------------- 41 | function obj = BatchNormN(varargin) 42 | obj.load(varargin{:}) ; 43 | end 44 | 45 | function params = initParams(obj) 46 | params{1} = ones(obj.numChannels,1,'single') ; 47 | params{2} = zeros(obj.numChannels,1,'single') ; 48 | params{3} = zeros(obj.numChannels,2,'single') ; 49 | end 50 | 51 | function attach(obj, net, index) 52 | attach@dagnn.ElementWise(obj, net, index) ; 53 | p = net.getParamIndex(net.layers(index).params{3}) ; 54 | net.params(p).trainMethod = 'average' ; 55 | net.params(p).learningRate = 0.1 ; 56 | end 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /main_train.m: -------------------------------------------------------------------------------- 1 | model = 'resnext50' ; % {'cafferef','resnext50','resnext101'} 2 | input = 'rgb' ; % {'rgb','of'} 3 | dataset = 'ucf101' ; % {'ucf101','hmdb51'} hmdb51 requires more iterations to train (add more epochs to learning rate) 4 | opts.train.batchSize = 128 ; 5 | opts.train.numSubBatches = 32 ; % increase the number (16,32) if it does not fit into gpu mem 6 | opts.epochFactor = 5 ; 7 | opts.split = 1 ; 8 | 9 | opts.train.gpus = 1 ; 10 | 11 | run matconvnet/matlab/vl_setupnn.m ; 12 | vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ; 13 | vl_contrib install autonn ; vl_contrib setup autonn ; 14 | 15 | % addpath(fullfile('matconvnet','contrib','mcnExtraLayers','matlab')) ; 16 | 17 | opts.expDir = ['exp/' model 'rgb-arpool-split' num2str(opts.split)] ; 18 | if strcmp(input,'rgb') 19 | opts.DropOutRate = 0.5 ; 20 | trainfn = @cnn_dicnn_rgb ; 21 | elseif strcmp(input,'of') 22 | opts.DropOutRate = 0.8 ; 23 | trainfn = @cnn_dicnn_of ; 24 | end 25 | 26 | if strcmp(model,'cafferef') 27 | 28 | opts.pool1Layer = 'conv1' ; 29 | % download from http://www.vlfeat.org/matconvnet/models/imagenet-caffe-ref.mat 30 | opts.modelPath = fullfile('models','imagenet-caffe-ref.mat') ; 31 | opts.networkFn = @cnn_init_cafferef ; 32 | 33 | if strcmp(input,'rgb') 34 | opts.train.learningRate = 1e-3 * [ones(1,2) 0.1*ones(1,2)] ; 35 | else 36 | opts.train.learningRate = 3e-3 * [ones(1,10) 0.1*ones(1,2)] ; 37 | end 38 | 39 | opts.train.numEpochs = numel(opts.train.learningRate) ; 40 | elseif strcmp(model,'resnext50') || strcmp(model,'resnext101') 41 | % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_50_32x4d-pt-mcn.mat 42 | % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_101_32x4d-pt-mcn.mat 43 | if strcmp(model,'resnext50') 44 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ; 45 | else 46 | opts.modelPath = fullfile('models','resnext_101_32x4d-pt-mcn.mat') ; 47 | end 48 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ; 49 | opts.networkFn = @cnn_init_resnext ; 50 | if strcmp(input,'rgb') 51 | opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,8) ] ; 52 | else 53 | opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,2) ] ; 54 | end 55 | end 56 | 57 | addpath dicnn ; 58 | 59 | [net, info] = trainfn(opts) 60 | -------------------------------------------------------------------------------- /Datasets/cnn_hmdb51_setup_data.m: -------------------------------------------------------------------------------- 1 | function imdb = cnn_hmdb51_setup_data(varargin) 2 | % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set 3 | % http://crcv.ucf.edu/data/UCF101.php 4 | % this script requires UCF101 downloaded and frames extracted in frames 5 | % folder 6 | 7 | opts.dataDir = fullfile('data','HMDB51') ; 8 | opts.lite = false ; 9 | % opts = vl_argparse(opts, varargin) ; 10 | 11 | %% ------------------------------------------------------------------------ 12 | % Load categories metadata 13 | % ------------------------------------------------------------------------- 14 | % find images 15 | imagePath = fullfile(opts.dataDir, 'frames', '*') ; 16 | images = dir(imagePath) ; 17 | 18 | videoNames = cell(1,numel(images)) ; 19 | frameNames = cell(1,numel(images)) ; 20 | nrFrames = zeros(1,numel(images)) ; 21 | for i=1:numel(images) 22 | 23 | frames = dir(fullfile(opts.dataDir,'frames',images(i).name,'frame*.jpg')) ; 24 | framesc = cell(1,numel(frames)) ; 25 | if ~isempty(numel(frames)) 26 | for j=1:numel(frames) 27 | framesc{j} = frames(j).name ; 28 | end 29 | frameNames{i} = strcat(images(i).name,'/',framesc) ; 30 | nrFrames(i) = numel(framesc) ; 31 | videoNames{i} = images(i).name ; 32 | end 33 | end 34 | 35 | videoNames(nrFrames==0) = [] ; 36 | frameNames(nrFrames==0) = [] ; 37 | % nrFrames(nrFrames==0) = [] ; 38 | 39 | 40 | % find metadata 41 | % ncls = 51 ; 42 | 43 | 44 | metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ; 45 | 46 | splits = dir(metaPath) ; 47 | 48 | % splitFiles = cell(1,3*ncls) ; 49 | cats = cell(1,numel(videoNames)) ; 50 | sets = zeros(3,numel(videoNames)) ; 51 | catNames = cell(1,numel(splits)) ; 52 | 53 | for i=1:numel(splits) 54 | j = strfind(splits(i).name,'_test_') ; 55 | splitno = str2double(splits(i).name(j+11)) ; 56 | catNames{i} = splits(i).name(1:j-1) ; 57 | t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ; 58 | 59 | vids = cell(1,numel(t.textdata)) ; 60 | for k=1:numel(t.textdata) 61 | vids{k} = t.textdata{k}(1:end-4) ; 62 | end 63 | 64 | [ia,ib] = ismember(vids,videoNames) ; 65 | assert(all(ia)) ; 66 | sets(splitno,ib) = t.data' ; 67 | cats(ib) = repmat(catNames(i),numel(ia),1) ; 68 | end 69 | 70 | [cu,~,labels] = unique(cats) ; 71 | sets(sets(:)==2) = 3 ; 72 | 73 | imdb.classes.name = cu ; 74 | imdb.images.name = videoNames ; 75 | imdb.images.names = frameNames ; 76 | imdb.images.label = labels' ; 77 | imdb.images.sets = sets ; 78 | imdb.imageDir = fullfile(opts.dataDir, 'frames') ; 79 | 80 | 81 | -------------------------------------------------------------------------------- /Datasets/cnn_hmdb51_of_setup_data.m: -------------------------------------------------------------------------------- 1 | function imdb = cnn_hmdb51_of_setup_data(varargin) 2 | % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set 3 | % http://crcv.ucf.edu/data/UCF101.php 4 | % this script requires UCF101 downloaded and frames extracted in frames 5 | % folder 6 | 7 | 8 | opts.dataDir = fullfile('data','HMDB51') ; 9 | opts.lite = false ; 10 | % opts = vl_argparse(opts, varargin) ; 11 | 12 | %% ------------------------------------------------------------------------ 13 | % Load categories metadata 14 | % ------------------------------------------------------------------------- 15 | % find images 16 | imagePath = fullfile(opts.dataDir, 'tvl1_flow', 'u', '*') ; 17 | images = dir(imagePath) ; 18 | 19 | videoNames = cell(1,numel(images)) ; 20 | frameNames = cell(1,numel(images)) ; 21 | nrFrames = zeros(1,numel(images)) ; 22 | for i=1:numel(images) 23 | 24 | frames = dir(fullfile(opts.dataDir,'tvl1_flow','u',images(i).name,'frame*.jpg')) ; 25 | framesc = cell(1,numel(frames)) ; 26 | if ~isempty(numel(frames)) 27 | for j=1:numel(frames) 28 | framesc{j} = frames(j).name ; 29 | end 30 | frameNames{i} = framesc ; 31 | frameNames{i} = strcat(images(i).name,'/',framesc) ; 32 | nrFrames(i) = numel(framesc) ; 33 | videoNames{i} = images(i).name ; 34 | end 35 | end 36 | 37 | videoNames(nrFrames==0) = [] ; 38 | frameNames(nrFrames==0) = [] ; 39 | % nrFrames(nrFrames==0) = [] ; 40 | 41 | 42 | frameNamesuv = cell(1,numel(frameNames)) ; 43 | for i=1:numel(frameNames) 44 | nn = frameNames{i} ; 45 | nn1 = strcat('u/',nn) ; 46 | nn2 = strcat('v/',nn) ; 47 | 48 | frameNamesuv{i} = cell(1,2*numel(nn1)) ; 49 | frameNamesuv{i}(1:2:end) = nn1 ; 50 | frameNamesuv{i}(2:2:end) = nn2 ; 51 | end 52 | 53 | % find metadata 54 | % ncls = 51 ; 55 | 56 | metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ; 57 | 58 | splits = dir(metaPath) ; 59 | 60 | cats = cell(1,numel(videoNames)) ; 61 | sets = zeros(3,numel(videoNames)) ; 62 | catNames = cell(1,numel(splits)) ; 63 | 64 | for i=1:numel(splits) 65 | j = strfind(splits(i).name,'_test_') ; 66 | splitno = str2double(splits(i).name(j+11)) ; 67 | catNames{i} = splits(i).name(1:j-1) ; 68 | t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ; 69 | 70 | vids = cell(1,numel(t.textdata)) ; 71 | for k=1:numel(t.textdata) 72 | vids{k} = t.textdata{k}(1:end-4) ; 73 | end 74 | 75 | [ia,ib] = ismember(vids,videoNames) ; 76 | assert(all(ia)) ; 77 | sets(splitno,ib) = t.data' ; 78 | cats(ib) = repmat(catNames(i),numel(ia),1) ; 79 | end 80 | 81 | [cu,~,labels] = unique(cats) ; 82 | sets(sets(:)==2) = 3 ; 83 | 84 | imdb.classes.name = cu ; 85 | imdb.images.name = videoNames ; 86 | imdb.images.names = frameNamesuv ; 87 | imdb.images.label = labels' ; 88 | imdb.images.sets = sets ; 89 | imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ; 90 | -------------------------------------------------------------------------------- /Datasets/cnn_ucf101_setup_data.m: -------------------------------------------------------------------------------- 1 | function imdb = cnn_ucf101_setup_data(varargin) 2 | % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set 3 | % http://crcv.ucf.edu/data/UCF101.php 4 | % this script requires UCF101 downloaded and frames extracted in frames 5 | % folder 6 | 7 | opts.dataDir = fullfile('data','UCF101') ; 8 | opts.lite = false ; 9 | opts = vl_argparse(opts, varargin) ; 10 | 11 | %% ------------------------------------------------------------------------ 12 | % Load categories metadata 13 | % ------------------------------------------------------------------------- 14 | 15 | % find metadata 16 | metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ; 17 | 18 | fprintf('using metadata %s\n', metaPath) ; 19 | tmp = importdata(metaPath); 20 | nCls = numel(tmp); 21 | 22 | if nCls ~= 101 23 | error('Wrong meta file %s',metaPath); 24 | end 25 | 26 | cats = cell(1,nCls); 27 | for i=1:numel(tmp) 28 | t = strsplit(tmp{i}); 29 | cats{i} = t{2}; 30 | end 31 | 32 | imdb.classes.name = cats ; 33 | imdb.imageDir = fullfile(opts.dataDir, 'frames') ; 34 | 35 | %% ------------------------------------------------------------------------ 36 | % load image names and labels 37 | % ------------------------------------------------------------------------- 38 | 39 | fprintf('searching training images ...\n') ; 40 | names = {} ; 41 | name = {}; 42 | labels = {} ; 43 | for d = dir(fullfile(imdb.imageDir, 'v_*'))' 44 | [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ; 45 | if lab==0 46 | error('no class label found for %s',d.name); 47 | end 48 | ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ; 49 | name{end+1} = d.name; 50 | names{end+1} = strcat([d.name, filesep], {ims.name}) ; 51 | labels{end+1} = lab ; 52 | if mod(numel(names), 10) == 0, fprintf('.') ; end 53 | if mod(numel(names), 500) == 0, fprintf('\n') ; end 54 | %fprintf('found %s with %d images\n', d.name, numel(ims)) ; 55 | end 56 | % names = horzcat(names{:}) ; 57 | labels = horzcat(labels{:}) ; 58 | 59 | imdb.images.id = 1:numel(names) ; 60 | imdb.images.name = name ; 61 | imdb.images.names = names ; 62 | imdb.images.label = labels ; 63 | 64 | 65 | %% ------------------------------------------------------------------------ 66 | % load train / test splits 67 | % ------------------------------------------------------------------------- 68 | 69 | fprintf('labeling data...(this may take couple of minutes)') ; 70 | imdb.images.sets = zeros(3, numel(names)) ; 71 | setNames = {'train','test'}; 72 | setVal = [1,3]; 73 | 74 | for s=1:numel(setNames) 75 | for i=1:3 76 | trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',... 77 | setNames{s},i)) ; 78 | trainList = importdata(trainFl); 79 | if isfield(trainList,'textdata') 80 | trainList = trainList.textdata; 81 | end 82 | for j=1:numel(trainList) 83 | tmp = strsplit(trainList{j},'/'); 84 | [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ; 85 | if lab==0 86 | error('cannot find the video %s',tmp{2}); 87 | end 88 | % if trainList.data(j) ~= labels(lab) 89 | % error('Labels do not match for %s',tmp{2}); 90 | % end 91 | imdb.images.sets(i,lab) = setVal(s); 92 | end 93 | end 94 | end 95 | fprintf('\n') ; 96 | %% ------------------------------------------------------------------------ 97 | % Postprocessing 98 | % ------------------------------------------------------------------------- 99 | 100 | % sort categories by WNID (to be compatible with other implementations) 101 | [imdb.classes.name,perm] = sort(imdb.classes.name) ; 102 | relabel(perm) = 1:numel(imdb.classes.name) ; 103 | ok = imdb.images.label > 0 ; 104 | imdb.images.label(ok) = relabel(imdb.images.label(ok)) ; 105 | 106 | if opts.lite 107 | % pick a small number of images for the first 10 classes 108 | % this cannot be done for test as we do not have test labels 109 | clear keep ; 110 | for i=1:10 111 | sel = find(imdb.images.label == i) ; 112 | train = sel(imdb.images.sets(1,sel) == 1) ; 113 | test = sel(imdb.images.sets(1,sel) == 3) ; 114 | keep{i} = [train test] ; 115 | end 116 | keep = keep{:}; 117 | imdb.images.id = imdb.images.id(keep) ; 118 | imdb.images.name = imdb.images.name(keep) ; 119 | imdb.images.names = imdb.images.names(keep) ; 120 | imdb.images.sets = imdb.images.sets(1,keep) ; 121 | imdb.images.label = imdb.images.label(keep) ; 122 | end 123 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dynamic Image Networks for Action Recognition 2 | ## Improved Results (see the extended version of CVPR paper) 3 | 4 | 5 | ResNeXt-50 | HMDB51 (%) | UCF101 (%) | 6 | ------------------|--------|--------| 7 | SI | 53.5 | 87.6 | 8 | DI | 57.3 | 86.6 | 9 | OF | 55.8 | 84.9 | 10 | DOF | 58.9 | 86.6 | 11 | SI+OF | 67.5 | 93.9 | 12 | SI+DI | 61.3 | 90.6 | 13 | OF+DOF | 62.6 | 89.1 | 14 | SI+DI+OF+DOF | 71.5 | 95.0 | 15 | SI+DI+OF+DOF+iDT | 74.2 | 95.4 | 16 | 17 | * Results are in the standard average multi-class accuracy (%) 18 | * SI: RGB image 19 | * DI: dynamic RBG image 20 | * OF: optical flow 21 | * DOF: dynamic optical flow 22 | * iDT: improved trajectory features 23 | 24 | 25 | ## Installation 26 | 1. Clone the Dynamic Image Net repository: 27 | 28 | ```Shell 29 | git clone --recursive https://github.com/hbilen/dynamic-image-nets 30 | ``` 31 | 32 | 2. Compile matconvnet toolbox: (see [http://www.vlfeat.org/matconvnet/install/](http://www.vlfeat.org/matconvnet/install/)) 33 | 34 | 3. Install additional matconvnet packages 35 | 36 | ```Shell 37 | run matconvnet/matlab/vl_setupnn.m ; 38 | vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ; 39 | vl_contrib install autonn ; vl_contrib setup autonn ; 40 | ``` 41 | 42 | 4. Download your dataset : (e.g. UCF101 from [http://crcv.ucf.edu/data/UCF101.php](http://crcv.ucf.edu/data/UCF101.php)) 43 | 44 | 5. Convert videos to frames, resize them to 256x256 and store them in such a directory structure: 45 | Alternatively, you can download RGB and precomputed optical flow frames from [Christoph Feichtenhofer](http://ftp.tugraz.at/pub/feichtenhofer/tsfusion/data/) and copy RGB frames under "UCF101/frames" and optical flow frames under "UCF101/tvl1_flow". 46 | 47 | ```Shell 48 | data/UCF101/ucfTrainTestlist/ 49 | ├── classInd.txt 50 | ├── testlist01.txt 51 | ├── testlist02.txt 52 | ├── testlist03.txt 53 | ├── trainlist01.txt 54 | ├── trainlist02.txt 55 | └── trainlist03.txt 56 | data/UCF101/frames/ 57 | ├── v_ApplyEyeMakeup_g01_c01 58 | │ ├── 00001.jpg 59 | │ ├── 00002.jpg 60 | │ ├── 00003.jpg 61 | │ ├── 00004.jpg 62 | │ ├── 00005.jpg 63 | ``` 64 | 65 | ## Compute and Visualise Approximate Dynamic Images 66 | 1. If you want to compute approximate dynamic images, get a list of ordered frames from a video and try 67 | ```matlab 68 | di = compute_approximate_dynamic_images(images) ; 69 | ``` 70 | 71 | 2. If you want to visualise approximate dynamic images, get a list of ordered frames from a video and try 72 | ```matlab 73 | visualize_approximate_dynamic_images(images) 74 | ``` 75 | 76 | ## Train a Dynamic Image Net 77 | You can modify the options in `main_train.m` and train your model by running 78 | ```matlab 79 | main_train 80 | ``` 81 | 82 | Note: If you want to train a model on a different dataset than UCF101 or HMDB51, you need to write a custom script `cnn_dataset_setup_data` to build your database (imdb). 83 | 84 | ## Evaluation 85 | 1. Download the CNN Models for the UCF101 dataset, that are used in the journal, from [here](http://groups.inf.ed.ac.uk/hbilen-data/data/resnext50_dicnn.tar). 86 | 2. Choose the right model, split and input type (e.g.) 87 | ```matlab 88 | net = load('resnext50-rgb-arpool-split1.mat') ; 89 | net = dagnn.DagNN.loadobj(net) ; 90 | net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr') ; 91 | opts.network = net ; 92 | opts.split = 1 ; 93 | opts.train.gpus = 1 ; 94 | opts.epochFactor = 0 ; 95 | [net, info] = cnn_dicnn_rgb(opts) 96 | ``` 97 | 98 | ## Citing Dynamic Image Networks 99 | 100 | If you find the code useful, please cite: 101 | 102 | @inproceedings{Bilen2016a, 103 | author = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A. and Gould, S.", 104 | title = "Dynamic Image Networks for Action Recognition", 105 | booktitle = "CVPR", 106 | year = "2016" 107 | } 108 | @journal{Bilen2017a, 109 | author = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A.", 110 | title = "Action Recognition with Dynamic Image Networks", 111 | journal = " IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)", 112 | year = "2017" 113 | } 114 | 115 | ## License 116 | The analysis work performed with the program(s) must be non-proprietary work. Licensee and its contract users must be or be affiliated with an academic facility. Licensee may additionally permit individuals who are students at such academic facility to access and use the program(s). Such students will be considered contract users of licensee. The program(s) may not be used for commercial competitive analysis (such as benchmarking) or for any commercial activity, including consulting. 117 | 118 | -------------------------------------------------------------------------------- /Datasets/cnn_ucf101_of_setup_data.m: -------------------------------------------------------------------------------- 1 | function imdb = cnn_ucf101_of_setup_data(varargin) 2 | % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set 3 | % http://crcv.ucf.edu/data/UCF101.php 4 | % this script requires UCF101 downloaded and frames extracted in frames 5 | % folder 6 | 7 | opts.dataDir = fullfile('data','UCF101') ; 8 | opts.lite = false ; 9 | opts = vl_argparse(opts, varargin) ; 10 | 11 | %% ------------------------------------------------------------------------ 12 | % Load categories metadata 13 | % ------------------------------------------------------------------------- 14 | 15 | % find metadata 16 | metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ; 17 | 18 | fprintf('using metadata %s\n', metaPath) ; 19 | tmp = importdata(metaPath); 20 | nCls = numel(tmp); 21 | 22 | if nCls ~= 101 23 | error('Wrong meta file %s',metaPath); 24 | end 25 | 26 | cats = cell(1,nCls); 27 | for i=1:numel(tmp) 28 | t = strsplit(tmp{i}); 29 | cats{i} = t{2}; 30 | end 31 | 32 | imdb.classes.name = sort(cats) ; 33 | imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow', 'u') ; 34 | 35 | %% ------------------------------------------------------------------------ 36 | % load image names and labels 37 | % ------------------------------------------------------------------------- 38 | 39 | fprintf('searching training images ...\n') ; 40 | names = {} ; 41 | name = {}; 42 | labels = {} ; 43 | for d = dir(fullfile(imdb.imageDir, 'v_*'))' 44 | [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ; 45 | if lab==0 46 | error('no class label found for %s',d.name); 47 | end 48 | ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ; 49 | name{end+1} = d.name; 50 | names{end+1} = strcat([d.name, filesep], {ims.name}) ; 51 | labels{end+1} = lab ; 52 | if mod(numel(names), 10) == 0, fprintf('.') ; end 53 | if mod(numel(names), 500) == 0, fprintf('\n') ; end 54 | %fprintf('found %s with %d images\n', d.name, numel(ims)) ; 55 | end 56 | % names = horzcat(names{:}) ; 57 | 58 | labels = horzcat(labels{:}) ; 59 | % labels = [labels ; labels] ; 60 | labels = labels(:)' ; 61 | 62 | for i=1:numel(names) 63 | nn = names{i} ; 64 | nn1 = strcat('u/',nn) ; 65 | nn2 = strcat('v/',nn) ; 66 | 67 | names{i} = cell(1,2*numel(nn1)) ; 68 | names{i}(1:2:end) = nn1 ; 69 | names{i}(2:2:end) = nn2 ; 70 | end 71 | 72 | imdb.images.id = 1:numel(names) ; 73 | imdb.images.name = name ; 74 | imdb.images.names = names ; 75 | imdb.images.label = labels ; 76 | imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ; 77 | 78 | %% ------------------------------------------------------------------------ 79 | % load train / test splits 80 | % ------------------------------------------------------------------------- 81 | 82 | fprintf('labeling data...(this may take couple of minutes)') ; 83 | imdb.images.sets = zeros(3, numel(names)) ; 84 | setNames = {'train','test'}; 85 | setVal = [1,3]; 86 | 87 | for s=1:numel(setNames) 88 | for i=1:3 89 | trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',... 90 | setNames{s},i)) ; 91 | trainList = importdata(trainFl); 92 | if isfield(trainList,'textdata') 93 | trainList = trainList.textdata; 94 | end 95 | for j=1:numel(trainList) 96 | tmp = strsplit(trainList{j},'/'); 97 | [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ; 98 | if lab==0 99 | % error('cannot find the video %s',tmp{2}(1:end-4)); 100 | warning('cannot find the video %s',tmp{2}(1:end-4)); 101 | continue ; 102 | end 103 | % if trainList.data(j) ~= labels(lab) 104 | % error('Labels do not match for %s',tmp{2}); 105 | % end 106 | imdb.images.sets(i,lab) = setVal(s); 107 | end 108 | end 109 | end 110 | fprintf('\n') ; 111 | %% ------------------------------------------------------------------------ 112 | % Postprocessing 113 | % ------------------------------------------------------------------------- 114 | 115 | % sort categories by WNID (to be compatible with other implementations) 116 | [imdb.classes.name,perm] = sort(imdb.classes.name) ; 117 | relabel(perm) = 1:numel(imdb.classes.name) ; 118 | ok = imdb.images.label > 0 ; 119 | imdb.images.label(ok) = relabel(imdb.images.label(ok)) ; 120 | 121 | if opts.lite 122 | % pick a small number of images for the first 10 classes 123 | % this cannot be done for test as we do not have test labels 124 | clear keep ; 125 | for i=1:10 126 | sel = find(imdb.images.label == i) ; 127 | train = sel(imdb.images.sets(1,sel) == 1) ; 128 | test = sel(imdb.images.sets(1,sel) == 3) ; 129 | keep{i} = [train test] ; 130 | end 131 | keep = keep{:}; 132 | imdb.images.id = imdb.images.id(keep) ; 133 | imdb.images.name = imdb.images.name(keep) ; 134 | imdb.images.names = imdb.images.names(keep) ; 135 | imdb.images.sets = imdb.images.sets(1,keep) ; 136 | imdb.images.label = imdb.images.label(keep) ; 137 | end 138 | -------------------------------------------------------------------------------- /dicnn/cnn_init_cafferef.m: -------------------------------------------------------------------------------- 1 | % ------------------------------------------------------------------------- 2 | function net = cnn_init_cafferef(net,opts) 3 | % ------------------------------------------------------------------------- 4 | 5 | drop6p = find(cellfun(@(a) strcmp(a.name, 'dropout6'), net.layers)==1); 6 | drop7p = find(cellfun(@(a) strcmp(a.name, 'dropout7'), net.layers)==1); 7 | 8 | if ~isempty(drop6p) 9 | assert(~isempty(drop7p)); 10 | net.layers{drop6p}.rate = opts.DropOutRate; 11 | net.layers{drop7p}.rate = opts.DropOutRate; 12 | else 13 | relu6p = find(cellfun(@(a) strcmp(a.name, 'relu6'), net.layers)==1); 14 | relu7p = find(cellfun(@(a) strcmp(a.name, 'relu7'), net.layers)==1); 15 | 16 | drop6 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout6') ; 17 | drop7 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout7') ; 18 | net.layers = [net.layers(1:relu6p) drop6 net.layers(relu6p+1:relu7p) drop7 net.layers(relu7p+1:end)]; 19 | end 20 | 21 | % replace fc8 22 | fc8l = cellfun(@(a) strcmp(a.name, 'fc8'), net.layers)==1; 23 | 24 | nCls = opts.nCls ; 25 | % nCls = 101; 26 | sizeW = size(net.layers{fc8l}.weights{1}); 27 | 28 | if sizeW(4)~=nCls 29 | net.layers{fc8l}.weights = {zeros(sizeW(1),sizeW(2),sizeW(3),nCls,'single'), ... 30 | zeros(1, nCls, 'single')}; 31 | end 32 | 33 | % change loss 34 | % net.layers(end) = []; 35 | net.layers{end} = struct('name','loss', 'type','softmaxloss') ; 36 | 37 | % convert to dagnn 38 | net = dagnn.DagNN.fromSimpleNN(net, 'canonicalNames', true) ; 39 | 40 | poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); 41 | assert(~isempty(poolLyr1)); 42 | % configure appr-rank-pool 43 | switch opts.pool1Type 44 | case 'arpool' 45 | if strcmp(opts.pool1Layer,'conv1') 46 | net.addLayer('arpool',AppRankPooling('scale',1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN'); 47 | net.addLayer('l2normalize',L2Normalize('scale',6000,'clip',[-128 128]),... 48 | 'DynImgN','DynImg'); 49 | else 50 | net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN'); 51 | net.addLayer('reluP',dagnn.ReLU(),... 52 | {'DynImgN'},'DynImg'); 53 | end 54 | net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ; 55 | case 'ppool1' 56 | if strcmp(opts.pool1Layer,'conv1') 57 | net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 58 | {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'}); 59 | else 60 | net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 61 | {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'}); 62 | net.addLayer('reluP',dagnn.ReLU(),... 63 | {'DynImgN'},'DynImg'); 64 | end 65 | 66 | net.layers(poolLyr1).inputs{1} = 'DynImg' ; 67 | % net.params(end-1).value = 0.01 * randn(1,1,10,1,'single'); 68 | net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); 69 | net.params(end).value = zeros(1,1,'single'); 70 | 71 | net.params(end-1).learningRate = 0.1 ; 72 | net.params(end).learningRate = 0.2 ; 73 | case 'ppool2' 74 | if strcmp(opts.pool1Layer,'conv1') 75 | net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 76 | {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'}); 77 | else 78 | net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 79 | {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'}); 80 | net.addLayer('reluP',dagnn.ReLU(),... 81 | {'DynImgN'},'DynImg'); 82 | end 83 | 84 | net.layers(poolLyr1).inputs{1} = 'DynImg' ; 85 | % net.params(end-1).value = 0.01 * randn(1,1,10,1,'single'); 86 | net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); 87 | net.params(end).value = zeros(1,1,'single'); 88 | 89 | net.params(end-1).learningRate = 0.1 ; 90 | net.params(end).learningRate = 0.2 ; 91 | case 'none' 92 | 93 | otherwise 94 | error('Unknown pool type %s', opts.pool1Type) ; 95 | end 96 | 97 | 98 | 99 | % second pool layer (max pooling) 100 | poolLyr2 = find(arrayfun(@(a) strcmp(a.name, opts.pool2Layer), net.layers)==1); 101 | net.addLayer('tempPoolMax',TemporalPooling('method','max'),... 102 | {net.layers(poolLyr2(1)).inputs{1},'VideoId2'},'tempPoolMax'); 103 | 104 | net.layers(poolLyr2).inputs{1} = 'tempPoolMax'; 105 | 106 | % add multi-class error 107 | net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr'); 108 | 109 | net_ = net.saveobj ; 110 | net = dagnn.DagNN.loadobj(net_) ; 111 | 112 | net.removeLayer('loss') ; 113 | net.addLayer('loss', ... 114 | LossNormalized('loss', 'softmaxlog') ,... 115 | {'prediction', 'label'}, ... 116 | 'objective') ; 117 | 118 | % replace standard matconvnet bnorm with my version 119 | bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1); 120 | for i=1:numel(bns) 121 | bb = net.layers(bns(i)).block ; 122 | net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,... 123 | 'epsilon',bb.epsilon,... 124 | 'opts',bb.opts) ; 125 | end 126 | -------------------------------------------------------------------------------- /dicnn/cnn_video_rgb_get_batch.m: -------------------------------------------------------------------------------- 1 | function imo = cnn_video_rgb_get_batch(images, vids, varargin) 2 | % CNN_VIDEO_RGB_GET_BATCH Load, preprocess, and pack images for CNN evaluation 3 | 4 | % video ids 5 | % use same spatial jittering for frames from the same video 6 | % NOTE: all the frames from a video should have the same size (wxh) 7 | 8 | opts.imageSize = [227, 227] ; 9 | opts.border = [29, 29] ; 10 | opts.keepAspect = true ; 11 | opts.numAugments = 1 ; 12 | opts.transformation = 'none' ; 13 | opts.averageImage = [] ; 14 | opts.rgbVariance = zeros(0,3,'single') ; 15 | opts.interpolation = 'bilinear' ; 16 | opts.numThreads = 1 ; 17 | opts.prefetch = false ; 18 | opts.subMean = false ; % subtract the mean from each video 19 | opts.lazyResize = true ; 20 | 21 | opts = vl_argparse(opts, varargin); 22 | 23 | % fetch is true if images is a list of filenames (instead of 24 | % a cell array of images) 25 | fetch = numel(images) >= 1 && ischar(images{1}) ; 26 | 27 | % prefetch is used to load images in a separate thread 28 | prefetch = fetch & opts.prefetch ; 29 | 30 | if prefetch 31 | vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ; 32 | imo = [] ; 33 | return ; 34 | end 35 | if fetch 36 | im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ; 37 | else 38 | im = images ; 39 | end 40 | 41 | tfs = [] ; 42 | switch opts.transformation 43 | case 'none' 44 | tfs = [ 45 | .5 ; 46 | .5 ; 47 | 0 ] ; 48 | case 'f5' 49 | tfs = [... 50 | .5 0 0 1 1 .5 0 0 1 1 ; 51 | .5 0 1 0 1 .5 0 1 0 1 ; 52 | 0 0 0 0 0 1 1 1 1 1] ; 53 | case 'f25' 54 | [tx,ty] = meshgrid(linspace(0,1,5)) ; 55 | tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ; 56 | tfs_ = tfs ; 57 | tfs_(3,:) = 1 ; 58 | tfs = [tfs,tfs_] ; 59 | case 'stretch' 60 | case 'multiScaleRegular' 61 | otherwise 62 | error('Uknown transformations %s', opts.transformation) ; 63 | end 64 | [~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ; 65 | 66 | if ~isempty(opts.rgbVariance) && isempty(opts.averageImage) 67 | opts.averageImage = zeros(1,1,3) ; 68 | end 69 | if numel(opts.averageImage) == 3 70 | opts.averageImage = reshape(opts.averageImage, 1,1,3) ; 71 | end 72 | 73 | imo = zeros(opts.imageSize(1), opts.imageSize(2), 3, ... 74 | numel(images)*opts.numAugments, 'single') ; 75 | 76 | nVid = max(vids); 77 | si = 1 ; 78 | countv = 1; 79 | for v=1:nVid 80 | 81 | vid = find(vids==v); 82 | 83 | for i=1:numel(images(vid)) 84 | 85 | % acquire image 86 | if isempty(im{i}) 87 | imt = imread(images{vid(i)}) ; 88 | imt = single(imt) ; % faster than im2single (and multiplies by 255) 89 | else 90 | imt = im{vid(i)} ; 91 | end 92 | if size(imt,3) == 1 93 | imt = cat(3, imt, imt, imt) ; 94 | end 95 | 96 | % resize 97 | w = size(imt,2) ; 98 | h = size(imt,1) ; 99 | factor = [(opts.imageSize(1)+opts.border(1))/h ... 100 | (opts.imageSize(2)+opts.border(2))/w]; 101 | 102 | if opts.keepAspect 103 | factor = max(factor) ; 104 | end 105 | if any(abs(factor - 1) > 0.0001) 106 | imt = imresize(imt, ... 107 | 'scale', factor, ... 108 | 'method', opts.interpolation) ; 109 | end 110 | 111 | % crop & flip 112 | if i==1 113 | w = size(imt,2) ; 114 | h = size(imt,1) ; 115 | switch opts.transformation 116 | case 'stretch' 117 | sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ; 118 | dx = randi(w - sz(2) + 1, 1) ; 119 | dy = randi(h - sz(1) + 1, 1) ; 120 | flip = rand > 0.5 ; 121 | case 'multiScaleRegular' 122 | reg_szs = [256, 224, 192, 168] ; 123 | sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4)); 124 | 125 | dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1; 126 | dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1; 127 | corner = randi(5); 128 | dx = dx(corner); dy = dy(corner); 129 | flip = rand > 0.5 ; 130 | otherwise 131 | tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ; 132 | sz = opts.imageSize(1:2) ; 133 | dx = floor((w - sz(2)) * tf(2)) + 1 ; 134 | dy = floor((h - sz(1)) * tf(1)) + 1 ; 135 | flip = tf(3) ; 136 | end 137 | 138 | end 139 | 140 | if opts.lazyResize 141 | sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ; 142 | sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ; 143 | else 144 | factor = [opts.imageSize(1)/sz(1) ... 145 | opts.imageSize(2)/sz(2)]; 146 | if any(abs(factor - 1) > 0.0001) 147 | imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), ... 148 | opts.imageSize(1:2), 'Antialiasing', false, ... 149 | 'Method', opts.interpolation); 150 | end 151 | sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1); 152 | end 153 | 154 | 155 | if flip 156 | sx = fliplr(sx) ; 157 | end 158 | 159 | imo(:,:,:,si) = imt(sy,sx,:) ; 160 | si = si + 1 ; 161 | end 162 | countv = countv + numel(images(vid)); 163 | 164 | end 165 | 166 | if ~isempty(opts.averageImage) && numel(opts.averageImage)==3 167 | if ~isempty(opts.rgbVariance) 168 | imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(3,1), 1,1,3)) ; 169 | else 170 | imo = bsxfun(@minus, imo, opts.averageImage) ; 171 | end 172 | end 173 | -------------------------------------------------------------------------------- /dicnn/cnn_init_resnext.m: -------------------------------------------------------------------------------- 1 | % ------------------------------------------------------------------------- 2 | function net = cnn_init_resnext(net,opts) 3 | % ------------------------------------------------------------------------- 4 | % initialize classifier 5 | net = dagnn.DagNN.loadobj(net) ; 6 | 7 | % convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1); 8 | 9 | fclayer = net.getLayer('classifier_0') ; 10 | sizeW = size(net.params(fclayer.paramIndexes(1)).value); 11 | 12 | % opts.nCls = 101; 13 | nCls = opts.nCls ; 14 | DropOutRate = opts.DropOutRate ; 15 | 16 | 17 | net.params(fclayer.paramIndexes(1)).value = ... 18 | 0.01 * randn([sizeW(1:3),nCls],'single') ; 19 | net.params(fclayer.paramIndexes(2)).value = zeros(nCls,1,'single') ; 20 | 21 | 22 | % change loss 23 | softmax = find(arrayfun(@(a) isa(a.block, 'dagnn.SoftMax'), net.layers)==1); 24 | if ~isempty(softmax) 25 | net.removeLayer(net.layers(softmax(1)).name) ; 26 | end 27 | % convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1); 28 | fclayer = find(arrayfun(@(a) strcmp(a.name, 'classifier_0'), net.layers)==1); 29 | net.renameVar(net.layers(fclayer(end)).name,'prediction') ; 30 | net.renameVar('data','input') ; 31 | 32 | %------------------------------------------------------------------------% 33 | % configure appr-rank-pool 34 | switch opts.pool1Type 35 | case 'arpool' 36 | if strcmp(opts.pool1Layer,'conv0') 37 | poolLyr1 = 1 ; 38 | net.addLayer('arpool',AppRankPooling('scale',0.1),{'input','VideoId1'},'DynImg'); 39 | net.setLayerInputs(net.layers(1).name,{'DynImg'}) ; 40 | else 41 | poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); 42 | assert(~isempty(poolLyr1)); 43 | net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg'); 44 | net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ; 45 | end 46 | case 'ppool1' 47 | if strcmp(opts.pool1Layer,'conv0') 48 | poolLyr1 = 1 ; 49 | else 50 | poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); 51 | end 52 | net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 53 | {'features_4_0_merge','VideoId1'},'DynImg0',{'conv0f','conv0b'}); 54 | 55 | % net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); 56 | net.params(end-1).value = 0.1 * randn(1,1,10,1,'single'); 57 | net.params(end).value = zeros(1,1,'single'); 58 | 59 | net.addLayer('BnormDyn',dagnn.BatchNorm('numChannels',256),'DynImg0','DynImg',... 60 | {'dym','dyb','dybx'}) ; 61 | net.params(end-2).value = ones(256,1,'single') ; 62 | net.params(end-1).value = zeros(256,1,'single') ; 63 | net.params(end).value = zeros(256,2,'single') ; 64 | 65 | % net.addLayer('reluP',dagnn.ReLU(),... 66 | % {'DynImg1'},'DynImg'); 67 | net.layers(16).inputs{1} = 'DynImg' ; 68 | for i=numel(net.params)-4:numel(net.params), 69 | net.params(i).learningRate = 0.1 * net.params(i).learningRate; 70 | end 71 | case 'none' 72 | otherwise 73 | error('Unknown pool type %s', opts.pool1Type) ; 74 | end 75 | 76 | 77 | net.rebuild() ; 78 | %------------------------------------------------------------------------% 79 | % second pool layer (max pooling) 80 | % poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'pool5'), net.layers)==1); 81 | poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_merge'), net.layers)==1); 82 | net.addLayer('tempPoolMax',TemporalPooling('method','max'),... 83 | {net.layers(poolLyr2(1)).outputs{1},'VideoId2'},'tempPoolMax'); 84 | 85 | % change the input of fc last layer 86 | % net.setLayerInputs(net.layers(convs(end)).name,'tempPoolMax') ; 87 | % net.addLayer('bnar',dagnn.BatchNorm('numChannels',2048),{'tempPoolMax'},... 88 | % 'tempPoolMaxbn',{'bnar_m','bnar_b','bnar_x'}); 89 | poolLyr2next = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_id_relu'), net.layers)==1); 90 | net.setLayerInputs(net.layers(poolLyr2next(1)).name,{'tempPoolMax'}) ; 91 | net.rebuild() ; 92 | %------------------------------------------------------------------------% 93 | % add drop-out layers 94 | if DropOutRate>0 95 | 96 | pool5 = find(arrayfun(@(a) strcmp(a.name, 'features_8'), net.layers)==1); 97 | oo = net.layers(pool5(1)).outputs{1}; 98 | net.addLayer('drop_pool5',dagnn.DropOut('rate',DropOutRate),... 99 | oo,sprintf('drop_%s',oo),{}); 100 | net.setLayerInputs('classifier_permute',{sprintf('drop_%s',oo)}) ; 101 | end 102 | 103 | 104 | %------------------------------------------------------------------------% 105 | % add multi-class error 106 | net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr'); 107 | 108 | net.addLayer('loss', ... 109 | LossNormalized('loss', 'softmaxlog') ,... 110 | {'prediction', 'label'}, ... 111 | 'objective') ; 112 | 113 | %------------------------------------------------------------------------% 114 | net.rebuild() 115 | 116 | % replace standard matconvnet bnorm with my version 117 | bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1); 118 | for i=1:numel(bns) 119 | bb = net.layers(bns(i)).block ; 120 | net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,... 121 | 'epsilon',bb.epsilon,... 122 | 'opts',bb.opts) ; 123 | end 124 | 125 | % dagMergeBatchNorm(net) ; 126 | % dagRemoveLayersOfType(net, 'dagnn.BatchNorm') ; 127 | net_ = net.saveobj ; 128 | net = dagnn.DagNN.loadobj(net_) ; 129 | net.meta.normalization.border = [32 32] ; 130 | -------------------------------------------------------------------------------- /dicnn/cnn_video_of_get_batch.m: -------------------------------------------------------------------------------- 1 | function imo = cnn_video_of_get_batch(images, vids, varargin) 2 | % CNN_VIDEO_OF_GET_BATCH Load, preprocess, and pack images for CNN evaluation 3 | 4 | % video ids 5 | % use same spatial jittering for frames from the same video 6 | % NOTE: all the frames from a video should have the same size (wxh) 7 | 8 | opts.imageSize = [227, 227] ; 9 | opts.border = [29, 29] ; 10 | opts.keepAspect = true ; 11 | opts.numAugments = 1 ; 12 | opts.transformation = 'multiScaleRegular' ; 13 | opts.averageImage = [] ; 14 | opts.rgbVariance = zeros(0,2,'single') ; 15 | opts.interpolation = 'bilinear' ; 16 | opts.numThreads = 1 ; 17 | opts.prefetch = false ; 18 | opts.lazyResize = true ; 19 | opts.subMean = false; % subtract the mean from each video 20 | opts = vl_argparse(opts, varargin); 21 | 22 | % fetch is true if images is a list of filenames (instead of 23 | % a cell array of images) 24 | fetch = numel(images) >= 1 && ischar(images{1}) ; 25 | 26 | % prefetch is used to load images in a separate thread 27 | prefetch = fetch & opts.prefetch ; 28 | 29 | if prefetch 30 | vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ; 31 | imo = [] ; 32 | return ; 33 | end 34 | if fetch 35 | im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ; 36 | else 37 | im = images ; 38 | end 39 | 40 | tfs = [] ; 41 | switch opts.transformation 42 | case 'none' 43 | tfs = [ 44 | .5 ; 45 | .5 ; 46 | 0 ] ; 47 | case 'f5' 48 | tfs = [... 49 | .5 0 0 1 1 .5 0 0 1 1 ; 50 | .5 0 1 0 1 .5 0 1 0 1 ; 51 | 0 0 0 0 0 1 1 1 1 1] ; 52 | case 'f25' 53 | [tx,ty] = meshgrid(linspace(0,1,5)) ; 54 | tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ; 55 | tfs_ = tfs ; 56 | tfs_(3,:) = 1 ; 57 | tfs = [tfs,tfs_] ; 58 | case 'stretch' 59 | case 'multiScaleRegular' 60 | otherwise 61 | error('Uknown transformations %s', opts.transformation) ; 62 | end 63 | [~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ; 64 | 65 | if ~isempty(opts.rgbVariance) && isempty(opts.averageImage) 66 | opts.averageImage = zeros(1,1,2) ; 67 | end 68 | if numel(opts.averageImage) == 2 69 | opts.averageImage = reshape(opts.averageImage, 1,1,2) ; 70 | end 71 | 72 | imo = zeros(opts.imageSize(1), opts.imageSize(2), 2, ... 73 | numel(images)/2*opts.numAugments, 'single') ; 74 | 75 | nVid = max(vids); 76 | si = 1 ; 77 | countv = 1; 78 | for v=1:nVid 79 | 80 | vid = find(vids==v); 81 | 82 | for i=1:numel(images(vid)) 83 | 84 | % acquire image 85 | if isempty(im{i}) 86 | imt1 = imread(images{2*vid(i)-1}) ; 87 | imt2 = imread(images{2*vid(i)}) ; 88 | else 89 | imt1 = im{2*vid(i)-1} ; 90 | imt2 = im{2*vid(i)} ; 91 | end 92 | imt = single(cat(3,imt1,imt2)) ; % faster than im2single (and multiplies by 255) 93 | 94 | % resize 95 | w = size(imt,2) ; 96 | h = size(imt,1) ; 97 | factor = [(opts.imageSize(1)+opts.border(1))/h ... 98 | (opts.imageSize(2)+opts.border(2))/w]; 99 | 100 | if opts.keepAspect 101 | factor = max(factor) ; 102 | end 103 | if any(abs(factor - 1) > 0.0001) 104 | imt = imresize(imt, ... 105 | 'scale', factor, ... 106 | 'method', opts.interpolation) ; 107 | end 108 | 109 | % crop & flip 110 | if i==1 111 | flip = rand > 0.5 ; 112 | w = size(imt,2) ; 113 | h = size(imt,1) ; 114 | switch opts.transformation 115 | case 'stretch' 116 | sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ; 117 | dx = randi(w - sz(2) + 1, 1) ; 118 | dy = randi(h - sz(1) + 1, 1) ; 119 | % flip = rand > 0.5 ; 120 | case 'multiScaleRegular' 121 | reg_szs = [256, 224, 192, 168] ; 122 | sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4)); 123 | 124 | dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1; 125 | dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1; 126 | corner = randi(5); 127 | dx = dx(corner); dy = dy(corner); 128 | otherwise 129 | tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ; 130 | sz = opts.imageSize(1:2) ; 131 | dx = floor((w - sz(2)) * tf(2)) + 1 ; 132 | dy = floor((h - sz(1)) * tf(1)) + 1 ; 133 | % flip = tf(3) ; 134 | end 135 | 136 | end 137 | if opts.lazyResize 138 | sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ; 139 | sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ; 140 | else 141 | factor = [opts.imageSize(1)/sz(1) ... 142 | opts.imageSize(2)/sz(2)]; 143 | if any(abs(factor - 1) > 0.0001) 144 | imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), [opts.imageSize(1:2)],... 145 | 'Antialiasing', false, 'Method', opts.interpolation); 146 | end 147 | sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1); 148 | end 149 | if flip 150 | sx = fliplr(sx) ; 151 | imo(:,:,1,si) = 255 - imt(sy,sx,1) ; 152 | imo(:,:,2,si) = imt(sy,sx,2) ; 153 | else 154 | imo(:,:,:,si) = imt(sy,sx,:) ; 155 | end 156 | si = si + 1 ; 157 | end 158 | 159 | countv = countv + numel(images(vid)); 160 | end 161 | if ~isempty(opts.averageImage) && numel(opts.averageImage)==2 162 | if ~isempty(opts.rgbVariance) 163 | imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(2,1), 1,1,3)) ; 164 | else 165 | imo = bsxfun(@minus, imo, opts.averageImage) ; 166 | end 167 | end 168 | 169 | 170 | -------------------------------------------------------------------------------- /dicnn/cnn_single_of.m: -------------------------------------------------------------------------------- 1 | function [net, info] = cnn_single_of(varargin) 2 | %CNN_SINGLE_OF Demonstrates fine-tuning a pre-trained CNN with static 3 | % optical flow (OF in pami journal) on UCF101 dataset 4 | 5 | run(fullfile(fileparts(mfilename('fullpath')), ... 6 | '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; 7 | 8 | addpath Layers Datasets 9 | 10 | opts.dataDir = fullfile('data','UCF101') ; 11 | opts.expDir = fullfile('exp', 'UCF101') ; 12 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ; 13 | [opts, varargin] = vl_argparse(opts, varargin) ; 14 | 15 | opts.numFetchThreads = 8 ; 16 | 17 | opts.lite = false ; 18 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat'); 19 | 20 | opts.DropOutRate = 0.85 ; 21 | opts.datasetFn = @cnn_ucf101_of_setup_data ; 22 | opts.networkFn = @cnn_resnext_init ; 23 | 24 | opts.split = 1; % data split 25 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] 26 | opts.numDynImgs = 10 ; 27 | opts.epochFactor = 5 ; 28 | opts.pool1Layer = 'conv0'; % before conv1 29 | opts.pool1Type = 'none' ; 30 | opts.pool2Layer = 'fc6' ; 31 | 32 | opts.train = struct() ; 33 | opts.train.gpus = []; 34 | opts.train.batchSize = 128 ; 35 | opts.train.numSubBatches = 32 ; 36 | opts.train.solver = [] ; 37 | opts.train.prefetch = true ; 38 | opts.train.learningRate = 1e-2 ; 39 | opts.train.numEpochs = 30 ; 40 | 41 | opts = vl_argparse(opts, varargin) ; 42 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; 43 | 44 | 45 | % ------------------------------------------------------------------------- 46 | % Prepare data 47 | % ------------------------------------------------------------------------- 48 | 49 | if exist(opts.imdbPath,'file') 50 | imdb = load(opts.imdbPath) ; 51 | else 52 | imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; 53 | mkdir(opts.expDir) ; 54 | save(opts.imdbPath, '-struct', 'imdb') ; 55 | end 56 | 57 | % UCF101 has 3 data splits 58 | if opts.split>3 59 | error('split should be <=3'); 60 | end 61 | imdb.images.set = imdb.images.sets(opts.split,:); 62 | 63 | % reverse frame order 64 | if opts.reverseDyn 65 | for i=1:numel(imdb.images.names) 66 | imdb.images.names{i} = imdb.images.names{i}(end:-1:1); 67 | end 68 | end 69 | % ------------------------------------------------------------------------- 70 | % Prepare model 71 | % ------------------------------------------------------------------------- 72 | net = load(opts.modelPath); 73 | if isfield(net,'net') 74 | net = net.net; 75 | end 76 | opts.nCls = max(imdb.images.label) ; 77 | % net = dagnn.DagNN.loadobj(net) ; 78 | net = opts.networkFn(net,opts) ; 79 | 80 | % two channels instead of 3 RGB 81 | net.params(1).value = net.params(1).value(:,:,1:2,:) ; 82 | 83 | % Set the class names in the network 84 | net.meta.classes.name = imdb.classes.name ; 85 | net.meta.classes.description = imdb.classes.name ; 86 | 87 | % ------------------------------------------------------------------------- 88 | % Learn 89 | % ------------------------------------------------------------------------- 90 | if opts.epochFactor>0 91 | opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; 92 | else 93 | opts.train.train = NaN ; 94 | end 95 | opts.train.val = find(imdb.images.set==3) ; 96 | 97 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 98 | 'expDir', opts.expDir, ... 99 | opts.train) ; 100 | 101 | % ------------------------------------------------------------------------- 102 | % Report accuracy 103 | % ------------------------------------------------------------------------- 104 | errlayer = net.getLayerIndex('errMC') ; 105 | 106 | if ~isnan(errlayer) 107 | cats = imdb.classes.name ; 108 | accs = net.layers(errlayer).block.accuracy ; 109 | 110 | if numel(cats)~=numel(accs) 111 | error('wrong number of classes\n') ; 112 | end 113 | 114 | for i=1:numel(cats) 115 | fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; 116 | end 117 | fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; 118 | end 119 | 120 | % ------------------------------------------------------------------------- 121 | function fn = getBatchFn(opts, meta) 122 | % ------------------------------------------------------------------------- 123 | useGpu = numel(opts.train.gpus) > 0 ; 124 | 125 | bopts.numThreads = opts.numFetchThreads ; 126 | bopts.imageSize = meta.normalization.imageSize ; 127 | if isfield(meta.normalization,'border') 128 | bopts.border = meta.normalization.border ; 129 | else 130 | bopts.border = meta.normalization.imageSize(1:2) ./ ... 131 | meta.normalization.cropSize - meta.normalization.imageSize(1:2); 132 | end 133 | 134 | bopts.averageImage = 128 * ones([1 1 2],'single') ; 135 | bopts.numDynImgs = opts.numDynImgs ; 136 | % bopts.averageImage = meta.normalization.averageImage ; 137 | % bopts.rgbVariance = meta.augmentation.rgbVariance ; 138 | % bopts.transformation = meta.augmentation.transformation ; 139 | bopts.transformation = 'stretch' ; 140 | bopts.transformation = 'multiScaleRegular' ; 141 | 142 | fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; 143 | 144 | 145 | 146 | % ------------------------------------------------------------------------- 147 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch) 148 | % ------------------------------------------------------------------------- 149 | 150 | % batch refers to videos (not for frames) 151 | if isempty(batch) 152 | inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; 153 | return; 154 | end 155 | 156 | isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; 157 | 158 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end 159 | 160 | names = imdb.images.names(batch); 161 | 162 | 163 | % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; 164 | 165 | namesM = {}; 166 | nVids = numel(batch); 167 | 168 | VideoId1 = []; 169 | VideoId2 = []; 170 | 171 | % step-size 172 | stepSize = 6; 173 | % pool nFrames into a dynamic image 174 | nFrames = 1; 175 | % number of dynamic images to be max pooled later 176 | nDynImgs = opts.numDynImgs ; 177 | opts = rmfield(opts,'numDynImgs') ; 178 | 179 | 180 | c1 = 1; 181 | for v=1:nVids 182 | 183 | name = names{v}; 184 | nFrms = numel(name)/2; 185 | 186 | nSample = nFrames; 187 | nr = numel(1:stepSize:nFrms); 188 | 189 | % jitter by removing 50 % and limit a batch to nMaxs * nSamples images 190 | if nr > 1 && (~isVal && nr>nDynImgs) 191 | rat = min(nDynImgs,ceil(0.50*nr)); 192 | ri = randperm(nr); 193 | ri = ri(1:rat); 194 | r = zeros(1,nr); 195 | r(ri) = 1; 196 | else 197 | r = ones(1,nr); 198 | end 199 | 200 | c3 = 1; 201 | c2 = 0; 202 | 203 | for f=1:stepSize:nFrms 204 | if r(c3) 205 | idx = f:min(f+nSample-1,nFrms) ; 206 | if numel(idx) 0 232 | if useGpu 233 | im = gpuArray(im) ; 234 | end 235 | inputs = {'input', im, 'label', imdb.images.label(batch), ... 236 | 'VideoId2', VideoId2}; 237 | 238 | end 239 | -------------------------------------------------------------------------------- /dicnn/cnn_single_rgb.m: -------------------------------------------------------------------------------- 1 | function [net, info] = cnn_single_rgb(varargin) 2 | %CNN_SINGLE_RGB Demonstrates fine-tuning a pre-trained CNN with static 3 | % RGB frames (SI in pami journal) on UCF101 dataset 4 | 5 | 6 | run(fullfile(fileparts(mfilename('fullpath')), ... 7 | '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; 8 | 9 | addpath Layers Datasets 10 | 11 | opts.dataDir = fullfile('data','UCF101') ; 12 | opts.expDir = fullfile('exp', 'UCF101') ; 13 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat'); 14 | opts.datasetFn = @cnn_ucf101_setup_data ; 15 | opts.networkFn = @cnn_init_resnext ; 16 | opts.pool1Type = 'none' ; 17 | opts.pool1Layer = 'conv1' ; 18 | opts.pool2Layer = '' ; 19 | [opts, varargin] = vl_argparse(opts, varargin) ; 20 | 21 | opts.numFetchThreads = 8 ; 22 | 23 | opts.lite = false ; 24 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat'); 25 | opts.ARPoolLayer = 'conv0'; % before conv1 26 | opts.DropOutRate = 0.5 ; 27 | opts.epochFactor = 5 ; 28 | 29 | opts.split = 1; % data split 30 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] 31 | opts.train = struct() ; 32 | opts.train.gpus = []; 33 | opts.train.batchSize = 128 ; 34 | opts.train.numSubBatches = 16 ; 35 | opts.train.solver = [] ; 36 | opts.train.prefetch = true ; 37 | opts.train.numEpochs = 30 ; 38 | % resnet50 39 | opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; 40 | % caffe-ref 41 | opts.train.learningRate = 1e-4 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; 42 | 43 | opts = vl_argparse(opts, varargin) ; 44 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; 45 | % opts.train.numEpochs = numel(opts.train.learningRate); 46 | 47 | % ------------------------------------------------------------------------- 48 | % Prepare data 49 | % ------------------------------------------------------------------------- 50 | 51 | if exist(opts.imdbPath,'file') 52 | imdb = load(opts.imdbPath) ; 53 | else 54 | imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; 55 | mkdir(opts.expDir) ; 56 | save(opts.imdbPath, '-struct', 'imdb') ; 57 | end 58 | 59 | % UCF101 has 3 data splits 60 | if opts.split>3 61 | error('split should be <=3'); 62 | end 63 | imdb.images.set = imdb.images.sets(opts.split,:); 64 | 65 | % reverse frame order 66 | if opts.reverseDyn 67 | for i=1:numel(imdb.images.names) 68 | imdb.images.names{i} = imdb.images.names{i}(end:-1:1); 69 | end 70 | end 71 | 72 | % ------------------------------------------------------------------------- 73 | % Prepare model 74 | % ------------------------------------------------------------------------- 75 | net = load(opts.modelPath); 76 | if isfield(net,'net') 77 | net = net.net; 78 | end 79 | opts.nCls = max(imdb.images.label) ; 80 | net = opts.networkFn(net,opts); 81 | 82 | if numel(net.meta.normalization.averageImage)>3 83 | sz = size(net.meta.normalization.averageImage) ; 84 | net.meta.normalization.averageImage = ... 85 | mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ; 86 | end 87 | 88 | % Set the class names in the network 89 | net.meta.classes.name = imdb.classes.name ; 90 | net.meta.classes.description = imdb.classes.name ; 91 | % ------------------------------------------------------------------------- 92 | % Learn 93 | % ------------------------------------------------------------------------- 94 | if opts.epochFactor>0 95 | opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; 96 | else 97 | opts.train.train = NaN ; 98 | end 99 | opts.train.val = find(imdb.images.set==3) ; 100 | 101 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 102 | 'expDir', opts.expDir, ... 103 | opts.train) ; 104 | 105 | % ------------------------------------------------------------------------- 106 | % Report accuracy 107 | % ------------------------------------------------------------------------- 108 | errlayer = net.getLayerIndex('errMC') ; 109 | 110 | if ~isnan(errlayer) 111 | cats = imdb.classes.name ; 112 | accs = net.layers(errlayer).block.accuracy ; 113 | 114 | if numel(cats)~=numel(accs) 115 | error('wrong number of classes\n') ; 116 | end 117 | 118 | for i=1:numel(cats) 119 | fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; 120 | end 121 | fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; 122 | end 123 | 124 | % ------------------------------------------------------------------------- 125 | function fn = getBatchFn(opts, meta) 126 | % ------------------------------------------------------------------------- 127 | useGpu = numel(opts.train.gpus) > 0 ; 128 | 129 | bopts.numThreads = opts.numFetchThreads ; 130 | bopts.imageSize = meta.normalization.imageSize ; 131 | if isfield(meta.normalization,'border') 132 | bopts.border = meta.normalization.border ; 133 | else 134 | bopts.border = meta.normalization.imageSize(1:2) ./ ... 135 | meta.normalization.cropSize - meta.normalization.imageSize(1:2); 136 | 137 | end 138 | 139 | % bopts.averageImage = []; 140 | bopts.averageImage = meta.normalization.averageImage ; 141 | bopts.interpolation = meta.normalization.interpolation ; 142 | bopts.keepAspect = meta.normalization.keepAspect ; 143 | % bopts.rgbVariance = meta.augmentation.rgbVariance ; 144 | % bopts.transformation = meta.augmentation.transformation ; 145 | 146 | 147 | fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; 148 | 149 | 150 | 151 | % ------------------------------------------------------------------------- 152 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch) 153 | % ------------------------------------------------------------------------- 154 | 155 | % batch refers to videos (not for frames) 156 | if isempty(batch) 157 | inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; 158 | return; 159 | end 160 | 161 | isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; 162 | 163 | % if ~isVal, transformation='stretch'; else transformation='none';end 164 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end 165 | 166 | names = imdb.images.names(batch); 167 | 168 | 169 | % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; 170 | 171 | namesM = {}; 172 | nVids = numel(batch); 173 | 174 | VideoId1 = []; 175 | VideoId2 = []; 176 | 177 | % step-size 178 | stepSize = 6; 179 | % pool nFrames into a dynamic image 180 | nFrames = 1; 181 | % number of dynamic images to be max pooled later 182 | nDynImgs = 10; 183 | 184 | 185 | c1 = 1; 186 | for v=1:nVids 187 | 188 | name = names{v}; 189 | nFrms = numel(name); 190 | 191 | nSample = nFrames; 192 | nr = numel(1:stepSize:nFrms); 193 | 194 | % jitter by removing 50 % and limit a batch to nMaxs * nSamples images 195 | if nr > 1 && (~isVal && nr>nDynImgs) 196 | rat = min(nDynImgs,ceil(0.50*nr)); 197 | ri = randperm(nr); 198 | ri = ri(1:rat); 199 | r = zeros(1,nr); 200 | r(ri) = 1; 201 | else 202 | r = ones(1,nr); 203 | end 204 | 205 | c3 = 1; 206 | c2 = 0; 207 | 208 | for f=1:stepSize:nFrms 209 | if r(c3) 210 | idx = f:min(f+nSample-1,nFrms) ; 211 | if numel(idx) 0 231 | if useGpu 232 | im = gpuArray(im) ; 233 | end 234 | inputs = {'input', im, 'label', imdb.images.label(batch), ... 235 | 'VideoId2', VideoId2}; 236 | end 237 | -------------------------------------------------------------------------------- /dicnn/cnn_dicnn_of.m: -------------------------------------------------------------------------------- 1 | function [net, info] = cnn_dicnn_of(varargin) 2 | %CNN_DICNN_OF Fine-tunes a pre-trained CNN with dynamic images on optical 3 | % (DOF in pami journal) flow frames on UCF101 dataset 4 | 5 | 6 | run(fullfile(fileparts(mfilename('fullpath')), ... 7 | '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; 8 | 9 | run(fullfile(fileparts(mfilename('fullpath')), ... 10 | '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ; 11 | 12 | run(fullfile(fileparts(mfilename('fullpath')), ... 13 | '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ; 14 | 15 | addpath Layers Datasets 16 | 17 | opts.dataDir = fullfile('data','UCF101') ; 18 | opts.expDir = fullfile('exp', 'UCF101') ; 19 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ; 20 | [opts, varargin] = vl_argparse(opts, varargin) ; 21 | 22 | opts.numFetchThreads = 8 ; 23 | 24 | opts.lite = false ; 25 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat'); 26 | opts.pool1Layer = 'conv0'; % before conv1 27 | opts.pool1Type = 'arpool'; % before conv1 28 | opts.pool2Layer = 'fc6'; % before conv1 29 | opts.DropOutRate = 0.85 ; 30 | opts.datasetFn = @cnn_ucf101_of_setup_data ; 31 | opts.networkFn = @cnn_init_resnext ; 32 | opts.network = [] ; 33 | 34 | opts.split = 1; % data split 35 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] 36 | opts.numDynImgs = 10 ; 37 | opts.epochFactor = 5 ; 38 | 39 | opts.train = struct() ; 40 | opts.train.gpus = []; 41 | opts.train.batchSize = 128 ; 42 | opts.train.numSubBatches = 32 ; 43 | opts.train.solver = [] ; 44 | opts.train.prefetch = true ; 45 | opts.train.learningRate = 1e-2 ; 46 | opts.train.numEpochs = 30 ; 47 | % opts.train.savePreds = true ; 48 | opts.train.randomSeed = 0 ; 49 | 50 | opts = vl_argparse(opts, varargin) ; 51 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; 52 | 53 | 54 | % ------------------------------------------------------------------------- 55 | % Prepare data 56 | % ------------------------------------------------------------------------- 57 | 58 | if exist(opts.imdbPath,'file') 59 | imdb = load(opts.imdbPath) ; 60 | else 61 | imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; 62 | mkdir(opts.expDir) ; 63 | save(opts.imdbPath, '-struct', 'imdb') ; 64 | end 65 | 66 | % UCF101 has 3 data splits 67 | if opts.split>3 68 | error('split should be <=3'); 69 | end 70 | imdb.images.set = imdb.images.sets(opts.split,:); 71 | 72 | % reverse frame order 73 | if opts.reverseDyn 74 | for i=1:numel(imdb.images.names) 75 | imdb.images.names{i} = imdb.images.names{i}(end:-1:1); 76 | end 77 | end 78 | % ------------------------------------------------------------------------- 79 | % Prepare model 80 | % ------------------------------------------------------------------------- 81 | if isempty(opts.network) 82 | net = load(opts.modelPath); 83 | if isfield(net,'net') 84 | net = net.net; 85 | end 86 | opts.nCls = max(imdb.images.label) ; 87 | % net = dagnn.DagNN.loadobj(net) ; 88 | net = opts.networkFn(net,opts) ; 89 | 90 | % two channels instead of 3 RGB 91 | net.params(1).value = net.params(1).value(:,:,1:2,:) ; 92 | 93 | % Set the class names in the network 94 | net.meta.classes.name = imdb.classes.name ; 95 | net.meta.classes.description = imdb.classes.name ; 96 | else 97 | assert(isa(opts.network,'dagnn.DagNN')) ; 98 | net = opts.network ; 99 | end 100 | 101 | % ------------------------------------------------------------------------- 102 | % Learn 103 | % ------------------------------------------------------------------------- 104 | if opts.epochFactor>0 105 | opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; 106 | else 107 | opts.train.train = NaN ; 108 | opts.train.numEpochs = 1 ; 109 | end 110 | opts.train.val = find(imdb.images.set==3) ; 111 | 112 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 113 | 'expDir', opts.expDir, ... 114 | opts.train) ; 115 | 116 | 117 | % ------------------------------------------------------------------------- 118 | % Report accuracy 119 | % ------------------------------------------------------------------------- 120 | errlayer = net.getLayerIndex('errMC') ; 121 | 122 | if ~isnan(errlayer) 123 | cats = imdb.classes.name ; 124 | accs = net.layers(errlayer).block.accuracy ; 125 | 126 | if numel(cats)~=numel(accs) 127 | error('wrong number of classes\n') ; 128 | end 129 | 130 | for i=1:numel(cats) 131 | fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; 132 | end 133 | fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; 134 | end 135 | % ------------------------------------------------------------------------- 136 | function fn = getBatchFn(opts, meta) 137 | % ------------------------------------------------------------------------- 138 | useGpu = numel(opts.train.gpus) > 0 ; 139 | 140 | bopts.numThreads = opts.numFetchThreads ; 141 | bopts.imageSize = meta.normalization.imageSize ; 142 | if isfield(meta.normalization,'border') 143 | bopts.border = meta.normalization.border ; 144 | else 145 | bopts.border = meta.normalization.imageSize(1:2) ./ ... 146 | meta.normalization.cropSize - meta.normalization.imageSize(1:2); 147 | end 148 | 149 | bopts.averageImage = 128 * ones([1 1 2],'single') ; 150 | bopts.numDynImgs = opts.numDynImgs ; 151 | 152 | fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; 153 | 154 | 155 | 156 | % ------------------------------------------------------------------------- 157 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch) 158 | % ------------------------------------------------------------------------- 159 | 160 | % batch refers to videos (not for frames) 161 | if isempty(batch) 162 | inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; 163 | return; 164 | end 165 | 166 | isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; 167 | 168 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end 169 | 170 | names = imdb.images.names(batch); 171 | 172 | 173 | % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; 174 | 175 | namesM = {}; 176 | nVids = numel(batch); 177 | 178 | VideoId1 = []; 179 | VideoId2 = []; 180 | 181 | % step-size 182 | stepSize = 6; 183 | 184 | % pool nFrames into a dynamic image 185 | nFrames = 10; 186 | % number of dynamic images to be max pooled later 187 | nDynImgs = opts.numDynImgs ; 188 | opts = rmfield(opts,'numDynImgs') ; 189 | 190 | 191 | c1 = 1; 192 | for v=1:nVids 193 | 194 | name = names{v}; 195 | nFrms = numel(name)/2; 196 | 197 | nSample = nFrames; 198 | 199 | if isVal 200 | startF = 1 ; 201 | else 202 | startF = ceil(stepSize/2) ; 203 | end 204 | nr = numel(startF:stepSize:nFrms); 205 | 206 | % jitter by removing 50 % and limit a batch to nMaxs * nSamples images 207 | if nr > 1 && (~isVal && nr>nDynImgs) 208 | rat = min(nDynImgs,ceil(0.50*nr)); 209 | ri = randperm(nr); 210 | ri = ri(1:rat); 211 | r = zeros(1,nr); 212 | r(ri) = 1; 213 | else 214 | r = ones(1,nr); 215 | end 216 | 217 | c3 = 1; 218 | c2 = 0; 219 | 220 | for f=startF:stepSize:nFrms 221 | if r(c3) 222 | idx = f:min(f+nSample-1,nFrms) ; 223 | if numel(idx) 0 248 | if useGpu 249 | im = gpuArray(im) ; 250 | end 251 | inputs = {'input', im, 'label', imdb.images.label(batch), ... 252 | 'VideoId1', VideoId1, 'VideoId2', VideoId2}; 253 | 254 | end 255 | -------------------------------------------------------------------------------- /dicnn/cnn_dicnn_rgb.m: -------------------------------------------------------------------------------- 1 | function [net, info] = cnn_dicnn_rgb(varargin) 2 | %CNN_DICNN_RGB Fine-tunes a pre-trained CNN with dynamic images on RGB frames 3 | % (DI in pami journal) on UCF101 dataset 4 | 5 | 6 | run(fullfile(fileparts(mfilename('fullpath')), ... 7 | '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; 8 | 9 | run(fullfile(fileparts(mfilename('fullpath')), ... 10 | '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ; 11 | 12 | run(fullfile(fileparts(mfilename('fullpath')), ... 13 | '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ; 14 | 15 | addpath Layers Datasets 16 | 17 | opts.dataDir = fullfile('data','UCF101') ; 18 | opts.expDir = fullfile('exp', 'UCF101') ; 19 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat'); 20 | opts.datasetFn = @cnn_ucf101_setup_data ; 21 | opts.networkFn = @cnn_init_resnext ; 22 | opts.network = [] ; 23 | 24 | [opts, varargin] = vl_argparse(opts, varargin) ; 25 | 26 | opts.numFetchThreads = 8 ; 27 | 28 | opts.lite = false ; 29 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat'); 30 | opts.pool1Layer = 'conv0'; % before conv1 31 | opts.pool1Type = 'arpool'; 32 | opts.pool2Layer = 'pool5'; 33 | opts.pool2Type = 'maxpool'; 34 | opts.DropOutRate = 0.5 ; 35 | opts.epochFactor = 5 ; 36 | 37 | opts.split = 1; % data split 38 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] 39 | opts.train = struct() ; 40 | opts.train.gpus = []; 41 | opts.train.batchSize = 128 ; 42 | opts.train.numSubBatches = 16 ; 43 | opts.train.solver = [] ; 44 | opts.train.prefetch = true ; 45 | opts.train.numEpochs = 30 ; 46 | opts.train.randomSeed = 0 ; 47 | % resnet50 48 | % opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; 49 | % caffe-ref 50 | opts.train.learningRate = 1e-3 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; 51 | 52 | opts = vl_argparse(opts, varargin) ; 53 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end 54 | % opts.train.numEpochs = numel(opts.train.learningRate); 55 | 56 | % ------------------------------------------------------------------------- 57 | % Prepare data 58 | % ------------------------------------------------------------------------- 59 | 60 | if exist(opts.imdbPath,'file') 61 | imdb = load(opts.imdbPath) ; 62 | else 63 | imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; 64 | mkdir(opts.expDir) ; 65 | save(opts.imdbPath, '-struct', 'imdb') ; 66 | end 67 | 68 | % UCF101 has 3 data splits 69 | if opts.split>3 70 | error('split should be <=3'); 71 | end 72 | imdb.images.set = imdb.images.sets(opts.split,:); 73 | 74 | % reverse frame order 75 | if opts.reverseDyn 76 | for i=1:numel(imdb.images.names) 77 | imdb.images.names{i} = imdb.images.names{i}(end:-1:1); 78 | end 79 | end 80 | 81 | % ------------------------------------------------------------------------- 82 | % Prepare model 83 | % ------------------------------------------------------------------------- 84 | if isempty(opts.network) 85 | net = load(opts.modelPath); 86 | if isfield(net,'net') 87 | net = net.net; 88 | end 89 | opts.nCls = max(imdb.images.label) ; 90 | net = opts.networkFn(net,opts); 91 | 92 | if numel(net.meta.normalization.averageImage)>3 93 | sz = size(net.meta.normalization.averageImage) ; 94 | net.meta.normalization.averageImage = ... 95 | mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ; 96 | end 97 | 98 | % Set the class names in the network 99 | net.meta.classes.name = imdb.classes.name ; 100 | net.meta.classes.description = imdb.classes.name ; 101 | else 102 | assert(isa(opts.network,'dagnn.DagNN')) ; 103 | net = opts.network ; 104 | end 105 | % ------------------------------------------------------------------------- 106 | % Learn 107 | % ------------------------------------------------------------------------- 108 | if opts.epochFactor>0 109 | opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; 110 | else 111 | opts.train.train = NaN ; 112 | opts.train.numEpochs = 1 ; 113 | end 114 | opts.train.val = find(imdb.images.set==3) ; 115 | 116 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 117 | 'expDir', opts.expDir, ... 118 | opts.train) ; 119 | 120 | % ------------------------------------------------------------------------- 121 | % Report accuracy 122 | % ------------------------------------------------------------------------- 123 | errlayer = net.getLayerIndex('errMC') ; 124 | 125 | if ~isnan(errlayer) 126 | cats = imdb.classes.name ; 127 | accs = net.layers(errlayer).block.accuracy ; 128 | 129 | if numel(cats)~=numel(accs) 130 | error('wrong number of classes\n') ; 131 | end 132 | 133 | for i=1:numel(cats) 134 | fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; 135 | end 136 | fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; 137 | end 138 | 139 | % ------------------------------------------------------------------------- 140 | function fn = getBatchFn(opts, meta) 141 | % ------------------------------------------------------------------------- 142 | useGpu = numel(opts.train.gpus) > 0 ; 143 | 144 | bopts.numThreads = opts.numFetchThreads ; 145 | bopts.imageSize = meta.normalization.imageSize ; 146 | if isfield(meta.normalization,'border') 147 | bopts.border = meta.normalization.border ; 148 | else 149 | bopts.border = meta.normalization.imageSize(1:2) ./ ... 150 | meta.normalization.cropSize - meta.normalization.imageSize(1:2); 151 | 152 | end 153 | 154 | % bopts.averageImage = []; 155 | bopts.averageImage = meta.normalization.averageImage ; 156 | bopts.interpolation = meta.normalization.interpolation ; 157 | bopts.keepAspect = meta.normalization.keepAspect ; 158 | % bopts.rgbVariance = meta.augmentation.rgbVariance ; 159 | % bopts.transformation = meta.augmentation.transformation ; 160 | 161 | 162 | fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; 163 | 164 | 165 | 166 | % ------------------------------------------------------------------------- 167 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch) 168 | % ------------------------------------------------------------------------- 169 | 170 | % batch refers to videos (not for frames) 171 | if isempty(batch) 172 | inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; 173 | return; 174 | end 175 | 176 | isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; 177 | 178 | % if ~isVal, transformation='stretch'; else transformation='none';end 179 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end 180 | 181 | names = imdb.images.names(batch); 182 | 183 | 184 | % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; 185 | 186 | namesM = {}; 187 | nVids = numel(batch); 188 | 189 | VideoId1 = []; 190 | VideoId2 = []; 191 | 192 | % step-size 193 | stepSize = 6; 194 | 195 | % pool nFrames into a dynamic image 196 | nFrames = 10; 197 | % number of dynamic images to be max pooled later 198 | nDynImgs = 10; 199 | 200 | 201 | c1 = 1; 202 | for v=1:nVids 203 | 204 | name = names{v}; 205 | 206 | if isVal 207 | startF = 1 ; 208 | else 209 | startF = ceil(stepSize/2) ; 210 | end 211 | 212 | nFrms = numel(name); 213 | 214 | nSample = nFrames; 215 | nr = numel(startF:stepSize:nFrms); 216 | 217 | % jitter by removing 50 % and limit a batch to nMaxs * nSamples images 218 | if nr > 1 && (~isVal && nr>nDynImgs) 219 | rat = min(nDynImgs,ceil(0.50*nr)); 220 | ri = randperm(nr); 221 | ri = ri(1:rat); 222 | r = zeros(1,nr); 223 | r(ri) = 1; 224 | else 225 | if nr>2*nDynImgs 226 | rat = 2*nDynImgs; 227 | ri = randperm(nr); 228 | ri = ri(1:rat); 229 | r = zeros(1,nr); 230 | r(ri) = 1; 231 | else 232 | r = ones(1,nr); 233 | end 234 | end 235 | 236 | c3 = 1; 237 | c2 = 0; 238 | 239 | for f=startF:stepSize:nFrms 240 | if r(c3) 241 | idx = f:min(f+nSample-1,nFrms) ; 242 | if numel(idx) 0 261 | if useGpu 262 | im = gpuArray(im) ; 263 | end 264 | inputs = {'input', im, 'label', imdb.images.label(batch), ... 265 | 'VideoId1', VideoId1, 'VideoId2', VideoId2}; 266 | end 267 | -------------------------------------------------------------------------------- /dicnn/cnn_train_dicnn_dag.m: -------------------------------------------------------------------------------- 1 | function [net,stats] = cnn_train_dicnn_dag(net, imdb, getBatch, varargin) 2 | %CNN_DICNN_TRAIN_DAG Demonstrates training a CNN using the DagNN wrapper 3 | % CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with 4 | % the DagNN wrapper instead of the SimpleNN wrapper. 5 | 6 | % Copyright (C) 2014-16 Andrea Vedaldi. 7 | % All rights reserved. 8 | % 9 | % This file is part of the VLFeat library and is made available under 10 | % the terms of the BSD license (see the COPYING file). 11 | addpath(fullfile(vl_rootnn, 'examples')); 12 | 13 | opts.expDir = fullfile('data','exp') ; 14 | opts.continue = true ; 15 | opts.batchSize = 256 ; 16 | opts.numSubBatches = 1 ; 17 | opts.train = [] ; 18 | opts.val = [] ; 19 | opts.gpus = [] ; 20 | opts.prefetch = false ; 21 | opts.epochSize = inf; 22 | opts.numEpochs = 300 ; 23 | opts.learningRate = 0.001 ; 24 | opts.weightDecay = 0.0005 ; 25 | 26 | opts.solver = [] ; % Empty array means use the default SGD solver 27 | [opts, varargin] = vl_argparse(opts, varargin) ; 28 | if ~isempty(opts.solver) 29 | assert(isa(opts.solver, 'function_handle') && nargout(opts.solver) == 2,... 30 | 'Invalid solver; expected a function handle with two outputs.') ; 31 | % Call without input arguments, to get default options 32 | opts.solverOpts = opts.solver() ; 33 | end 34 | 35 | opts.momentum = 0.9 ; 36 | opts.saveSolverState = true ; 37 | opts.nesterovUpdate = false ; 38 | opts.randomSeed = 0 ; 39 | opts.profile = false ; 40 | opts.parameterServer.method = 'mmap' ; 41 | opts.parameterServer.prefix = 'mcn' ; 42 | 43 | opts.derOutputs = {'objective', 1} ; 44 | opts.extractStatsFn = @extractStats ; 45 | opts.plotStatistics = true; 46 | opts.postEpochFn = [] ; % postEpochFn(net,params,state) called after each epoch; can return a new learning rate, 0 to stop, [] for no change 47 | opts = vl_argparse(opts, varargin) ; 48 | 49 | if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end 50 | if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end 51 | if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end 52 | if isscalar(opts.train) && isnumeric(opts.train) && isnan(opts.train) 53 | opts.train = [] ; 54 | end 55 | if isscalar(opts.val) && isnumeric(opts.val) && isnan(opts.val) 56 | opts.val = [] ; 57 | end 58 | 59 | % ------------------------------------------------------------------------- 60 | % Initialization 61 | % ------------------------------------------------------------------------- 62 | 63 | evaluateMode = isempty(opts.train) ; 64 | if ~evaluateMode 65 | if isempty(opts.derOutputs) 66 | error('DEROUTPUTS must be specified when training.\n') ; 67 | end 68 | end 69 | 70 | % ------------------------------------------------------------------------- 71 | % Train and validate 72 | % ------------------------------------------------------------------------- 73 | 74 | modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep)); 75 | modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ; 76 | 77 | start = opts.continue * findLastCheckpoint(opts.expDir) ; 78 | if start >= 1 79 | fprintf('%s: resuming by loading epoch %d\n', mfilename, start) ; 80 | [net, state, stats] = loadState(modelPath(start)) ; 81 | else 82 | state = [] ; 83 | end 84 | 85 | for epoch=start+1:opts.numEpochs 86 | 87 | % Set the random seed based on the epoch and opts.randomSeed. 88 | % This is important for reproducibility, including when training 89 | % is restarted from a checkpoint. 90 | 91 | rng(epoch + opts.randomSeed) ; 92 | prepareGPUs(opts, epoch == start+1) ; 93 | 94 | % Train for one epoch. 95 | params = opts ; 96 | params.epoch = epoch ; 97 | params.learningRate = opts.learningRate(min(epoch, numel(opts.learningRate))) ; 98 | params.train = opts.train(randperm(numel(opts.train))) ; % shuffle 99 | params.train = params.train(1:min(opts.epochSize, numel(opts.train))); 100 | params.val = opts.val(randperm(numel(opts.val))) ; 101 | params.imdb = imdb ; 102 | params.getBatch = getBatch ; 103 | 104 | if numel(opts.gpus) <= 1 105 | [net, state] = processEpoch(net, state, params, 'train') ; 106 | [net, state] = processEpoch(net, state, params, 'val') ; 107 | if ~evaluateMode 108 | saveState(modelPath(epoch), net, state) ; 109 | end 110 | lastStats = state.stats ; 111 | else 112 | spmd 113 | [net, state] = processEpoch(net, state, params, 'train') ; 114 | [net, state] = processEpoch(net, state, params, 'val') ; 115 | if labindex == 1 && ~evaluateMode 116 | saveState(modelPath(epoch), net, state) ; 117 | end 118 | lastStats = state.stats ; 119 | end 120 | lastStats = accumulateStats(lastStats) ; 121 | end 122 | 123 | stats.train(epoch) = lastStats.train ; 124 | stats.val(epoch) = lastStats.val ; 125 | clear lastStats ; 126 | saveStats(modelPath(epoch), stats) ; 127 | 128 | if opts.plotStatistics 129 | switchFigure(1) ; clf ; 130 | plots = setdiff(... 131 | cat(2,... 132 | fieldnames(stats.train)', ... 133 | fieldnames(stats.val)'), {'num', 'time'}) ; 134 | for p = plots 135 | p = char(p) ; 136 | values = zeros(0, epoch) ; 137 | leg = {} ; 138 | for f = {'train', 'val'} 139 | f = char(f) ; 140 | if isfield(stats.(f), p) 141 | tmp = [stats.(f).(p)] ; 142 | values(end+1,:) = tmp(1,:)' ; 143 | leg{end+1} = f ; 144 | end 145 | end 146 | subplot(1,numel(plots),find(strcmp(p,plots))) ; 147 | plot(1:epoch, values','o-') ; 148 | xlabel('epoch') ; 149 | title(p) ; 150 | legend(leg{:}) ; 151 | grid on ; 152 | end 153 | drawnow ; 154 | print(1, modelFigPath, '-dpdf') ; 155 | end 156 | 157 | if ~isempty(opts.postEpochFn) 158 | if nargout(opts.postEpochFn) == 0 159 | opts.postEpochFn(net, params, state) ; 160 | else 161 | lr = opts.postEpochFn(net, params, state) ; 162 | if ~isempty(lr), opts.learningRate = lr; end 163 | if opts.learningRate == 0, break; end 164 | end 165 | end 166 | end 167 | 168 | % With multiple GPUs, return one copy 169 | if isa(net, 'Composite'), net = net{1} ; end 170 | 171 | % ------------------------------------------------------------------------- 172 | function [net, state] = processEpoch(net, state, params, mode) 173 | % ------------------------------------------------------------------------- 174 | % Note that net is not strictly needed as an output argument as net 175 | % is a handle class. However, this fixes some aliasing issue in the 176 | % spmd caller. 177 | 178 | % initialize with momentum 0 179 | if isempty(state) || isempty(state.solverState) 180 | state.solverState = cell(1, numel(net.params)) ; 181 | state.solverState(:) = {0} ; 182 | end 183 | 184 | % move CNN to GPU as needed 185 | numGpus = numel(params.gpus) ; 186 | if numGpus >= 1 187 | net.move('gpu') ; 188 | for i = 1:numel(state.solverState) 189 | s = state.solverState{i} ; 190 | if isnumeric(s) 191 | state.solverState{i} = gpuArray(s) ; 192 | elseif isstruct(s) 193 | state.solverState{i} = structfun(@gpuArray, s, 'UniformOutput', false) ; 194 | end 195 | end 196 | end 197 | if numGpus > 1 198 | parserv = ParameterServer(params.parameterServer) ; 199 | net.setParameterServer(parserv) ; 200 | else 201 | parserv = [] ; 202 | end 203 | 204 | % profile 205 | if params.profile 206 | if numGpus <= 1 207 | profile clear ; 208 | profile on ; 209 | else 210 | mpiprofile reset ; 211 | mpiprofile on ; 212 | end 213 | end 214 | 215 | num = 0 ; 216 | epoch = params.epoch ; 217 | subset = params.(mode) ; 218 | adjustTime = 0 ; 219 | 220 | stats.num = 0 ; % return something even if subset = [] 221 | stats.time = 0 ; 222 | 223 | start = tic ; 224 | for t=1:params.batchSize:numel(subset) 225 | fprintf('%s: epoch %02d: %3d/%3d:', mode, epoch, ... 226 | fix((t-1)/params.batchSize)+1, ceil(numel(subset)/params.batchSize)) ; 227 | batchSize = min(params.batchSize, numel(subset) - t + 1) ; 228 | 229 | for s=1:params.numSubBatches 230 | % get this image batch and prefetch the next 231 | batchStart = t + (labindex-1) + (s-1) * numlabs ; 232 | batchEnd = min(t+params.batchSize-1, numel(subset)) ; 233 | batch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ; 234 | num = num + numel(batch) ; 235 | if numel(batch) == 0, continue ; end 236 | 237 | inputs = params.getBatch(params.imdb, batch) ; 238 | 239 | if params.prefetch 240 | if s == params.numSubBatches 241 | batchStart = t + (labindex-1) + params.batchSize ; 242 | batchEnd = min(t+2*params.batchSize-1, numel(subset)) ; 243 | else 244 | batchStart = batchStart + numlabs ; 245 | end 246 | nextBatch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ; 247 | params.getBatch(params.imdb, nextBatch) ; 248 | end 249 | 250 | if strcmp(mode, 'train') 251 | net.mode = 'normal' ; 252 | net.accumulateParamDers = (s ~= 1) ; 253 | net.eval(inputs, params.derOutputs, 'holdOn', s < params.numSubBatches) ; 254 | else 255 | net.mode = 'test' ; 256 | net.eval(inputs) ; 257 | end 258 | end 259 | 260 | % Accumulate gradient. 261 | if strcmp(mode, 'train') 262 | if ~isempty(parserv), parserv.sync() ; end 263 | state = accumulateGradients(net, state, params, parserv) ; 264 | end 265 | 266 | % Get statistics. 267 | time = toc(start) + adjustTime ; 268 | batchTime = time - stats.time ; 269 | stats.num = num ; 270 | stats.time = time ; 271 | stats = params.extractStatsFn(stats,net) ; 272 | currentSpeed = batchSize / batchTime ; 273 | averageSpeed = (t + batchSize - 1) / time ; 274 | if t == 3*params.batchSize + 1 275 | % compensate for the first three iterations, which are outliers 276 | adjustTime = 4*batchTime - time ; 277 | stats.time = time + adjustTime ; 278 | end 279 | 280 | fprintf(' %.1f (%.1f) Hz', averageSpeed, currentSpeed) ; 281 | for f = setdiff(fieldnames(stats)', {'num', 'time'}) 282 | f = char(f) ; 283 | fprintf(' %s: %.3f', f, stats.(f)) ; 284 | end 285 | fprintf('\n') ; 286 | end 287 | 288 | % Save back to state. 289 | state.stats.(mode) = stats ; 290 | if params.profile 291 | if numGpus <= 1 292 | state.prof.(mode) = profile('info') ; 293 | profile off ; 294 | else 295 | state.prof.(mode) = mpiprofile('info'); 296 | mpiprofile off ; 297 | end 298 | end 299 | if ~params.saveSolverState 300 | state.solverState = [] ; 301 | else 302 | for i = 1:numel(state.solverState) 303 | s = state.solverState{i} ; 304 | if isnumeric(s) 305 | state.solverState{i} = gather(s) ; 306 | elseif isstruct(s) 307 | state.solverState{i} = structfun(@gather, s, 'UniformOutput', false) ; 308 | end 309 | end 310 | end 311 | 312 | net.reset() ; 313 | net.move('cpu') ; 314 | 315 | % ------------------------------------------------------------------------- 316 | function state = accumulateGradients(net, state, params, parserv) 317 | % ------------------------------------------------------------------------- 318 | numGpus = numel(params.gpus) ; 319 | otherGpus = setdiff(1:numGpus, labindex) ; 320 | 321 | den = params.numSubBatches * max(numGpus,1) ; 322 | 323 | for p=1:numel(net.params) 324 | 325 | if ~isempty(parserv) 326 | parDer = parserv.pullWithIndex(p) ; 327 | else 328 | parDer = net.params(p).der ; 329 | end 330 | 331 | switch net.params(p).trainMethod 332 | 333 | case 'average' % mainly for batch normalization 334 | thisLR = net.params(p).learningRate ; 335 | net.params(p).value = vl_taccum(... 336 | 1 - thisLR, net.params(p).value, ... 337 | (thisLR/den/net.params(p).fanout), parDer) ; 338 | 339 | case 'gradient' 340 | thisDecay = params.weightDecay * net.params(p).weightDecay ; 341 | thisLR = params.learningRate * net.params(p).learningRate ; 342 | 343 | if thisLR>0 || thisDecay>0 344 | % Normalize gradient and incorporate weight decay. 345 | parDer = vl_taccum(1/den, parDer, ... 346 | thisDecay, net.params(p).value) ; 347 | 348 | if isempty(params.solver) 349 | % Default solver is the optimised SGD. 350 | % Update momentum. 351 | state.solverState{p} = vl_taccum(... 352 | params.momentum, state.solverState{p}, ... 353 | -1, parDer) ; 354 | 355 | % Nesterov update (aka one step ahead). 356 | if params.nesterovUpdate 357 | delta = params.momentum * state.solverState{p} - parDer ; 358 | else 359 | delta = state.solverState{p} ; 360 | end 361 | 362 | % Update parameters. 363 | net.params(p).value = vl_taccum(... 364 | 1, net.params(p).value, thisLR, delta) ; 365 | 366 | else 367 | % call solver function to update weights 368 | [net.params(p).value, state.solverState{p}] = ... 369 | params.solver(net.params(p).value, state.solverState{p}, ... 370 | parDer, params.solverOpts, thisLR) ; 371 | end 372 | end 373 | otherwise 374 | error('Unknown training method ''%s'' for parameter ''%s''.', ... 375 | net.params(p).trainMethod, ... 376 | net.params(p).name) ; 377 | end 378 | end 379 | 380 | % ------------------------------------------------------------------------- 381 | function stats = accumulateStats(stats_) 382 | % ------------------------------------------------------------------------- 383 | 384 | for s = {'train', 'val'} 385 | s = char(s) ; 386 | total = 0 ; 387 | 388 | % initialize stats stucture with same fields and same order as 389 | % stats_{1} 390 | stats__ = stats_{1} ; 391 | names = fieldnames(stats__.(s))' ; 392 | values = zeros(1, numel(names)) ; 393 | fields = cat(1, names, num2cell(values)) ; 394 | stats.(s) = struct(fields{:}) ; 395 | 396 | for g = 1:numel(stats_) 397 | stats__ = stats_{g} ; 398 | num__ = stats__.(s).num ; 399 | total = total + num__ ; 400 | 401 | for f = setdiff(fieldnames(stats__.(s))', 'num') 402 | f = char(f) ; 403 | stats.(s).(f) = stats.(s).(f) + stats__.(s).(f) * num__ ; 404 | 405 | if g == numel(stats_) 406 | stats.(s).(f) = stats.(s).(f) / total ; 407 | end 408 | end 409 | end 410 | stats.(s).num = total ; 411 | end 412 | 413 | % ------------------------------------------------------------------------- 414 | function stats = extractStats(stats, net) 415 | % ------------------------------------------------------------------------- 416 | sel = find(cellfun(@(x) isa(x,'dagnn.Loss'), {net.layers.block})) ; 417 | for i = 1:numel(sel) 418 | if net.layers(sel(i)).block.ignoreAverage, continue; end; 419 | stats.(net.layers(sel(i)).outputs{1}) = net.layers(sel(i)).block.average ; 420 | end 421 | 422 | % ------------------------------------------------------------------------- 423 | function saveState(fileName, net_, state) 424 | % ------------------------------------------------------------------------- 425 | net = net_.saveobj() ; 426 | save(fileName, 'net', 'state') ; 427 | 428 | % ------------------------------------------------------------------------- 429 | function saveStats(fileName, stats) 430 | % ------------------------------------------------------------------------- 431 | if exist(fileName) 432 | save(fileName, 'stats', '-append') ; 433 | else 434 | save(fileName, 'stats') ; 435 | end 436 | 437 | % ------------------------------------------------------------------------- 438 | function [net, state, stats] = loadState(fileName) 439 | % ------------------------------------------------------------------------- 440 | load(fileName, 'net', 'state', 'stats') ; 441 | net = dagnn.DagNN.loadobj(net) ; 442 | if isempty(whos('stats')) 443 | error('Epoch ''%s'' was only partially saved. Delete this file and try again.', ... 444 | fileName) ; 445 | end 446 | 447 | % ------------------------------------------------------------------------- 448 | function epoch = findLastCheckpoint(modelDir) 449 | % ------------------------------------------------------------------------- 450 | list = dir(fullfile(modelDir, 'net-epoch-*.mat')) ; 451 | tokens = regexp({list.name}, 'net-epoch-([\d]+).mat', 'tokens') ; 452 | epoch = cellfun(@(x) sscanf(x{1}{1}, '%d'), tokens) ; 453 | epoch = max([epoch 0]) ; 454 | 455 | % ------------------------------------------------------------------------- 456 | function switchFigure(n) 457 | % ------------------------------------------------------------------------- 458 | if get(0,'CurrentFigure') ~= n 459 | try 460 | set(0,'CurrentFigure',n) ; 461 | catch 462 | figure(n) ; 463 | end 464 | end 465 | 466 | % ------------------------------------------------------------------------- 467 | function clearMex() 468 | % ------------------------------------------------------------------------- 469 | clear vl_tmove vl_imreadjpeg ; 470 | 471 | % ------------------------------------------------------------------------- 472 | function prepareGPUs(opts, cold) 473 | % ------------------------------------------------------------------------- 474 | numGpus = numel(opts.gpus) ; 475 | if numGpus > 1 476 | % check parallel pool integrity as it could have timed out 477 | pool = gcp('nocreate') ; 478 | if ~isempty(pool) && pool.NumWorkers ~= numGpus 479 | delete(pool) ; 480 | end 481 | pool = gcp('nocreate') ; 482 | if isempty(pool) 483 | parpool('local', numGpus) ; 484 | cold = true ; 485 | end 486 | 487 | end 488 | if numGpus >= 1 && cold 489 | fprintf('%s: resetting GPU\n', mfilename) 490 | clearMex() ; 491 | if numGpus == 1 492 | gpuDevice(opts.gpus) 493 | else 494 | spmd 495 | clearMex() ; 496 | gpuDevice(opts.gpus(labindex)) 497 | end 498 | end 499 | end 500 | --------------------------------------------------------------------------------