├── .gitmodules
├── dicnn
    ├── visualize_approximate_dynamic_images.m
    ├── compute_approximate_dynamic_images.m
    ├── cnn_init_cafferef.m
    ├── cnn_video_rgb_get_batch.m
    ├── cnn_init_resnext.m
    ├── cnn_video_of_get_batch.m
    ├── cnn_single_of.m
    ├── cnn_single_rgb.m
    ├── cnn_dicnn_of.m
    ├── cnn_dicnn_rgb.m
    └── cnn_train_dicnn_dag.m
├── utils
    └── extract_frames.sh
├── Layers
    ├── L2Normalize.m
    ├── TemporalPooling.m
    ├── AppRankPooling.m
    ├── vl_nnpooltemporal.m
    ├── vl_nnarpooltemporal.m
    ├── vl_nnl2norm.m
    ├── ErrorMultiClass.m
    ├── LossNormalized.m
    └── BatchNormN.m
├── main_train.m
├── Datasets
    ├── cnn_hmdb51_setup_data.m
    ├── cnn_hmdb51_of_setup_data.m
    ├── cnn_ucf101_setup_data.m
    └── cnn_ucf101_of_setup_data.m
└── README.md


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "matconvnet"]
2 | 	path = matconvnet
3 | 	url = https://github.com/vlfeat/matconvnet
4 | 	branch = master
5 | 


--------------------------------------------------------------------------------
/dicnn/visualize_approximate_dynamic_images.m:
--------------------------------------------------------------------------------
1 | function visualize_approximate_dynamic_images(images)
2 | % VISUALIZE_APPROXIMATE_DYNAMIC_IMAGES  Display an approximate dynamic image
3 | %   Rescales the output of COMPUTE_APPROXIMATE_DYNAMIC_IMAGES(IMAGES) so
4 | %   that its minimum maps to 0 and its maximum to 255, then calls IMAGE.
5 | 
6 | dynImg = compute_approximate_dynamic_images(images) ;
7 | shifted = dynImg - min(dynImg(:)) ;                 % minimum becomes 0
8 | stretched = 255 * shifted ./ max(shifted(:)) ;      % maximum becomes 255
9 | image(uint8(stretched)) ;


--------------------------------------------------------------------------------
/utils/extract_frames.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Convert every .avi video in the current directory into JPEG frames
 4 | # stored under frames/<video name>/.
 5 | # To sample at a different frame rate, add an output option such as
 6 | # "-r 1" to the ffmpeg call, before the output pattern.
 7 | # NOTE(review): frames are written as image-NNNN.jpeg, whereas the dataset
 8 | # setup scripts in this repository list '*.jpg' files - confirm the naming.
 9 | 
10 | for f in *.avi
11 | do
12 |   # without any .avi file the pattern stays literal; skip it
13 |   [ -e "$f" ] || continue
14 |   # strip the .avi suffix to obtain the per-video folder name
15 |   g="${f%.avi}"
16 |   echo "Processing $f"
17 |   mkdir -p "frames/$g/"
18 |   ffmpeg -i "$f" "frames/$g/image-%04d.jpeg"
19 | done
20 | 


--------------------------------------------------------------------------------
/Layers/L2Normalize.m:
--------------------------------------------------------------------------------
 1 | classdef L2Normalize < dagnn.ElementWise
 2 |   % L2Normalize  dagnn wrapper around vl_nnl2norm
 3 |   % author: Hakan Bilen
 4 |   %
 5 |   % The layer properties are packed as [scale clip offset] and handed to
 6 |   % vl_nnl2norm in both the forward and the backward pass.
 7 | 
 8 |   properties
 9 |     scale = 1;
10 |     clip = [-inf inf];
11 |     offset = 0;
12 |   end
13 | 
14 |   methods
15 |     function obj = L2Normalize(varargin)
16 |       obj.load(varargin) ;
17 |     end
18 | 
19 |     function outputs = forward(obj, inputs, params)
20 |       packed = [obj.scale obj.clip obj.offset] ;
21 |       outputs = {vl_nnl2norm(inputs{1}, packed)} ;
22 |     end
23 | 
24 |     function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
25 |       packed = [obj.scale obj.clip obj.offset] ;
26 |       derParams = {} ;
27 |       derInputs = {vl_nnl2norm(inputs{1}, packed, derOutputs{1})} ;
28 |     end
29 |   end
30 | end
31 | 


--------------------------------------------------------------------------------
/Layers/TemporalPooling.m:
--------------------------------------------------------------------------------
 1 | classdef TemporalPooling < dagnn.ElementWise
 2 |   % TemporalPooling  dagnn wrapper around vl_nnpooltemporal
 3 |   % author: Hakan Bilen
 4 |   %
 5 |   % inputs{1} and inputs{2} are forwarded to vl_nnpooltemporal as its
 6 |   % first two arguments, followed by the 'method' property.
 7 | 
 8 |   properties
 9 |     method = 'max';
10 |   end
11 | 
12 |   methods
13 |     function obj = TemporalPooling(varargin)
14 |       obj.load(varargin) ;
15 |     end
16 | 
17 |     function outputs = forward(obj, inputs, params)
18 |       [data, ids] = deal(inputs{1}, inputs{2}) ;
19 |       outputs = {vl_nnpooltemporal(data, ids, obj.method)} ;
20 |     end
21 | 
22 |     function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
23 |       [data, ids] = deal(inputs{1}, inputs{2}) ;
24 |       derParams = {} ;
25 |       % no derivative is produced for the second input
26 |       derInputs = {vl_nnpooltemporal(data, ids, obj.method, derOutputs{1}), []} ;
27 |     end
28 |   end
29 | end
30 | 


--------------------------------------------------------------------------------
/dicnn/compute_approximate_dynamic_images.m:
--------------------------------------------------------------------------------
 1 | function di = compute_approximate_dynamic_images(images)
 2 | % COMPUTE_APPROXIMATE_DYNAMIC_IMAGES  Approximate dynamic image of a clip
 3 | %   IMAGES is either an H x W x D x N array of frames or a cell array of
 4 | %   image file names, which are read with IMREAD and stacked along the
 5 | %   4th dimension. All frames are pooled as a single video with
 6 | %   VL_NNARPOOLTEMPORAL. An empty input returns [].
 7 | %
 8 | %   For the exact dynamic images, use the code
 9 | %   http://users.cecs.anu.edu.au/~basura/dynamic_images/code.zip
10 | %   Explained here http://arxiv.org/abs/1512.01848
11 | 
12 | if isempty(images)
13 |   di = [] ;
14 |   return ;
15 | end
16 | 
17 | if iscell(images)
18 |   nFiles = numel(images) ;
19 |   loaded = cell(1, nFiles) ;
20 |   for k = 1:nFiles
21 |     fileName = images{k} ;
22 |     if ischar(fileName)
23 |       loaded{k} = imread(fileName) ;
24 |     else
25 |       error('images must be an array of images or cell of image names') ;
26 |     end
27 |   end
28 |   images = cat(4, loaded{:}) ;
29 | end
30 | 
31 | % every frame gets video id 1, i.e. the whole stack is one video
32 | frames = single(images) ;
33 | di = vl_nnarpooltemporal(frames, ones(1, size(frames, 4))) ;
34 | 


--------------------------------------------------------------------------------
/Layers/AppRankPooling.m:
--------------------------------------------------------------------------------
 1 | classdef AppRankPooling < dagnn.ElementWise
 2 |   % AppRankPooling  dagnn wrapper for approximate rank pooling
 3 |   % author: Hakan Bilen
 4 |   %
 5 |   % Calls vl_nnarpooltemporal on inputs{1} and inputs{2} and multiplies
 6 |   % the result by 'scale' in both the forward and the backward pass.
 7 | 
 8 |   properties
 9 |     scale = 1 
10 |   end
11 | 
12 |   methods
13 |     function obj = AppRankPooling(varargin)
14 |       obj.load(varargin) ;
15 |     end
16 | 
17 |     function outputs = forward(obj, inputs, params)
18 |       pooled = vl_nnarpooltemporal(inputs{1}, inputs{2}) ;
19 |       outputs = {pooled * obj.scale} ;
20 |     end
21 | 
22 |     function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
23 |       grad = vl_nnarpooltemporal(inputs{1}, inputs{2}, derOutputs{1}) ;
24 |       derParams = {} ;
25 |       % no derivative is produced for the second input
26 |       derInputs = {grad * obj.scale, []} ;
27 |     end
28 | 
29 |     function outputSizes = getOutputSizes(obj, inputSizes)
30 |       % This is not correct, dim(4) depends on inputs{2}
31 |       outputSizes = {inputSizes{1}} ;
32 |     end
33 |   end
34 | end
35 | 


--------------------------------------------------------------------------------
/Layers/vl_nnpooltemporal.m:
--------------------------------------------------------------------------------
 1 | function Y = vl_nnpooltemporal(X,ids,method,dzdy)
 2 | % VL_NNPOOLTEMPORAL  Temporal pooling along frames
 3 | %   author: Hakan Bilen
 4 | %
 5 | %   Y = VL_NNPOOLTEMPORAL(X, IDS, METHOD) pools the slices X(:,:,:,n)
 6 | %   that share the same entry in IDS (frame-video association). IDS has
 7 | %   one entry per slice; the slices with IDS == v are pooled into
 8 | %   Y(:,:,:,v) for v = 1..max(IDS). METHOD ('max' or 'avg') is forwarded
 9 | %   to VL_NNPOOL.
10 | %
11 | %   DZDX = VL_NNPOOLTEMPORAL(X, IDS, METHOD, DZDY) back-propagates DZDY,
12 | %   which holds one slice per video, and returns an array sized like X.
13 | %
14 | %   An error is raised when numel(IDS) differs from size(X,4) or when a
15 | %   video index in 1..max(IDS) has no frames.
16 | 
17 | forward = (nargin<4);
18 | 
19 | if numel(ids)~=size(X,4)
20 |   error('Error: ids dimension does not match with X!');
21 | end
22 | 
23 | % size(X) omits trailing singleton dimensions, so query the first three
24 | % explicitly instead of indexing size(X) with 1:3
25 | sz = [size(X,1), size(X,2), size(X,3)];
26 | 
27 | % max of an empty array is empty; treat that case as zero videos
28 | if isempty(ids)
29 |   nVideos = 0;
30 | else
31 |   nVideos = max(ids(:));
32 | end
33 | 
34 | % move the frame dimension first so that vl_nnpool can pool over it
35 | Xp = permute(X,[4,1,2,3]);
36 | 
37 | if forward
38 |   Yp = zeros([nVideos,sz],'like',X);
39 |   for v=1:nVideos
40 |     % pool among frames
41 |     indv = find(ids==v);
42 |     if isempty(indv)
43 |       error('Error: No frames in video %d',v);
44 |     end
45 |     Yp(v,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], ...
46 |       'pad', 0, 'stride', [numel(indv),1], 'method', method) ;
47 |   end
48 | else
49 |   dzdyp = permute(dzdy,[4,1,2,3]);
50 |   Yp = zeros(size(Xp),'like',Xp);
51 |   for v=1:nVideos
52 |     % pool among frames
53 |     indv = find(ids==v);
54 |     if isempty(indv)
55 |       error('Error: No frames in video %d',v);
56 |     end
57 |     Yp(indv,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], dzdyp(v,:,:,:), ...
58 |       'pad', 0, 'stride', [numel(indv),1], 'method', method) ;
59 |   end
60 | end
61 | % permute back
62 | Y = permute(Yp,[2,3,4,1]);
63 | 
64 | % if forward
65 | %   fprintf(' fwd-ptemp %.2f ',sqrt(sum(Y(:).^2)));
66 | % else
67 | %   fprintf(' back-ptemp %.2f ',sqrt(sum(Y(:).^2)));
68 | % end
69 | 


--------------------------------------------------------------------------------
/Layers/vl_nnarpooltemporal.m:
--------------------------------------------------------------------------------
 1 | function Y = vl_nnarpooltemporal(X,ids,dzdy)
 2 | % VL_NNARPOOLTEMPORAL  Approximate rank pooling
 3 | %   author: Hakan Bilen
 4 | %
 5 | %   Y = VL_NNARPOOLTEMPORAL(X, IDS) computes, for every video v in
 6 | %   1..max(IDS), a weighted sum of the slices X(:,:,:,n) with IDS(n) == v
 7 | %   and stores it in Y(:,:,:,v). IDS indicates the frame-video
 8 | %   association (must be in range [1-N]) and has one entry per slice.
 9 | %   The weight of the i-th of N frames is sum((2*(i:N)-N-1) ./ (i:N));
10 | %   a video with a single frame uses weight 1.
11 | %
12 | %   DZDX = VL_NNARPOOLTEMPORAL(X, IDS, DZDY) back-propagates DZDY (one
13 | %   slice per video) and returns an array of the same size as X.
14 | %
15 | %   An error is raised when numel(IDS) differs from size(X,4) or when a
16 | %   video index in 1..max(IDS) has no frames.
17 | 
18 | forward = (nargin<3);
19 | 
20 | if numel(ids)~=size(X,4)
21 |   error('Error: ids dimension does not match with X!');
22 | end
23 | 
24 | % max of an empty array is empty; treat that case as zero videos
25 | if isempty(ids)
26 |   nVideos = 0;
27 | else
28 |   nVideos = max(ids(:));
29 | end
30 | 
31 | if forward
32 |   % size(X) omits trailing singleton dimensions, so query the first
33 |   % three explicitly instead of indexing size(X) with 1:3
34 |   Y = zeros([size(X,1),size(X,2),size(X,3),nVideos],'like',X);
35 | else
36 |   Y = zeros(size(X),'like',X);
37 | end
38 | 
39 | for v=1:nVideos
40 |   % pool among frames
41 |   indv = find(ids==v);
42 |   if isempty(indv)
43 |     error('Error: No frames in video %d',v);
44 |   end
45 |   N = numel(indv);
46 |   % magic numbers
47 |   if N==1
48 |     fw = 1;
49 |   else
50 |     fw = zeros(1,N);
51 |     for i=1:N
52 |       fw(i) = sum((2*(i:N)-N-1) ./ (i:N));
53 |     end
54 |   end
55 |   % one weight per frame, laid out along the 4th dimension
56 |   w = reshape(fw,[1 1 1 N]);
57 |   
58 |   if forward
59 |     Y(:,:,:,v) = sum(bsxfun(@times,X(:,:,:,indv),single(w)),4);
60 |   else
61 |     % bsxfun expands the single dzdy slice over the N frames
62 |     Y(:,:,:,indv) = bsxfun(@times,dzdy(:,:,:,v),w);
63 |   end
64 | end
65 | %
66 | % if forward
67 |   %   fprintf(' fwd-arpool %.2f ',sqrt(sum(Y(:).^2)));
68 |   % else
69 |   %   fprintf(' back-arpool %f ',sqrt(sum(Y(:).^2)));
70 | % end
71 | 


--------------------------------------------------------------------------------
/Layers/vl_nnl2norm.m:
--------------------------------------------------------------------------------
 1 | function y = vl_nnl2norm(x,param,dzdy)
 2 | % VL_NNL2NORM  l2 normalize whole feature map
 3 | %   author: Hakan Bilen
 4 | %
 5 | %   Y = VL_NNL2NORM(X, PARAM) adds PARAM(4) (offset) to X, flattens the
 6 | %   first three dimensions of every sample, scales each sample to l2 norm
 7 | %   PARAM(1) (scale) and clamps the result to [PARAM(2), PARAM(3)]
 8 | %   (clip). Y has the size of X.
 9 | %
10 | %   DZDX = VL_NNL2NORM(X, PARAM, DZDY) returns the derivative of the
11 | %   scaled normalization with respect to X; DZDY must have the size of X.
12 | %   NOTE: the backward pass does not take the clipping into account.
13 | 
14 | sc = param(1);
15 | clip = param(2:3);
16 | offset = param(4);
17 | 
18 | if nargin == 3
19 |   % isequal also copes with arrays of different dimensionality
20 |   assert(isequal(size(x), size(dzdy)));
21 | else
22 |   dzdy = [];
23 | end
24 | 
25 | x_sz = size(x);
26 | % Create an array of size #features x #samples. size(x) omits trailing
27 | % singleton dimensions, so each of the first three is queried explicitly.
28 | nFeat = size(x,1) * size(x,2) * size(x,3);
29 | x = reshape(x, nFeat, []);
30 | 
31 | x = x + offset;
32 | 
33 | if isempty(dzdy)
34 |   % sum along dimension 1 explicitly: with a single feature per sample
35 |   % the default would sum across the samples instead
36 |   y = bsxfun(@times, x, sc./(sqrt(sum(x .* x, 1)) + single(1e-12)));
37 |   % clip max values
38 |   if all(y(:)<clip(1) | y(:)>clip(2))
39 |     warning('Too small clipping interval');
40 |     fprintf('min %f max %f\n',min(y(:)),max(y(:)));
41 |   end
42 |   
43 |   y(y(:)<clip(1)) = clip(1);
44 |   y(y(:)>clip(2)) = clip(2);
45 | else
46 |   dzdy = reshape(dzdy, nFeat, []);
47 |   
48 |   % len_ is 1/norm of every sample
49 |   len_ = 1./sqrt(sum(x.*x, 1)+single(1e-12));
50 |   dzdy_ = bsxfun(@times,dzdy,len_.^3);
51 |   y = sc * (bsxfun(@times,dzdy,len_)-bsxfun(@times,x,sum(x.*dzdy_, 1)));
52 | end
53 | 
54 | % restore the input layout
55 | y = reshape(y, x_sz);
56 | % 
57 | % if isempty(dzdy)
58 | %   fprintf(' fwd-l2 %.2f ',sqrt(sum(y(:).^2)));
59 | % else
60 | %   fprintf(' back-l2 %f dzdy %f ',sqrt(sum(y(:).^2)),sqrt(sum(dzdy(:).^2)));
61 | % end
62 | 


--------------------------------------------------------------------------------
/Layers/ErrorMultiClass.m:
--------------------------------------------------------------------------------
 1 | classdef ErrorMultiClass < dagnn.Loss
 2 | % ErrorMultiClass  computes multi-class accuracy
 3 | % author: Hakan Bilen
 4 | % inputs{1}->scores (classes along the 3rd dimension)
 5 | % inputs{2}->gt labels
 6 | %
 7 | % The layer accumulates, per class, the number of samples seen and the
 8 | % number predicted correctly; its output is 1 - mean(per-class accuracy).
 9 | % reset() only raises a flag: the counters are cleared at the start of
10 | % the next forward() call.
11 |   properties
12 |     nImgPerClass = []
13 |     nCorPred = []
14 |     accuracy = []
15 |     resetLayer = false 
16 |   end
17 |     
18 |   methods
19 |     function outputs = forward(obj, inputs, params)
20 |       
21 |       if numel(inputs)~=2
22 |         error('wrong number of inputs');
23 |       end
24 |       
25 |       nCls = size(inputs{1},3);
26 |       
27 |       if obj.resetLayer || isempty(obj.nImgPerClass)
28 |         obj.nImgPerClass = zeros(1,nCls);
29 |         obj.nCorPred = zeros(1,nCls);
30 |         obj.accuracy = zeros(1,nCls);
31 |         
32 |         if obj.resetLayer
33 |           obj.resetLayer = false ;
34 |           obj.average = 0 ;
35 |         end
36 |       end
37 |       
38 |       [~,predictions] = max(gather(squeeze(inputs{1})),[],1);
39 |       % bring predictions and labels to plain row vectors so that the
40 |       % comparisons below stay element-wise whatever shape the labels
41 |       % arrive in (e.g. 1 x N or 1 x 1 x 1 x N)
42 |       predictions = reshape(predictions,1,[]);
43 |       labels = reshape(gather(inputs{2}),1,[]);
44 |       
45 |       for c=1:nCls
46 |         isClass = (labels==c);
47 |         obj.nImgPerClass(c) = obj.nImgPerClass(c) + sum(isClass);
48 |         obj.nCorPred(c)     = obj.nCorPred(c) + sum(predictions==c & isClass);
49 |       end
50 |       
51 |       % classes without samples so far get denominator 1 (accuracy 0)
52 |       ni = obj.nImgPerClass;
53 |       ni(ni==0) = 1;
54 |       
55 |       obj.accuracy = obj.nCorPred ./ ni;
56 |       obj.average = (1-mean(obj.accuracy));
57 |       outputs{1} =  obj.average;
58 |     end
59 |     
60 |     function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
61 |       % the error measure provides no derivatives
62 |       derInputs = cell(1,2);
63 |       derParams = {} ;
64 |     end
65 |     
66 |     function reset(obj)
67 |       obj.resetLayer = true ;
68 | %       obj.nImgPerClass = [];
69 | %       obj.nCorPred = [];
70 | %       obj.accuracy = [];
71 | %       obj.average = 0;
72 |     end
73 |     
74 |     
75 |     function obj = ErrorMultiClass(varargin)
76 |       obj.load(varargin) ;
77 |       obj.loss = 'error_multi_class' ;
78 |     end
79 |   end
80 | end
81 | 


--------------------------------------------------------------------------------
/Layers/LossNormalized.m:
--------------------------------------------------------------------------------
 1 | classdef LossNormalized < dagnn.Loss
 2 | % LossNormalized  dagnn.Loss variant whose output and derivative are
 3 | % divided by the batch size, i.e. the size of inputs{1} along the 4th
 4 | % dimension. The running average is accumulated from the undivided loss.
 5 | %
 6 | % Properties used below but not declared in this class (original
 7 | % commented-out listing kept for reference):
 8 | %   properties
 9 | %     loss = 'softmaxlog'
10 | %     ignoreAverage = false
11 | %     opts = {}
12 | %   end
13 | %   properties (Transient)
14 | %     average = 0
15 | %     numAveraged = 0
16 | %   end
17 | 
18 |   methods
19 |     function obj = LossNormalized(varargin)
20 |       obj.load(varargin) ;
21 |     end
22 | 
23 |     function outputs = forward(obj, inputs, params)
24 |       lossValue = vl_nnloss(inputs{1}, inputs{2}, [], 'loss', obj.loss, obj.opts{:}) ;
25 |       outputs{1} = lossValue ;
26 |       % the average is updated before the division by the batch size
27 |       obj.accumulateAverage(inputs, outputs);
28 |       % size(.,4) is 1 when the array has fewer than four dimensions
29 |       outputs{1} = lossValue / size(inputs{1},4) ;
30 |     end
31 | 
32 |     function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
33 |       batchSize = size(inputs{1},4) ;
34 |       grad = vl_nnloss(inputs{1}, inputs{2}, derOutputs{1}, 'loss', obj.loss, obj.opts{:}) / batchSize;
35 |       derInputs = {grad, []} ;
36 |       derParams = {} ;
37 |     end
38 | 
39 |     function accumulateAverage(obj, inputs, outputs)
40 |       if ~obj.ignoreAverage
41 |         prevCount = obj.numAveraged ;
42 |         scores = inputs{1} ;
43 |         newCount = prevCount + size(scores, 1) *  size(scores, 2) * size(scores, 4);
44 |         obj.average = bsxfun(@plus, prevCount * obj.average, gather(outputs{1})) / newCount ;
45 |         obj.numAveraged = newCount ;
46 |       end
47 |     end
48 | 
49 |     function reset(obj)
50 |       obj.numAveraged = 0 ;
51 |       obj.average = 0 ;
52 |     end
53 | 
54 |     function outputSizes = getOutputSizes(obj, inputSizes, paramSizes)
55 |       outputSizes = {[1 1 1 inputSizes{1}(4)]} ;
56 |     end
57 | 
58 |     function rfs = getReceptiveFields(obj)
59 |       % the receptive field depends on the dimension of the variables
60 |       % which is not known until the network is run
61 |       unknown = [NaN NaN] ;
62 |       rfs(1,1).size = unknown ;
63 |       rfs(1,1).stride = unknown ;
64 |       rfs(1,1).offset = unknown ;
65 |       rfs(2,1) = rfs(1,1) ;
66 |     end
67 |   end
68 | end
69 | 


--------------------------------------------------------------------------------
/Layers/BatchNormN.m:
--------------------------------------------------------------------------------
 1 | classdef BatchNormN < dagnn.ElementWise
 2 |   % BatchNormN  dagnn layer calling vl_nnbnorm
 3 |   %
 4 |   % params{1} and params{2} are passed to vl_nnbnorm as its second and
 5 |   % third arguments; params{3} holds the moments used in 'test' mode.
 6 |   % In the other modes the moments returned by the forward pass are
 7 |   % cached in obj.moments and consumed by the backward pass.
 8 |   properties
 9 |     numChannels
10 |     epsilon = 1e-5
11 |     opts = {'NoCuDNN'} % ours seems slightly faster
12 |   end
13 | 
14 |   properties (Transient)
15 |     moments
16 |   end
17 | 
18 |   methods
19 |     % ---------------------------------------------------------------------
20 |     function obj = BatchNormN(varargin)
21 |       obj.load(varargin{:}) ;
22 |     end
23 | 
24 |     function outputs = forward(obj, inputs, params)
25 |       isTest = strcmp(obj.net.mode, 'test') ;
26 |       if ~isTest
27 |         % keep the batch moments for the backward pass
28 |         [outputs{1},obj.moments] = ...
29 |             vl_nnbnorm(inputs{1}, params{1}, params{2}, ...
30 |                        'epsilon', obj.epsilon, ...
31 |                        obj.opts{:}) ;
32 |       else
33 |         outputs{1} = vl_nnbnorm(inputs{1}, params{1}, params{2}, ...
34 |                                 'moments', params{3}, ...
35 |                                 'epsilon', obj.epsilon, ...
36 |                                 obj.opts{:}) ;
37 |       end
38 |     end
39 | 
40 |     function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
41 |       [dzdx, dzdg, dzdb, dzdm] = ...
42 |         vl_nnbnorm(inputs{1}, params{1}, params{2}, derOutputs{1}, ...
43 |                    'epsilon', obj.epsilon, ...
44 |                    'moments', obj.moments, ...
45 |                    obj.opts{:}) ;
46 |       derInputs = {dzdx} ;
47 |       derParams = {dzdg, dzdb, dzdm} ;
48 |       obj.moments = [] ;
49 |       % multiply the moments update by the number of images in the batch
50 |       % this is required to make the update additive for subbatches
51 |       % and will eventually be normalized away
52 |       % derParams{3} = derParams{3} * size(inputs{1},4) ;
53 |     end
54 | 
55 |     function params = initParams(obj)
56 |       nc = obj.numChannels ;
57 |       params = {ones(nc,1,'single'), zeros(nc,1,'single'), zeros(nc,2,'single')} ;
58 |     end
59 | 
60 |     function attach(obj, net, index)
61 |       attach@dagnn.ElementWise(obj, net, index) ;
62 |       % the third parameter (moments) is averaged rather than trained
63 |       momentsIdx = net.getParamIndex(net.layers(index).params{3}) ;
64 |       net.params(momentsIdx).trainMethod = 'average' ;
65 |       net.params(momentsIdx).learningRate = 0.1 ;
66 |     end
67 |   end
68 | end
69 | 


--------------------------------------------------------------------------------
/main_train.m:
--------------------------------------------------------------------------------
 1 | % MAIN_TRAIN  Configure and launch training of a dynamic image network.
 2 | % Edit the three settings below, then run this script from the
 3 | % repository root (relative paths to matconvnet/, models/ and dicnn/).
 4 | model = 'resnext50' ; % {'cafferef','resnext50','resnext101'}
 5 | input = 'rgb' ; % {'rgb','of'}
 6 | dataset = 'ucf101' ; % {'ucf101','hmdb51'}  hmdb51 requires more iterations to train (add more epochs to learning rate)
 7 | % NOTE(review): 'dataset' is not referenced again in this script - confirm
 8 | % how the training functions select the dataset.
 9 | opts.train.batchSize = 128 ;
10 | opts.train.numSubBatches = 32 ; % increase the number (16,32) if it does not fit into gpu mem 
11 | opts.epochFactor = 5 ;
12 | opts.split = 1 ;
13 | 
14 | opts.train.gpus = 1 ;
15 | 
16 | run matconvnet/matlab/vl_setupnn.m ;
17 | vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ;
18 | vl_contrib install autonn ; vl_contrib setup autonn ;
19 | 
20 | % addpath(fullfile('matconvnet','contrib','mcnExtraLayers','matlab')) ;
21 | 
22 | % the input type is part of the directory name so that 'rgb' and 'of'
23 | % runs do not share checkpoints ('rgb' gives the same path as before)
24 | opts.expDir = ['exp/' model input '-arpool-split' num2str(opts.split)] ;
25 | if strcmp(input,'rgb')  
26 |   opts.DropOutRate = 0.5 ;
27 |   trainfn = @cnn_dicnn_rgb ;
28 | elseif strcmp(input,'of')  
29 |   opts.DropOutRate = 0.8 ;
30 |   trainfn = @cnn_dicnn_of ;
31 | else
32 |   error('Unknown input type ''%s'' (expected ''rgb'' or ''of'')', input) ;
33 | end
34 | 
35 | if strcmp(model,'cafferef')  
36 | 
37 |   opts.pool1Layer = 'conv1' ;
38 |   % download from http://www.vlfeat.org/matconvnet/models/imagenet-caffe-ref.mat
39 |   opts.modelPath = fullfile('models','imagenet-caffe-ref.mat') ;
40 |   opts.networkFn = @cnn_init_cafferef ;
41 |   
42 |   if strcmp(input,'rgb')  
43 |     opts.train.learningRate = 1e-3 * [ones(1,2) 0.1*ones(1,2)] ;
44 |   else
45 |     opts.train.learningRate = 3e-3 * [ones(1,10) 0.1*ones(1,2)] ;
46 |   end
47 | 
48 |   opts.train.numEpochs = numel(opts.train.learningRate) ;
49 | elseif strcmp(model,'resnext50') || strcmp(model,'resnext101')
50 |   % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_50_32x4d-pt-mcn.mat
51 |   % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_101_32x4d-pt-mcn.mat
52 |   % the path chosen here must not be overwritten afterwards, otherwise
53 |   % 'resnext101' would silently load the ResNeXt-50 weights
54 |   if strcmp(model,'resnext50')
55 |     opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
56 |   else
57 |     opts.modelPath = fullfile('models','resnext_101_32x4d-pt-mcn.mat') ;
58 |   end
59 |   opts.networkFn = @cnn_init_resnext ;
60 |   if strcmp(input,'rgb')  
61 |     opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,8) ] ;
62 |   else
63 |     opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,2) ] ;
64 |   end
65 | else
66 |   error('Unknown model ''%s''', model) ;
67 | end
68 | 
69 | addpath dicnn ;
70 | 
71 | [net, info] = trainfn(opts)
72 | 


--------------------------------------------------------------------------------
/Datasets/cnn_hmdb51_setup_data.m:
--------------------------------------------------------------------------------
 1 | function imdb = cnn_hmdb51_setup_data(varargin)
 2 | % CNN_HMDB51_SETUP_DATA Initialize HMDB51 - Action Recognition Data Set
 3 | % this script requires the HMDB51 frames extracted as
 4 | % <dataDir>/frames/<video>/frame*.jpg and the split files stored in
 5 | % <dataDir>/hmdb51_splits/*.txt
 6 | %
 7 | % NOTE: option parsing is disabled below, so VARARGIN is ignored and
 8 | % dataDir is always data/HMDB51.
 9 | %
10 | % The returned imdb has the fields classes.name, images.name (video
11 | % folders), images.names (frame paths per video), images.label,
12 | % images.sets (3 x #videos) and imageDir.
13 | 
14 | opts.dataDir = fullfile('data','HMDB51') ;
15 | opts.lite = false ;
16 | % opts = vl_argparse(opts, varargin) ;
17 | 
18 | %% ------------------------------------------------------------------------
19 | %                                                  Load categories metadata
20 | % -------------------------------------------------------------------------
21 | % find images
22 | imagePath = fullfile(opts.dataDir, 'frames', '*') ;
23 | images = dir(imagePath) ;
24 | 
25 | videoNames = cell(1,numel(images)) ;
26 | frameNames = cell(1,numel(images)) ;
27 | nrFrames = zeros(1,numel(images)) ;
28 | for i=1:numel(images)
29 |   
30 |   frames = dir(fullfile(opts.dataDir,'frames',images(i).name,'frame*.jpg')) ;
31 |   % entries without frames keep nrFrames(i) == 0 and are dropped below
32 |   if ~isempty(frames)
33 |     framesc = reshape({frames.name},1,[]) ;
34 |     frameNames{i} = strcat(images(i).name,'/',framesc) ;
35 |     nrFrames(i) = numel(framesc) ;
36 |     videoNames{i} = images(i).name ; 
37 |   end
38 | end
39 | 
40 | videoNames(nrFrames==0) = [] ;
41 | frameNames(nrFrames==0) = [] ;
42 | % nrFrames(nrFrames==0) = [] ;
43 | 
44 | 
45 | % find metadata
46 | % ncls = 51 ;
47 | 
48 | 
49 | metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ;
50 | 
51 | splits = dir(metaPath) ;
52 | 
53 | % splitFiles = cell(1,3*ncls) ;
54 | cats = cell(1,numel(videoNames)) ;
55 | sets = zeros(3,numel(videoNames)) ;
56 | catNames = cell(1,numel(splits)) ;
57 | 
58 | for i=1:numel(splits)
59 |   % assumes file names of the form <category>_test_split<N>.txt, so that
60 |   % the character 11 positions after the start of '_test_' is N
61 |   j = strfind(splits(i).name,'_test_') ;
62 |   splitno = str2double(splits(i).name(j+11)) ;
63 |   catNames{i} = splits(i).name(1:j-1) ;
64 |   t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ;
65 |   
66 |   % drop the last four characters (the file extension) of every entry
67 |   vids = cell(1,numel(t.textdata)) ;
68 |   for k=1:numel(t.textdata)
69 |     vids{k} = t.textdata{k}(1:end-4) ;
70 |   end
71 |   
72 |   % every listed video must have a frame folder
73 |   [ia,ib] = ismember(vids,videoNames) ;
74 |   assert(all(ia)) ;
75 |   sets(splitno,ib) = t.data' ;
76 |   cats(ib) = repmat(catNames(i),numel(ia),1) ;
77 | end
78 | 
79 | [cu,~,labels] = unique(cats) ;
80 | % the value 2 read from the split files is stored as 3
81 | sets(sets(:)==2) = 3 ;
82 | 
83 | imdb.classes.name = cu ;
84 | imdb.images.name = videoNames ;
85 | imdb.images.names = frameNames ;
86 | imdb.images.label = labels' ;
87 | imdb.images.sets = sets ;
88 | imdb.imageDir = fullfile(opts.dataDir, 'frames') ;
89 | 


--------------------------------------------------------------------------------
/Datasets/cnn_hmdb51_of_setup_data.m:
--------------------------------------------------------------------------------
 1 | function imdb = cnn_hmdb51_of_setup_data(varargin)
 2 | % CNN_HMDB51_OF_SETUP_DATA Initialize HMDB51 optical flow data
 3 | % this script requires the HMDB51 flow frames stored as
 4 | % <dataDir>/tvl1_flow/u/<video>/frame*.jpg (with matching paths under
 5 | % v/) and the split files stored in <dataDir>/hmdb51_splits/*.txt
 6 | %
 7 | % NOTE: option parsing is disabled below, so VARARGIN is ignored and
 8 | % dataDir is always data/HMDB51.
 9 | %
10 | % imdb.images.names{i} interleaves the u and v frame paths of video i
11 | % (u1, v1, u2, v2, ...), relative to imdb.imageDir.
12 | 
13 | 
14 | opts.dataDir = fullfile('data','HMDB51') ;
15 | opts.lite = false ;
16 | % opts = vl_argparse(opts, varargin) ;
17 | 
18 | %% ------------------------------------------------------------------------
19 | %                                                  Load categories metadata
20 | % -------------------------------------------------------------------------
21 | % find images
22 | imagePath = fullfile(opts.dataDir, 'tvl1_flow', 'u', '*') ;
23 | images = dir(imagePath) ;
24 | 
25 | videoNames = cell(1,numel(images)) ;
26 | frameNames = cell(1,numel(images)) ;
27 | nrFrames = zeros(1,numel(images)) ;
28 | for i=1:numel(images)
29 |   
30 |   frames = dir(fullfile(opts.dataDir,'tvl1_flow','u',images(i).name,'frame*.jpg')) ;
31 |   % entries without frames keep nrFrames(i) == 0 and are dropped below
32 |   if ~isempty(frames)
33 |     framesc = reshape({frames.name},1,[]) ;
34 |     frameNames{i} = strcat(images(i).name,'/',framesc) ;
35 |     nrFrames(i) = numel(framesc) ;
36 |     videoNames{i} = images(i).name ; 
37 |   end
38 | end
39 | 
40 | videoNames(nrFrames==0) = [] ;
41 | frameNames(nrFrames==0) = [] ;
42 | % nrFrames(nrFrames==0) = [] ;
43 | 
44 | 
45 | % the v paths are derived from the u listing; odd slots hold u frames,
46 | % even slots hold v frames
47 | frameNamesuv = cell(1,numel(frameNames)) ;
48 | for i=1:numel(frameNames)
49 |   nn = frameNames{i} ;
50 |   nn1 = strcat('u/',nn) ;
51 |   nn2 = strcat('v/',nn) ;
52 |   
53 |   frameNamesuv{i} = cell(1,2*numel(nn1)) ;
54 |   frameNamesuv{i}(1:2:end) = nn1 ;
55 |   frameNamesuv{i}(2:2:end) = nn2 ;
56 | end
57 | 
58 | % find metadata
59 | % ncls = 51 ;
60 | 
61 | metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ;
62 | 
63 | splits = dir(metaPath) ;
64 | 
65 | cats = cell(1,numel(videoNames)) ;
66 | sets = zeros(3,numel(videoNames)) ;
67 | catNames = cell(1,numel(splits)) ;
68 | 
69 | for i=1:numel(splits)
70 |   % assumes file names of the form <category>_test_split<N>.txt, so that
71 |   % the character 11 positions after the start of '_test_' is N
72 |   j = strfind(splits(i).name,'_test_') ;
73 |   splitno = str2double(splits(i).name(j+11)) ;
74 |   catNames{i} = splits(i).name(1:j-1) ;
75 |   t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ;
76 |   
77 |   % drop the last four characters (the file extension) of every entry
78 |   vids = cell(1,numel(t.textdata)) ;
79 |   for k=1:numel(t.textdata)
80 |     vids{k} = t.textdata{k}(1:end-4) ;
81 |   end
82 |   
83 |   % every listed video must have a frame folder
84 |   [ia,ib] = ismember(vids,videoNames) ;
85 |   assert(all(ia)) ;
86 |   sets(splitno,ib) = t.data' ;
87 |   cats(ib) = repmat(catNames(i),numel(ia),1) ;
88 | end
89 | 
90 | [cu,~,labels] = unique(cats) ;
91 | % the value 2 read from the split files is stored as 3
92 | sets(sets(:)==2) = 3 ;
93 | 
94 | imdb.classes.name = cu ;
95 | imdb.images.name = videoNames ;
96 | imdb.images.names = frameNamesuv ;
97 | imdb.images.label = labels' ;
98 | imdb.images.sets = sets ;
99 | imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ;


--------------------------------------------------------------------------------
/Datasets/cnn_ucf101_setup_data.m:
--------------------------------------------------------------------------------
  1 | function imdb = cnn_ucf101_setup_data(varargin)
  2 | % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set
  3 | % http://crcv.ucf.edu/data/UCF101.php
  4 | % this script requires UCF101 downloaded and frames extracted in frames
  5 | % folder
  6 | %
  7 | % Options (name/value pairs parsed by vl_argparse):
  8 | %   dataDir  root folder of the dataset (default data/UCF101)
  9 | %   lite     when true, keep only the split-1 train/test videos of the
 10 | %            first 10 classes and reduce images.sets to its first row
 11 | %
 12 | % imdb.images.sets is a 3 x #videos matrix with one row per split list
 13 | % (01..03), holding 1 for videos in the train list, 3 for videos in the
 14 | % test list and 0 otherwise.
 15 | 
 16 | opts.dataDir = fullfile('data','UCF101') ;
 17 | opts.lite = false ;
 18 | opts = vl_argparse(opts, varargin) ;
 19 | 
 20 | %% ------------------------------------------------------------------------
 21 | %                                                  Load categories metadata
 22 | % -------------------------------------------------------------------------
 23 | 
 24 | % find metadata
 25 | metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ;
 26 | 
 27 | fprintf('using metadata %s\n', metaPath) ;
 28 | tmp = importdata(metaPath);
 29 | nCls = numel(tmp);
 30 | 
 31 | if nCls ~= 101
 32 |   error('Wrong meta file %s',metaPath);
 33 | end
 34 | 
 35 | % every line is split at whitespace; the second token is the class name
 36 | cats = cell(1,nCls);
 37 | for i=1:numel(tmp)
 38 |   t = strsplit(tmp{i});
 39 |   cats{i} = t{2};
 40 | end
 41 | 
 42 | imdb.classes.name = cats ;
 43 | imdb.imageDir = fullfile(opts.dataDir, 'frames') ;
 44 | 
 45 | %% ------------------------------------------------------------------------
 46 | %                                              load image names and labels
 47 | % -------------------------------------------------------------------------
 48 | 
 49 | fprintf('searching training images ...\n') ;
 50 | names = {} ;
 51 | name = {};
 52 | labels = {} ;
 53 | % lower-case copy computed once; class names are matched case-insensitively
 54 | lowerCats = lower(cats) ;
 55 | for d = dir(fullfile(imdb.imageDir, 'v_*'))'
 56 |   % folder name minus its first 2 and last 8 characters is the class name
 57 |   [~,lab] = ismember(lower(d.name(3:end-8)), lowerCats) ;
 58 |   if lab==0
 59 |     error('no class label found for %s',d.name);
 60 |   end
 61 |   ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ;
 62 |   name{end+1} = d.name;
 63 |   names{end+1} = strcat([d.name, filesep], {ims.name}) ;
 64 |   labels{end+1} = lab ;
 65 |   if mod(numel(names), 10) == 0, fprintf('.') ; end
 66 |   if mod(numel(names), 500) == 0, fprintf('\n') ; end
 67 |   %fprintf('found %s with %d images\n', d.name, numel(ims)) ;
 68 | end
 69 | % names = horzcat(names{:}) ;
 70 | labels = horzcat(labels{:}) ;
 71 | 
 72 | imdb.images.id = 1:numel(names) ;
 73 | imdb.images.name = name ;
 74 | imdb.images.names = names ;
 75 | imdb.images.label = labels ;
 76 | 
 77 | 
 78 | %% ------------------------------------------------------------------------
 79 | %                                                 load train / test splits
 80 | % -------------------------------------------------------------------------
 81 | 
 82 | fprintf('labeling data...(this may take couple of minutes)') ;
 83 | imdb.images.sets = zeros(3, numel(names)) ;
 84 | setNames = {'train','test'};
 85 | setVal = [1,3];
 86 | % lower-case the video names once instead of once per list entry
 87 | lowerName = lower(name) ;
 88 | 
 89 | for s=1:numel(setNames)
 90 |   for i=1:3
 91 |     trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',...
 92 |       setNames{s},i)) ;
 93 |     trainList = importdata(trainFl);
 94 |     if isfield(trainList,'textdata')
 95 |       trainList = trainList.textdata;
 96 |     end
 97 |     for j=1:numel(trainList)
 98 |       % entries look like <folder>/<video>.<ext>; keep <video>
 99 |       tmp = strsplit(trainList{j},'/');
100 |       [~,lab] = ismember(lower(tmp{2}(1:end-4)), lowerName) ;
101 |       if lab==0
102 |         error('cannot find the video %s',tmp{2});
103 |       end
104 | %       if trainList.data(j) ~= labels(lab)
105 | %         error('Labels do not match for %s',tmp{2});
106 | %       end
107 |       imdb.images.sets(i,lab) = setVal(s);
108 |     end
109 |   end  
110 | end
111 | fprintf('\n') ;
112 | %% ------------------------------------------------------------------------
113 | %                                                            Postprocessing
114 | % -------------------------------------------------------------------------
115 | 
116 | % sort categories by name and remap the labels accordingly
117 | [imdb.classes.name,perm] = sort(imdb.classes.name) ;
118 | relabel(perm) = 1:numel(imdb.classes.name) ;
119 | ok = imdb.images.label >  0 ;
120 | imdb.images.label(ok) = relabel(imdb.images.label(ok)) ;
121 | 
122 | if opts.lite
123 |   % pick the split-1 train and test videos of the first 10 classes
124 |   keep = cell(1,10) ;
125 |   for i=1:10
126 |     sel = find(imdb.images.label == i) ;
127 |     train = sel(imdb.images.sets(1,sel) == 1) ;
128 |     test = sel(imdb.images.sets(1,sel) == 3) ;
129 |     keep{i} = [train test] ;
130 |   end
131 |   % concatenate the selections of all classes; a plain keep{:} on the
132 |   % right-hand side would assign only the first cell
133 |   keep = [keep{:}] ;
134 |   imdb.images.id = imdb.images.id(keep) ;
135 |   imdb.images.name = imdb.images.name(keep) ;
136 |   imdb.images.names = imdb.images.names(keep) ;
137 |   imdb.images.sets = imdb.images.sets(1,keep) ;
138 |   imdb.images.label = imdb.images.label(keep) ;
139 | end
140 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Dynamic Image Networks for Action Recognition
  2 | ## Improved Results (see the extended version of CVPR paper)
  3 | 
  4 | 
  5 | ResNeXt-50        | HMDB51 (%) | UCF101 (%) |
  6 | ------------------|--------|--------|
  7 | SI                |  53.5  |  87.6  |
  8 | DI                |  57.3  |  86.6  |
  9 | OF                |  55.8  |  84.9  |
 10 | DOF               |  58.9  |  86.6  |
 11 | SI+OF             |  67.5  |  93.9  |
 12 | SI+DI             |  61.3  |  90.6  |
 13 | OF+DOF            |  62.6  |  89.1  |
 14 | SI+DI+OF+DOF      |  71.5  |  95.0  |
 15 | SI+DI+OF+DOF+iDT  |  74.2  |  95.4  |
 16 | 
 17 | * Results are in the standard average multi-class accuracy (%)
 18 | * SI: RGB image
 19 | * DI: dynamic RGB image
 20 | * OF: optical flow 
 21 | * DOF: dynamic optical flow 
 22 | * iDT: improved trajectory features 
 23 | 
 24 | 
 25 | ## Installation
 26 | 1. Clone the Dynamic Image Net repository:
 27 | 
 28 |     ```Shell
 29 |     git clone --recursive  https://github.com/hbilen/dynamic-image-nets
 30 |     ```
 31 |     
 32 | 2. Compile matconvnet toolbox: (see [http://www.vlfeat.org/matconvnet/install/](http://www.vlfeat.org/matconvnet/install/))
 33 | 
 34 | 3. Install additional matconvnet packages
 35 |     
 36 |     ```matlab
 37 |     run matconvnet/matlab/vl_setupnn.m ;
 38 |     vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ;
 39 |     vl_contrib install autonn ; vl_contrib setup autonn ;
 40 |     ```
 41 | 
 42 | 4. Download your dataset : (e.g. UCF101 from [http://crcv.ucf.edu/data/UCF101.php](http://crcv.ucf.edu/data/UCF101.php))
 43 | 
 44 | 5. Convert videos to frames, resize them to 256x256 and store them in the directory structure shown below.
 45 | Alternatively, you can download RGB and precomputed optical flow frames from [Christoph Feichtenhofer](http://ftp.tugraz.at/pub/feichtenhofer/tsfusion/data/) and copy RGB frames under "UCF101/frames" and optical flow frames under "UCF101/tvl1_flow".
 46 |     
 47 |     ```Shell
 48 |     data/UCF101/ucfTrainTestlist/
 49 |     ├── classInd.txt
 50 |     ├── testlist01.txt
 51 |     ├── testlist02.txt
 52 |     ├── testlist03.txt
 53 |     ├── trainlist01.txt
 54 |     ├── trainlist02.txt
 55 |     └── trainlist03.txt
 56 |     data/UCF101/frames/
 57 |     ├── v_ApplyEyeMakeup_g01_c01
 58 |     │   ├── 00001.jpg
 59 |     │   ├── 00002.jpg
 60 |     │   ├── 00003.jpg
 61 |     │   ├── 00004.jpg
 62 |     │   ├── 00005.jpg
 63 |     ```
 64 | 
 65 | ## Compute and Visualise Approximate Dynamic Images
 66 | 1. If you want to compute approximate dynamic images, get a list of ordered frames from a video and try
 67 |   ```matlab
 68 |   di = compute_approximate_dynamic_images(images) ;
 69 |   ```
 70 | 
 71 | 2. If you want to visualise approximate dynamic images, get a list of ordered frames from a video and try
 72 |   ```matlab
 73 |   visualize_approximate_dynamic_images(images)
 74 |   ```
 75 | 
 76 | ## Train a Dynamic Image Net
 77 | You can modify the options in `main_train.m` and train your model by running
 78 | ```matlab
 79 | main_train
 80 | ```
 81 |     
 82 | Note: If you want to train a model on a different dataset than UCF101 or HMDB51, you need to write a custom script `cnn_dataset_setup_data` to build your database (imdb).
 83 | 
 84 | ## Evaluation
 85 | 1. Download the CNN models for the UCF101 dataset that are used in the journal paper from [here](http://groups.inf.ed.ac.uk/hbilen-data/data/resnext50_dicnn.tar).
 86 | 2. Choose the right model, split and input type (e.g.)
 87 |     ```matlab
 88 |     net = load('resnext50-rgb-arpool-split1.mat') ;
 89 |     net = dagnn.DagNN.loadobj(net) ;
 90 |     net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr') ;
 91 |     opts.network = net ;
 92 |     opts.split = 1 ;
 93 |     opts.train.gpus = 1 ;
 94 |     opts.epochFactor = 0 ; 
 95 |     [net, info] = cnn_dicnn_rgb(opts)
 96 |     ```
 97 | 
 98 | ## Citing Dynamic Image Networks
 99 | 
100 | If you find the code useful, please cite:
101 | 
102 |         @inproceedings{Bilen2016a,
103 |           author    = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A. and Gould, S.",
104 |           title     = "Dynamic Image Networks for Action Recognition",
105 |           booktitle = "CVPR",
106 |           year      = "2016"
107 |         }
108 |         @article{Bilen2017a,
109 |           author    = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A.",
110 |           title     = "Action Recognition with Dynamic Image Networks",
111 |           journal   = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)",
112 |           year      = "2017"
113 |         }
114 | 
115 | ## License
116 | The analysis work performed with the program(s) must be non-proprietary work. Licensee and its contract users must be or be affiliated with an academic facility. Licensee may additionally permit individuals who are students at such academic facility to access and use the program(s). Such students will be considered contract users of licensee. The program(s) may not be used for commercial competitive analysis (such as benchmarking) or for any commercial activity, including consulting.
117 | 
118 | 


--------------------------------------------------------------------------------
/Datasets/cnn_ucf101_of_setup_data.m:
--------------------------------------------------------------------------------
  1 | function imdb = cnn_ucf101_of_setup_data(varargin)
  2 | % CNN_UCF101_OF_SETUP_DATA  Build the UCF101 optical-flow database (imdb)
  3 | %   IMDB = CNN_UCF101_OF_SETUP_DATA('dataDir', DIR, 'lite', FLAG) reads the
  4 | %   class list and the train/test lists from DIR/ucfTrainTestlist and the
  5 | %   flow frames from DIR/tvl1_flow/{u,v}/<video>/*.jpg.
  6 | %   Dataset: http://crcv.ucf.edu/data/UCF101.php
  7 | %
  8 | %   Fields of IMDB:
  9 | %     classes.name   class names, sorted
 10 | %     imageDir       DIR/tvl1_flow
 11 | %     images.id      1..N, one entry per video folder
 12 | %     images.name    video folder names
 13 | %     images.names   per video, frame paths interleaved as
 14 | %                    u/<video>/<frame>, v/<video>/<frame>, ...
 15 | %     images.label   class index of each video
 16 | %     images.sets    3xN, one row per official split; 1 = train,
 17 | %                    3 = test, 0 = video not listed in that split
 18 | %
 19 | %   With 'lite' true only the split-1 train/test videos of the first 10
 20 | %   classes are kept and images.sets is reduced to its first row.
 21 | 
 22 | opts.dataDir = fullfile('data','UCF101') ;
 23 | opts.lite = false ;
 24 | opts = vl_argparse(opts, varargin) ;
 25 | 
 26 | %% ------------------------------------------------------------------------
 27 | %                                                  Load categories metadata
 28 | % -------------------------------------------------------------------------
 29 | 
 30 | % find metadata
 31 | metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ;
 32 | 
 33 | fprintf('using metadata %s\n', metaPath) ;
 34 | tmp = importdata(metaPath);
 35 | nCls = numel(tmp);
 36 | 
 37 | if nCls ~= 101
 38 |   error('Wrong meta file %s',metaPath);
 39 | end
 40 | 
 41 | % the class name is the second whitespace-separated token of every line
 42 | cats = cell(1,nCls);
 43 | for i=1:numel(tmp)
 44 |   t = strsplit(tmp{i});
 45 |   cats{i} = t{2};
 46 | end
 47 | 
 48 | imdb.classes.name = sort(cats) ;
 49 | % video folders are enumerated from the u/ sub-folder only
 50 | imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow', 'u') ;
 51 | 
 52 | %% ------------------------------------------------------------------------
 53 | %                                              load image names and labels
 54 | % -------------------------------------------------------------------------
 55 | 
 56 | fprintf('searching training images ...\n') ;
 57 | names = {} ;
 58 | name = {};
 59 | labels = {} ;
 60 | for d = dir(fullfile(imdb.imageDir, 'v_*'))'
 61 |   % drop the leading 'v_' and the last 8 characters of the folder name
 62 |   % and match the remainder, ignoring case, against the class names
 63 |   [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ;
 64 |   if lab==0
 65 |     error('no class label found for %s',d.name);
 66 |   end
 67 |   ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ;
 68 |   name{end+1} = d.name;
 69 |   names{end+1} = strcat([d.name, filesep], {ims.name}) ;
 70 |   labels{end+1} = lab ;
 71 |   if mod(numel(names), 10) == 0, fprintf('.') ; end
 72 |   if mod(numel(names), 500) == 0, fprintf('\n') ; end
 73 | end
 74 | 
 75 | labels = horzcat(labels{:}) ;
 76 | labels = labels(:)' ;
 77 | 
 78 | % interleave the u/ and v/ path of every frame: u1, v1, u2, v2, ...
 79 | for i=1:numel(names)
 80 |   nn = names{i} ;
 81 |   nn1 = strcat('u/',nn) ;
 82 |   nn2 = strcat('v/',nn) ;
 83 |   
 84 |   names{i} = cell(1,2*numel(nn1)) ;
 85 |   names{i}(1:2:end) = nn1 ;
 86 |   names{i}(2:2:end) = nn2 ;
 87 | end
 88 | 
 89 | imdb.images.id = 1:numel(names) ;
 90 | imdb.images.name = name ;
 91 | imdb.images.names = names ;
 92 | imdb.images.label = labels ;
 93 | imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ;
 94 | 
 95 | %% ------------------------------------------------------------------------
 96 | %                                                 load train / test splits
 97 | % -------------------------------------------------------------------------
 98 | 
 99 | fprintf('labeling data...(this may take couple of minutes)') ;
100 | imdb.images.sets = zeros(3, numel(names)) ;
101 | setNames = {'train','test'};
102 | setVal = [1,3];
103 | 
104 | for s=1:numel(setNames)
105 |   for i=1:3
106 |     trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',...
107 |       setNames{s},i)) ;
108 |     trainList = importdata(trainFl);
109 |     if isfield(trainList,'textdata')
110 |       trainList = trainList.textdata;
111 |     end
112 |     for j=1:numel(trainList)
113 |       tmp = strsplit(trainList{j},'/');
114 |       [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ;
115 |       if lab==0
116 |         % a listed video without a flow folder is skipped, not fatal
117 |         warning('cannot find the video %s',tmp{2}(1:end-4));
118 |         continue ;
119 |       end
120 |       imdb.images.sets(i,lab) = setVal(s);
121 |     end
122 |   end  
123 | end
124 | fprintf('\n') ;
125 | %% ------------------------------------------------------------------------
126 | %                                                            Postprocessing
127 | % -------------------------------------------------------------------------
128 | 
129 | % sort categories by name and remap the labels accordingly
130 | % NOTE(review): classes.name was already sorted above, so perm is the
131 | % identity here and the labels keep the classInd.txt order - confirm.
132 | [imdb.classes.name,perm] = sort(imdb.classes.name) ;
133 | relabel(perm) = 1:numel(imdb.classes.name) ;
134 | ok = imdb.images.label >  0 ;
135 | imdb.images.label(ok) = relabel(imdb.images.label(ok)) ;
136 | 
137 | if opts.lite
138 |   % keep only the videos of the first 10 classes that belong to the
139 |   % train or test set of split 1
140 |   keep = cell(1,10) ;
141 |   for i=1:10
142 |     sel = find(imdb.images.label == i) ;
143 |     train = sel(imdb.images.sets(1,sel) == 1) ;
144 |     test = sel(imdb.images.sets(1,sel) == 3) ;
145 |     keep{i} = [train test] ;
146 |   end
147 |   % concatenate all ten selections; a plain keep{:} on the right-hand
148 |   % side would only pass on the first cell
149 |   keep = horzcat(keep{:}) ;
150 |   imdb.images.id = imdb.images.id(keep) ;
151 |   imdb.images.name = imdb.images.name(keep) ;
152 |   imdb.images.names = imdb.images.names(keep) ;
153 |   imdb.images.sets = imdb.images.sets(1,keep) ;
154 |   imdb.images.label = imdb.images.label(keep) ;
155 | end
156 | 


--------------------------------------------------------------------------------
/dicnn/cnn_init_cafferef.m:
--------------------------------------------------------------------------------
  1 | % -------------------------------------------------------------------------
  2 | function net = cnn_init_cafferef(net,opts)
  3 | % -------------------------------------------------------------------------
  4 | % CNN_INIT_CAFFEREF  Adapt a CaffeRef-style SimpleNN to video classification
  5 | %   NET = CNN_INIT_CAFFEREF(NET, OPTS) takes a SimpleNN struct NET that has
  6 | %   a layer named fc8 and returns a DagNN in which
  7 | %     * dropout6/dropout7 use OPTS.DropOutRate (they are inserted after
  8 | %       relu6/relu7 when the network has no dropout6 layer),
  9 | %     * fc8 is zero-initialised for OPTS.nCls classes if its size differs,
 10 | %     * the last layer is replaced by a loss layer,
 11 | %     * a pooling stage of type OPTS.pool1Type ('arpool', 'ppool1',
 12 | %       'ppool2' or 'none') feeds the layer OPTS.pool1Layer,
 13 | %     * temporal max pooling feeds the layer OPTS.pool2Layer,
 14 | %     * 'errMC' (ErrorMultiClass) and 'loss' (LossNormalized) are attached
 15 | %       to the variables 'prediction' and 'label',
 16 | %     * every dagnn.BatchNorm block is replaced by BatchNormN.
 17 | %   The pooling layers read the extra inputs 'VideoId1' and 'VideoId2'.
 18 | 
 19 | % dropout: update the rates, or insert the layers if they are missing
 20 | layerNames = cellfun(@(l) l.name, net.layers, 'UniformOutput', false) ;
 21 | drop6p = find(strcmp(layerNames, 'dropout6')) ;
 22 | drop7p = find(strcmp(layerNames, 'dropout7')) ;
 23 | 
 24 | if isempty(drop6p)
 25 |   relu6p = find(strcmp(layerNames, 'relu6')) ;
 26 |   relu7p = find(strcmp(layerNames, 'relu7')) ;
 27 | 
 28 |   drop6 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout6') ;
 29 |   drop7 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout7') ;
 30 |   net.layers = [net.layers(1:relu6p) drop6 net.layers(relu6p+1:relu7p) drop7 net.layers(relu7p+1:end)];
 31 | else
 32 |   assert(~isempty(drop7p));
 33 |   net.layers{drop6p}.rate = opts.DropOutRate;
 34 |   net.layers{drop7p}.rate = opts.DropOutRate;
 35 | end
 36 | 
 37 | % replace fc8 (looked up again: the layer list may have just changed)
 38 | fc8l = cellfun(@(l) strcmp(l.name, 'fc8'), net.layers) ;
 39 | 
 40 | nCls = opts.nCls ;
 41 | sizeW = size(net.layers{fc8l}.weights{1});
 42 | 
 43 | if sizeW(4)~=nCls
 44 |   net.layers{fc8l}.weights = {zeros(sizeW(1),sizeW(2),sizeW(3),nCls,'single'), ...
 45 |     zeros(1, nCls, 'single')};
 46 | end
 47 | 
 48 | % change loss: the last layer becomes a softmax loss
 49 | net.layers{end} = struct('name','loss', 'type','softmaxloss') ;
 50 | 
 51 | % convert to dagnn
 52 | net = dagnn.DagNN.fromSimpleNN(net, 'canonicalNames', true) ;
 53 | 
 54 | dagLayers = net.layers ;
 55 | poolLyr1 = find(strcmp({dagLayers.name}, opts.pool1Layer)) ;
 56 | assert(~isempty(poolLyr1));
 57 | firstIsConv1 = strcmp(opts.pool1Layer,'conv1') ;
 58 | 
 59 | % configure the first pooling stage (output variable 'DynImg')
 60 | switch opts.pool1Type
 61 |   case 'arpool'
 62 |     poolIn = {net.layers(poolLyr1).inputs{1},'VideoId1'} ;
 63 |     if firstIsConv1
 64 |       net.addLayer('arpool',AppRankPooling('scale',1),poolIn,'DynImgN');
 65 |       net.addLayer('l2normalize',L2Normalize('scale',6000,'clip',[-128 128]),...
 66 |         'DynImgN','DynImg');
 67 |     else
 68 |       net.addLayer('arpool',AppRankPooling('scale',0.1),poolIn,'DynImgN');
 69 |       net.addLayer('reluP',dagnn.ReLU(),{'DynImgN'},'DynImg');
 70 |     end
 71 |     net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ;
 72 |   case {'ppool1','ppool2'}
 73 |     % both parametric variants are configured identically here
 74 |     poolIn = {net.layers(poolLyr1).inputs{1},'VideoId1'} ;
 75 |     if firstIsConv1
 76 |       net.addLayer('parampool',LinComb('pad',[1 1 10 1]),poolIn,'DynImg',{'conv0f','conv0b'});
 77 |     else
 78 |       net.addLayer('parampool',LinComb('pad',[1 1 10 1]),poolIn,'DynImgN',{'conv0f','conv0b'});
 79 |       net.addLayer('reluP',dagnn.ReLU(),{'DynImgN'},'DynImg');
 80 |     end
 81 |     net.layers(poolLyr1).inputs{1} = 'DynImg' ;
 82 | 
 83 |     % presumably the last two parameters are conv0f/conv0b added above
 84 |     net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');
 85 |     net.params(end).value = zeros(1,1,'single');
 86 |     net.params(end-1).learningRate = 0.1 ;
 87 |     net.params(end).learningRate = 0.2 ;
 88 |   case 'none'
 89 |     % no first pooling stage
 90 |   otherwise
 91 |     error('Unknown pool type %s', opts.pool1Type) ;
 92 | end
 93 | 
 94 | % second pool layer (max pooling)
 95 | dagLayers = net.layers ;
 96 | poolLyr2 = find(strcmp({dagLayers.name}, opts.pool2Layer)) ;
 97 | net.addLayer('tempPoolMax',TemporalPooling('method','max'),...
 98 |   {net.layers(poolLyr2(1)).inputs{1},'VideoId2'},'tempPoolMax');
 99 | net.layers(poolLyr2).inputs{1} = 'tempPoolMax';
100 | 
101 | % add multi-class error
102 | net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr');
103 | 
104 | % save / load round trip of the network object
105 | net = dagnn.DagNN.loadobj(net.saveobj()) ;
106 | 
107 | % swap the converted loss layer for the normalised loss
108 | net.removeLayer('loss') ;
109 | net.addLayer('loss', LossNormalized('loss', 'softmaxlog'), ...
110 |   {'prediction', 'label'}, 'objective') ;
111 | 
112 | % replace standard matconvnet bnorm with BatchNormN
113 | for li = find(arrayfun(@(l) strcmp(class(l.block), 'dagnn.BatchNorm'), net.layers))
114 |   bb = net.layers(li).block ;
115 |   net.layers(li).block = BatchNormN('numChannels',bb.numChannels,...
116 |     'epsilon',bb.epsilon,'opts',bb.opts) ;
117 | end
118 | 


--------------------------------------------------------------------------------
/dicnn/cnn_video_rgb_get_batch.m:
--------------------------------------------------------------------------------
  1 | function imo = cnn_video_rgb_get_batch(images, vids, varargin)
  2 | % CNN_VIDEO_RGB_GET_BATCH  Load, preprocess, and pack video frames for a CNN
  3 | %   IMO = CNN_VIDEO_RGB_GET_BATCH(IMAGES, VIDS, 'opt', value, ...) returns
  4 | %   a single array of size imageSize(1) x imageSize(2) x 3 x K, with
  5 | %   K = numel(IMAGES) * numAugments.
  6 | %
  7 | %   IMAGES  cell array of file names, or of already loaded images.
  8 | %   VIDS    VIDS(k) is the positive integer video id of IMAGES{k}. Frames
  9 | %           are written to IMO video by video, in increasing id order.
 10 | %
 11 | %   All frames of a video share the same crop and flip, so frames of one
 12 | %   video must have the same size.
 13 | %
 14 | %   Options:
 15 | %     imageSize      output height/width                      [227 227]
 16 | %     border         extra size added before cropping         [29 29]
 17 | %     keepAspect     use one scale factor for both axes       true
 18 | %     transformation 'none', 'f5', 'f25', 'stretch' or
 19 | %                    'multiScaleRegular'                      'none'
 20 | %     averageImage   3-element mean subtracted from IMO       []
 21 | %     rgbVariance    if non-empty, rgbVariance*randn(3,1) is
 22 | %                    added to the mean before subtraction     zeros(0,3)
 23 | %     interpolation  imresize method                          'bilinear'
 24 | %     numThreads     threads for vl_imreadjpeg                1
 25 | %     prefetch       only start loading and return []         false
 26 | %     lazyResize     crop/rescale by index sampling instead
 27 | %                    of imresize                              true
 28 | %   numAugments only scales the size of the preallocated output and
 29 | %   subMean is accepted but not used.
 30 | 
 31 | opts.imageSize = [227, 227] ;
 32 | opts.border = [29, 29] ;
 33 | opts.keepAspect = true ;
 34 | opts.numAugments = 1 ;
 35 | opts.transformation = 'none' ;
 36 | opts.averageImage = [] ;
 37 | opts.rgbVariance = zeros(0,3,'single') ;
 38 | opts.interpolation = 'bilinear' ;
 39 | opts.numThreads = 1 ;
 40 | opts.prefetch = false ;
 41 | opts.subMean = false ; % subtract the mean from each video
 42 | opts.lazyResize = true ;
 43 | 
 44 | opts = vl_argparse(opts, varargin);
 45 | 
 46 | % fetch is true if images is a list of filenames (instead of
 47 | % a cell array of images)
 48 | fetch = numel(images) >= 1 && ischar(images{1}) ;
 49 | 
 50 | % prefetch is used to load images in a separate thread
 51 | prefetch = fetch & opts.prefetch ;
 52 | 
 53 | if prefetch
 54 |   vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
 55 |   imo = [] ;
 56 |   return ;
 57 | end
 58 | if fetch
 59 |   im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
 60 | else
 61 |   im = images ;
 62 | end
 63 | 
 64 | % columns of tfs are candidate [vertical offset; horizontal offset; flip]
 65 | tfs = [] ;
 66 | switch opts.transformation
 67 |   case 'none'
 68 |     tfs = [
 69 |       .5 ;
 70 |       .5 ;
 71 |       0 ] ;
 72 |   case 'f5'
 73 |     tfs = [...
 74 |       .5 0 0 1 1 .5 0 0 1 1 ;
 75 |       .5 0 1 0 1 .5 0 1 0 1 ;
 76 |       0 0 0 0 0  1 1 1 1 1] ;
 77 |   case 'f25'
 78 |     [tx,ty] = meshgrid(linspace(0,1,5)) ;
 79 |     tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
 80 |     tfs_ = tfs ;
 81 |     tfs_(3,:) = 1 ;
 82 |     tfs = [tfs,tfs_] ;
 83 |   case 'stretch'
 84 |   case 'multiScaleRegular'
 85 |   otherwise
 86 |     error('Uknown transformations %s', opts.transformation) ;
 87 | end
 88 | [~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;
 89 | 
 90 | if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
 91 |   opts.averageImage = zeros(1,1,3) ;
 92 | end
 93 | if numel(opts.averageImage) == 3
 94 |   opts.averageImage = reshape(opts.averageImage, 1,1,3) ;
 95 | end
 96 | 
 97 | imo = zeros(opts.imageSize(1), opts.imageSize(2), 3, ...
 98 |   numel(images)*opts.numAugments, 'single') ;
 99 | 
100 | nVid = max(vids);
101 | si = 1 ;
102 | for v=1:nVid
103 |   
104 |   vid = find(vids==v);
105 |   
106 |   for i=1:numel(vid)
107 |     
108 |     % acquire image; vid(i) is the position in images/im, i only counts
109 |     % frames inside the current video
110 |     if isempty(im{vid(i)})
111 |       imt = imread(images{vid(i)}) ;
112 |       imt = single(imt) ; % faster than im2single (and multiplies by 255)
113 |     else
114 |       imt = im{vid(i)} ;
115 |     end
116 |     if size(imt,3) == 1
117 |       imt = cat(3, imt, imt, imt) ;
118 |     end
119 |     
120 |     % resize
121 |     w = size(imt,2) ;
122 |     h = size(imt,1) ;
123 |     factor = [(opts.imageSize(1)+opts.border(1))/h ...
124 |       (opts.imageSize(2)+opts.border(2))/w];
125 |     
126 |     if opts.keepAspect
127 |       factor = max(factor) ;
128 |     end
129 |     if any(abs(factor - 1) > 0.0001)
130 |       imt = imresize(imt, ...
131 |         'scale', factor, ...
132 |         'method', opts.interpolation) ;
133 |     end
134 |     
135 |     % crop & flip: drawn on the first frame and reused for the video
136 |     if i==1
137 |       w = size(imt,2) ;
138 |       h = size(imt,1) ;
139 |       switch opts.transformation
140 |         case 'stretch'
141 |           sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
142 |           dx = randi(w - sz(2) + 1, 1) ;
143 |           dy = randi(h - sz(1) + 1, 1) ;
144 |           flip = rand > 0.5 ;
145 |         case 'multiScaleRegular'
146 |           reg_szs = [256, 224, 192, 168] ;
147 |           sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4));
148 |           
149 |           % four corners and the centre
150 |           dy = [0 h-sz(1) 0 h-sz(1)  floor((h-sz(1)+1)/2)] + 1;
151 |           dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
152 |           corner = randi(5);
153 |           dx = dx(corner); dy = dy(corner);
154 |           flip = rand > 0.5 ;
155 |         otherwise
156 |           % the first entry of the random permutation picks the column
157 |           tf = tfs(:, transformations(1)) ;
158 |           sz = opts.imageSize(1:2) ;
159 |           dx = floor((w - sz(2)) * tf(2)) + 1 ;
160 |           dy = floor((h - sz(1)) * tf(1)) + 1 ;
161 |           flip = tf(3) ;
162 |       end
163 |       
164 |     end
165 |     
166 |     if opts.lazyResize
167 |       sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
168 |       sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
169 |     else
170 |       % always apply the crop, so that the offsets are honoured even when
171 |       % the crop already has the output size and no rescaling is needed
172 |       imt = imt(dy:sz(1)+dy-1, dx:sz(2)+dx-1, :) ;
173 |       factor = [opts.imageSize(1)/sz(1) ...
174 |         opts.imageSize(2)/sz(2)];
175 |       if any(abs(factor - 1) > 0.0001)
176 |         imt = imresize(gather(imt), opts.imageSize(1:2), ...
177 |           'Antialiasing', false, 'Method', opts.interpolation);
178 |       end
179 |       sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1);
180 |     end
181 |     
182 |     if flip
183 |       sx = fliplr(sx) ;   
184 |     end
185 |     
186 |     imo(:,:,:,si) = imt(sy,sx,:) ;
187 |     si = si + 1 ;
188 |   end
189 | end
190 | 
191 | if ~isempty(opts.averageImage) && numel(opts.averageImage)==3
192 |   if ~isempty(opts.rgbVariance)
193 |     imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(3,1), 1,1,3)) ;
194 |   else
195 |     imo = bsxfun(@minus, imo, opts.averageImage) ;
196 |   end
197 | end
198 | 


--------------------------------------------------------------------------------
/dicnn/cnn_init_resnext.m:
--------------------------------------------------------------------------------
  1 | % -------------------------------------------------------------------------
  2 | function net = cnn_init_resnext(net,opts)
  3 | % -------------------------------------------------------------------------
  4 | % CNN_INIT_RESNEXT  Adapt a ResNeXt DagNN to video classification
  5 | %   NET = CNN_INIT_RESNEXT(NET, OPTS) loads the saved DagNN struct NET and
  6 | %     * re-initialises the 'classifier_0' parameters for OPTS.nCls classes,
  7 | %     * removes the first dagnn.SoftMax layer, if there is one,
  8 | %     * renames the variable named after the classifier layer to
  9 | %       'prediction' and the variable 'data' to 'input',
 10 | %     * adds the pooling selected by OPTS.pool1Type ('arpool', 'ppool1' or
 11 | %       'none'); for 'arpool' it feeds OPTS.pool1Layer ('conv0' selects
 12 | %       the first layer of the network),
 13 | %     * adds temporal max pooling on the output of 'features_7_1_merge'
 14 | %       and feeds it to 'features_7_1_id_relu',
 15 | %     * adds dropout before 'classifier_permute' if OPTS.DropOutRate > 0,
 16 | %     * attaches 'errMC' and 'loss' to 'prediction' and 'label',
 17 | %     * replaces every dagnn.BatchNorm block by BatchNormN.
 18 | %   The pooling layers read the extra inputs 'VideoId1' and 'VideoId2'.
 19 | 
 20 | net = dagnn.DagNN.loadobj(net) ;
 21 | 
 22 | % initialize classifier
 23 | clsLayer = net.getLayer('classifier_0') ;
 24 | wIdx = clsLayer.paramIndexes(1) ;
 25 | sizeW = size(net.params(wIdx).value);
 26 | 
 27 | nCls = opts.nCls ;
 28 | DropOutRate = opts.DropOutRate ; 
 29 | 
 30 | net.params(wIdx).value = 0.01 * randn([sizeW(1:3),nCls],'single') ;
 31 | net.params(clsLayer.paramIndexes(2)).value = zeros(nCls,1,'single') ;
 32 | 
 33 | % change loss: drop the first SoftMax layer
 34 | isSoftMax = arrayfun(@(l) isa(l.block, 'dagnn.SoftMax'), net.layers) ;
 35 | if any(isSoftMax)
 36 |   net.removeLayer(net.layers(find(isSoftMax,1)).name) ;
 37 | end
 38 | layerList = net.layers ;
 39 | clsIdx = find(strcmp({layerList.name}, 'classifier_0')) ;
 40 | net.renameVar(net.layers(clsIdx(end)).name,'prediction') ;
 41 | net.renameVar('data','input') ;
 42 | 
 43 | %------------------------------------------------------------------------%
 44 | % configure appr-rank-pool
 45 | switch opts.pool1Type
 46 |   case 'arpool'
 47 |     if strcmp(opts.pool1Layer,'conv0')
 48 |       % pool the network input and feed the first layer
 49 |       poolSrc = 'input' ;
 50 |       poolDst = net.layers(1).name ;
 51 |     else
 52 |       layerList = net.layers ;
 53 |       poolLyr1 = find(strcmp({layerList.name}, opts.pool1Layer)) ;
 54 |       assert(~isempty(poolLyr1));
 55 |       poolSrc = net.layers(poolLyr1).inputs{1} ;
 56 |       poolDst = opts.pool1Layer ;
 57 |     end
 58 |     net.addLayer('arpool',AppRankPooling('scale',0.1),{poolSrc,'VideoId1'},'DynImg');
 59 |     net.setLayerInputs(poolDst,{'DynImg'}) ;
 60 |   case 'ppool1'
 61 |     % NOTE(review): poolLyr1 is computed but not used below; the pooled
 62 |     % variable and the consumer layer (index 16) are hard-coded.
 63 |     if strcmp(opts.pool1Layer,'conv0')
 64 |       poolLyr1 = 1 ;
 65 |     else
 66 |       layerList = net.layers ;
 67 |       poolLyr1 = find(strcmp({layerList.name}, opts.pool1Layer)) ;
 68 |     end
 69 |     net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...
 70 |       {'features_4_0_merge','VideoId1'},'DynImg0',{'conv0f','conv0b'});
 71 |     
 72 |     % presumably the last two parameters are conv0f/conv0b added above
 73 |     net.params(end-1).value = 0.1 * randn(1,1,10,1,'single');
 74 |     net.params(end).value = zeros(1,1,'single');  
 75 |     
 76 |     net.addLayer('BnormDyn',dagnn.BatchNorm('numChannels',256),'DynImg0','DynImg',...
 77 |       {'dym','dyb','dybx'}) ;
 78 |     net.params(end-2).value =  ones(256,1,'single') ;
 79 |     net.params(end-1).value =  zeros(256,1,'single') ;
 80 |     net.params(end).value   =  zeros(256,2,'single') ;
 81 |     
 82 |     net.layers(16).inputs{1} = 'DynImg' ;
 83 |     % scale down the learning rate of the last five parameters
 84 |     for k = numel(net.params)-4:numel(net.params)
 85 |       net.params(k).learningRate = 0.1 * net.params(k).learningRate;
 86 |     end
 87 |   case 'none'
 88 |   otherwise
 89 |     error('Unknown pool type %s', opts.pool1Type) ;
 90 | end
 91 | 
 92 | net.rebuild() ;
 93 | %------------------------------------------------------------------------%
 94 | % second pool layer (max pooling)
 95 | layerList = net.layers ;
 96 | poolLyr2 = find(strcmp({layerList.name}, 'features_7_1_merge')) ;
 97 | net.addLayer('tempPoolMax',TemporalPooling('method','max'),...
 98 |   {net.layers(poolLyr2(1)).outputs{1},'VideoId2'},'tempPoolMax');
 99 | 
100 | % the layer after the merge now reads the pooled variable
101 | layerList = net.layers ;
102 | poolLyr2next = find(strcmp({layerList.name}, 'features_7_1_id_relu')) ;
103 | net.setLayerInputs(net.layers(poolLyr2next(1)).name,{'tempPoolMax'}) ;
104 | net.rebuild() ;
105 | %------------------------------------------------------------------------%
106 | % add drop-out layers
107 | if DropOutRate>0
108 |   layerList = net.layers ;
109 |   pool5 = find(strcmp({layerList.name}, 'features_8')) ;
110 |   oo = net.layers(pool5(1)).outputs{1};
111 |   dropVar = sprintf('drop_%s',oo) ;
112 |   net.addLayer('drop_pool5',dagnn.DropOut('rate',DropOutRate),oo,dropVar,{});
113 |   net.setLayerInputs('classifier_permute',{dropVar}) ;
114 | end
115 | 
116 | %------------------------------------------------------------------------%
117 | % add multi-class error
118 | net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr');
119 | 
120 | net.addLayer('loss', LossNormalized('loss', 'softmaxlog'), ...
121 |   {'prediction', 'label'}, 'objective') ;
122 | 
123 | %------------------------------------------------------------------------%
124 | net.rebuild() ;
125 | 
126 | % replace standard matconvnet bnorm with BatchNormN
127 | for li = find(arrayfun(@(l) strcmp(class(l.block), 'dagnn.BatchNorm'), net.layers))
128 |   bb = net.layers(li).block ;
129 |   net.layers(li).block = BatchNormN('numChannels',bb.numChannels,...
130 |     'epsilon',bb.epsilon,'opts',bb.opts) ;
131 | end
132 | 
133 | % save / load round trip of the network object
134 | net = dagnn.DagNN.loadobj(net.saveobj()) ;
135 | net.meta.normalization.border = [32 32] ;
136 | 


--------------------------------------------------------------------------------
/dicnn/cnn_video_of_get_batch.m:
--------------------------------------------------------------------------------
  1 | function imo = cnn_video_of_get_batch(images, vids, varargin)
  2 | % CNN_VIDEO_OF_GET_BATCH  Load, preprocess, and pack flow frames for a CNN
  3 | %   IMO = CNN_VIDEO_OF_GET_BATCH(IMAGES, VIDS, 'opt', value, ...) returns a
  4 | %   single array of size imageSize(1) x imageSize(2) x 2 x K, with
  5 | %   K = numel(IMAGES)/2 * numAugments.
  6 | %
  7 | %   IMAGES  cell array of file names, or of already loaded images, with
  8 | %           two entries per frame: IMAGES{2k-1} becomes channel 1 and
  9 | %           IMAGES{2k} channel 2 of frame k.
 10 | %   VIDS    VIDS(k) is the positive integer video id of frame k. Frames
 11 | %           are written to IMO video by video, in increasing id order.
 12 | %
 13 | %   All frames of a video share the same crop and flip, so frames of one
 14 | %   video must have the same size. A flipped frame is mirrored left-right
 15 | %   and its first channel is replaced by 255 minus its value.
 16 | %
 17 | %   Options (defaults in the code below): imageSize, border, keepAspect,
 18 | %   transformation ('none', 'f5', 'f25', 'stretch', 'multiScaleRegular'),
 19 | %   averageImage (2-element mean subtracted from IMO), rgbVariance,
 20 | %   interpolation, numThreads, prefetch and lazyResize. numAugments only
 21 | %   scales the size of the preallocated output; subMean is not used.
 22 | 
 23 | opts.imageSize = [227, 227] ;
 24 | opts.border = [29, 29] ;
 25 | opts.keepAspect = true ;
 26 | opts.numAugments = 1 ;
 27 | opts.transformation = 'multiScaleRegular' ;
 28 | opts.averageImage = [] ;
 29 | opts.rgbVariance = zeros(0,2,'single') ;
 30 | opts.interpolation = 'bilinear' ;
 31 | opts.numThreads = 1 ;
 32 | opts.prefetch = false ;
 33 | opts.lazyResize = true ;
 34 | opts.subMean = false; % subtract the mean from each video
 35 | opts = vl_argparse(opts, varargin);
 36 | 
 37 | % fetch is true if images is a list of filenames (instead of
 38 | % a cell array of images)
 39 | fetch = numel(images) >= 1 && ischar(images{1}) ;
 40 | 
 41 | % prefetch is used to load images in a separate thread
 42 | prefetch = fetch & opts.prefetch ;
 43 | 
 44 | if prefetch
 45 |   vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
 46 |   imo = [] ;
 47 |   return ;
 48 | end
 49 | if fetch
 50 |   im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
 51 | else
 52 |   im = images ;
 53 | end
 54 | 
 55 | % columns of tfs are candidate [vertical offset; horizontal offset; flip]
 56 | tfs = [] ;
 57 | switch opts.transformation
 58 |   case 'none'
 59 |     tfs = [
 60 |       .5 ;
 61 |       .5 ;
 62 |       0 ] ;
 63 |   case 'f5'
 64 |     tfs = [...
 65 |       .5 0 0 1 1 .5 0 0 1 1 ;
 66 |       .5 0 1 0 1 .5 0 1 0 1 ;
 67 |       0 0 0 0 0  1 1 1 1 1] ;
 68 |   case 'f25'
 69 |     [tx,ty] = meshgrid(linspace(0,1,5)) ;
 70 |     tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
 71 |     tfs_ = tfs ;
 72 |     tfs_(3,:) = 1 ;
 73 |     tfs = [tfs,tfs_] ;
 74 |   case 'stretch'
 75 |   case 'multiScaleRegular'
 76 |   otherwise
 77 |     error('Uknown transformations %s', opts.transformation) ;
 78 | end
 79 | [~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;
 80 | 
 81 | if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
 82 |   opts.averageImage = zeros(1,1,2) ;
 83 | end
 84 | if numel(opts.averageImage) == 2
 85 |   opts.averageImage = reshape(opts.averageImage, 1,1,2) ;
 86 | end
 87 | 
 88 | imo = zeros(opts.imageSize(1), opts.imageSize(2), 2, ...
 89 |   numel(images)/2*opts.numAugments, 'single') ;
 90 | 
 91 | nVid = max(vids);
 92 | si = 1 ;
 93 | for v=1:nVid
 94 |   
 95 |   vid = find(vids==v);
 96 |   
 97 |   for i=1:numel(vid)
 98 |     
 99 |     % acquire the two flow components of frame vid(i); i only counts
100 |     % frames inside the current video and must not index im/images
101 |     uIdx = 2*vid(i)-1 ;
102 |     vIdx = 2*vid(i) ;
103 |     if isempty(im{uIdx}) || isempty(im{vIdx})
104 |       imt1 = imread(images{uIdx}) ;
105 |       imt2 = imread(images{vIdx}) ;
106 |     else
107 |       imt1 = im{uIdx} ;
108 |       imt2 = im{vIdx} ;
109 |     end
110 |     imt = single(cat(3,imt1,imt2)) ; % faster than im2single (and multiplies by 255)
111 |  
112 |     % resize
113 |     w = size(imt,2) ;
114 |     h = size(imt,1) ;
115 |     factor = [(opts.imageSize(1)+opts.border(1))/h ...
116 |       (opts.imageSize(2)+opts.border(2))/w];
117 |     
118 |     if opts.keepAspect
119 |       factor = max(factor) ;
120 |     end
121 |     if any(abs(factor - 1) > 0.0001)
122 |       imt = imresize(imt, ...
123 |         'scale', factor, ...
124 |         'method', opts.interpolation) ;
125 |     end
126 |     
127 |     % crop & flip: drawn on the first frame and reused for the video
128 |     % NOTE(review): the flip is random for every transformation,
129 |     % including 'none'; the third row of tfs is ignored - confirm.
130 |     if i==1
131 |       flip = rand > 0.5 ;
132 |       w = size(imt,2) ;
133 |       h = size(imt,1) ;
134 |       switch opts.transformation
135 |         case 'stretch'
136 |           sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
137 |           dx = randi(w - sz(2) + 1, 1) ;
138 |           dy = randi(h - sz(1) + 1, 1) ;
139 |         case 'multiScaleRegular'
140 |           reg_szs = [256, 224, 192, 168] ;          
141 |           sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4));
142 |  
143 |           % four corners and the centre
144 |           dy = [0 h-sz(1) 0 h-sz(1)  floor((h-sz(1)+1)/2)] + 1;
145 |           dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
146 |           corner = randi(5);
147 |           dx = dx(corner); dy = dy(corner); 
148 |         otherwise
149 |           % the first entry of the random permutation picks the column
150 |           tf = tfs(:, transformations(1)) ;
151 |           sz = opts.imageSize(1:2) ;
152 |           dx = floor((w - sz(2)) * tf(2)) + 1 ;
153 |           dy = floor((h - sz(1)) * tf(1)) + 1 ;
154 |       end
155 |       
156 |     end
157 |     if opts.lazyResize
158 |       sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
159 |       sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
160 |     else
161 |       % always apply the crop, so that the offsets are honoured even when
162 |       % the crop already has the output size and no rescaling is needed
163 |       imt = imt(dy:sz(1)+dy-1, dx:sz(2)+dx-1, :) ;
164 |       factor = [opts.imageSize(1)/sz(1) ...
165 |                   opts.imageSize(2)/sz(2)];
166 |       if any(abs(factor - 1) > 0.0001)
167 |         imt = imresize(gather(imt), opts.imageSize(1:2), ...
168 |           'Antialiasing', false, 'Method', opts.interpolation);
169 |       end
170 |       sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1);
171 |     end
172 |     if flip
173 |       sx = fliplr(sx) ; 
174 |       imo(:,:,1,si) = 255 - imt(sy,sx,1) ;
175 |       imo(:,:,2,si) = imt(sy,sx,2) ;
176 |     else 
177 |       imo(:,:,:,si) = imt(sy,sx,:) ;
178 |     end
179 |     si = si + 1 ;
180 |   end
181 | end
182 | 
183 | if ~isempty(opts.averageImage) && numel(opts.averageImage)==2
184 |   if ~isempty(opts.rgbVariance)
185 |     % the random offset has two channels, like the flow frames
186 |     imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(2,1), 1,1,2)) ;
187 |   else
188 |     imo = bsxfun(@minus, imo, opts.averageImage) ;
189 |   end
190 | end
191 | 


--------------------------------------------------------------------------------
/dicnn/cnn_single_of.m:
--------------------------------------------------------------------------------
  1 | function [net, info] = cnn_single_of(varargin)
  2 | %CNN_SINGLE_OF Fine-tunes a pre-trained CNN with static optical flow
  3 | % (OF in pami journal) on UCF101 dataset
  4 | %
  5 | %   [NET, INFO] = CNN_SINGLE_OF('option', value, ...) loads the model at
  6 | %   'modelPath', adapts it with 'networkFn', trains it with
  7 | %   CNN_TRAIN_DICNN_DAG and prints the per-class and mean accuracy read
  8 | %   from the 'errMC' layer. NET and INFO are the trainer's outputs.
  9 | %
 10 | %   'dataDir', 'expDir' and 'modelPath' are parsed first; all remaining
 11 | %   options (including the nested 'train' struct) are parsed afterwards.
 12 | 
 13 | run(fullfile(fileparts(mfilename('fullpath')), ...
 14 |   '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
 15 | 
 16 | addpath Layers Datasets
 17 | 
 18 | opts.dataDir = fullfile('data','UCF101') ;
 19 | opts.expDir  = fullfile('exp', 'UCF101') ;
 20 | % single '.mat' extension, as used by the RGB training scripts
 21 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
 22 | [opts, varargin] = vl_argparse(opts, varargin) ;
 23 | 
 24 | opts.numFetchThreads = 8 ;
 25 | 
 26 | opts.lite = false ;
 27 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat');
 28 | 
 29 | opts.DropOutRate = 0.85 ;
 30 | opts.datasetFn = @cnn_ucf101_of_setup_data ;
 31 | % initialiser defined in dicnn/cnn_init_resnext.m
 32 | opts.networkFn = @cnn_init_resnext ;
 33 | 
 34 | opts.split = 1; % data split
 35 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
 36 | opts.numDynImgs = 10 ;
 37 | opts.epochFactor = 5 ;
 38 | opts.pool1Layer = 'conv0'; % before conv1
 39 | opts.pool1Type = 'none' ;
 40 | opts.pool2Layer = 'fc6' ;
 41 | 
 42 | opts.train = struct() ;
 43 | opts.train.gpus = [];
 44 | opts.train.batchSize = 128 ;
 45 | opts.train.numSubBatches = 32 ;
 46 | opts.train.solver = [] ;
 47 | opts.train.prefetch = true ;
 48 | opts.train.learningRate = 1e-2 ;
 49 | opts.train.numEpochs = 30 ;
 50 | 
 51 | opts = vl_argparse(opts, varargin) ;
 52 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;
 53 | 
 54 | 
 55 | % -------------------------------------------------------------------------
 56 | %                                                              Prepare data
 57 | % -------------------------------------------------------------------------
 58 | 
 59 | % reuse the cached imdb when present, otherwise build it and cache it
 60 | if exist(opts.imdbPath,'file')
 61 |   imdb = load(opts.imdbPath) ;
 62 | else
 63 |   imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
 64 |   mkdir(opts.expDir) ;
 65 |   save(opts.imdbPath, '-struct', 'imdb') ;
 66 | end
 67 | 
 68 | % UCF101 has 3 data splits
 69 | if opts.split>3
 70 |   error('split should be <=3');
 71 | end
 72 | imdb.images.set = imdb.images.sets(opts.split,:);
 73 | 
 74 | % reverse frame order. The names of a video are interleaved (u,v) flow
 75 | % files (odd entries u, even entries v, see getDagNNBatch), so whole pairs
 76 | % are reversed; flipping the list element-wise would swap the channels.
 77 | if opts.reverseDyn
 78 |   for i=1:numel(imdb.images.names)
 79 |     frames = imdb.images.names{i} ;
 80 |     nPairs = floor(numel(frames)/2) ;
 81 |     order = [2*(nPairs:-1:1)-1 ; 2*(nPairs:-1:1)] ;
 82 |     imdb.images.names{i} = frames(order(:)') ;
 83 |   end
 84 | end
 85 | % -------------------------------------------------------------------------
 86 | %                                                             Prepare model
 87 | % -------------------------------------------------------------------------
 88 | net = load(opts.modelPath);
 89 | % the model file may wrap the network in a 'net' field
 90 | if isfield(net,'net')
 91 |   net = net.net;
 92 | end
 93 | opts.nCls = max(imdb.images.label) ;
 94 | % net = dagnn.DagNN.loadobj(net) ;
 95 | net = opts.networkFn(net,opts) ;
 96 | 
 97 | % two channels instead of 3 RGB
 98 | net.params(1).value = net.params(1).value(:,:,1:2,:) ; 
 99 | 
100 | % Set the class names in the network
101 | net.meta.classes.name = imdb.classes.name ;
102 | net.meta.classes.description = imdb.classes.name ;
103 | 
104 | % -------------------------------------------------------------------------
105 | %                                                                     Learn
106 | % -------------------------------------------------------------------------
107 | % the training set is repeated epochFactor times per epoch
108 | if opts.epochFactor>0
109 |   opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
110 | else
111 |   opts.train.train = NaN ;
112 | end
113 | opts.train.val = find(imdb.images.set==3) ;
114 | 
115 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
116 |                       'expDir', opts.expDir, ...
117 |                       opts.train) ;
118 | 
119 | % -------------------------------------------------------------------------
120 | %                                                          Report accuracy
121 | % -------------------------------------------------------------------------
122 | errlayer = net.getLayerIndex('errMC') ;
123 | 
124 | if ~isnan(errlayer)
125 |   cats = imdb.classes.name ;
126 |   accs = net.layers(errlayer).block.accuracy ; 
127 |   
128 |   if numel(cats)~=numel(accs)
129 |     % no escape sequence: error() with one argument prints it literally
130 |     error('wrong number of classes') ;
131 |   end
132 |   
133 |   for i=1:numel(cats)
134 |     fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
135 |   end
136 |   fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
137 | end
119 | 
120 | % -------------------------------------------------------------------------
121 | function fn = getBatchFn(opts, meta)
122 | % -------------------------------------------------------------------------
123 | %GETBATCHFN Create the batch-loading closure handed to the trainer
124 | %   FN = GETBATCHFN(OPTS, META) returns a handle @(imdb,batch) that calls
125 | %   getDagNNBatch with the loader options assembled below. The border is
126 | %   taken from META.normalization.border when present, otherwise it is
127 | %   derived from imageSize and cropSize.
128 | useGpu = numel(opts.train.gpus) > 0 ;
129 | 
130 | bopts.numThreads = opts.numFetchThreads ;
131 | bopts.imageSize = meta.normalization.imageSize ;
132 | if isfield(meta.normalization,'border')
133 |   bopts.border = meta.normalization.border ;  
134 | else
135 |   bopts.border = meta.normalization.imageSize(1:2) ./ ...
136 |     meta.normalization.cropSize - meta.normalization.imageSize(1:2);
137 | end
138 | 
139 | % constant mean of 128 for each of the two flow channels
140 | bopts.averageImage = 128 * ones([1 1 2],'single') ;
141 | bopts.numDynImgs = opts.numDynImgs ;
142 | % bopts.averageImage = meta.normalization.averageImage ;
143 | % bopts.rgbVariance = meta.augmentation.rgbVariance ;
144 | % bopts.transformation = meta.augmentation.transformation ;
145 | % default only; getDagNNBatch passes 'transformation' explicitly per batch
146 | bopts.transformation = 'multiScaleRegular' ;
147 | 
148 | fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;
143 | 
144 | 
145 | 
146 | % -------------------------------------------------------------------------
147 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
148 | % -------------------------------------------------------------------------
149 | %GETDAGNNBATCH Build the DagNN inputs for a batch of videos
150 | %   BATCH indexes videos (not frames). Every STEPSIZE-th frame of a video
151 | %   may contribute one sample made of its (u,v) flow image pair; VideoId2
152 | %   records, per sample, the position of its video inside BATCH.
153 | %   INPUTS is only assigned when an output is requested; otherwise the
154 | %   loader is just called with 'prefetch' enabled.
155 | 
156 | % empty batch: return the same input names as a regular batch
157 | if isempty(batch)
158 |   inputs = {'input', [], 'label', [], 'VideoId2', []};
159 |   return;
160 | end
161 | 
162 | % batch is non-empty here; its first video decides train vs. validation
163 | isVal = imdb.images.set(batch(1)) ~= 1 ;
164 | 
165 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
166 | 
167 | names = imdb.images.names(batch);
168 | 
169 | namesM = {};
170 | nVids = numel(batch);
171 | 
172 | VideoId1 = [];
173 | VideoId2 = [];
174 | 
175 | % step-size
176 | stepSize = 6;
177 | % frames per sample (a single flow frame here)
178 | nFrames = 1;
179 | % maximum number of samples kept per training video
180 | nDynImgs = opts.numDynImgs ;
181 | % numDynImgs is consumed here and removed before opts reaches the loader
182 | opts = rmfield(opts,'numDynImgs') ;
183 | 
184 | 
185 | c1 = 1; % running sample id across the whole batch
186 | for v=1:nVids
187 |   
188 |   name = names{v};
189 |   % names are interleaved (u,v) files, i.e. two entries per frame
190 |   nFrms = numel(name)/2;
191 | 
192 |   nSample = nFrames;
193 |   nr = numel(1:stepSize:nFrms);
194 |   
195 |   % jitter by removing 50 % and limit a batch to nMaxs * nSamples images
196 |   if nr > 1 && (~isVal && nr>nDynImgs)
197 |     rat = min(nDynImgs,ceil(0.50*nr));
198 |     ri = randperm(nr);
199 |     ri = ri(1:rat);
200 |     r = zeros(1,nr);
201 |     r(ri) = 1;
202 |   else
203 |     r = ones(1,nr);
204 |   end
205 |   
206 |   c3 = 1; % index of the current candidate start frame in r
207 |   c2 = 0; % number of samples kept for this video
208 |   
209 |   for f=1:stepSize:nFrms
210 |     if r(c3)
211 |       idx = f:min(f+nSample-1,nFrms) ;
212 |       % pad with the last frame when the video ends early
213 |       if numel(idx)<nFrames
214 |         idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
215 |       end
216 |       % frame k maps to entries 2k-1 (u) and 2k (v) of the name list
217 |       idxu = 2*idx - 1;
218 |       idxv = 2*idx;
219 |       idxuv = zeros(1,2 * numel(idxu)) ;
220 |       idxuv(1:2:end) = idxu ;
221 |       idxuv(2:2:end) = idxv ;
222 |             
223 |       namesM{end+1} = name(idxuv);
224 |       VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
225 |       c1 = c1 + 1;
226 |       c2 = c2 + 1;
227 |     end
228 |     c3 = c3 + 1;
229 |   end
230 |   VideoId2 = [VideoId2 v * ones(1,c2) ] ;
231 | end
232 | 
233 | images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
234 | 
235 | im = cnn_video_of_get_batch(images, VideoId1, opts, ...
236 |   'transformation', transformation, 'prefetch', nargout == 0, ...
237 |   'subMean', false) ;
238 | 
239 | if nargout > 0
240 |   if useGpu
241 |     im = gpuArray(im) ;
242 |   end
243 |   inputs = {'input', im, 'label', imdb.images.label(batch), ...
244 |     'VideoId2', VideoId2};
245 | 
246 | end
239 | 


--------------------------------------------------------------------------------
/dicnn/cnn_single_rgb.m:
--------------------------------------------------------------------------------
  1 | function [net, info] = cnn_single_rgb(varargin)
  2 | %CNN_SINGLE_RGB Fine-tunes a pre-trained CNN with static RGB frames
  3 | % (SI in pami journal) on UCF101 dataset
  4 | %
  5 | %   [NET, INFO] = CNN_SINGLE_RGB('option', value, ...) loads the model at
  6 | %   'modelPath', adapts it with 'networkFn', trains it with
  7 | %   CNN_TRAIN_DICNN_DAG and prints the per-class and mean accuracy read
  8 | %   from the 'errMC' layer. NET and INFO are the trainer's outputs.
  9 | 
 10 | run(fullfile(fileparts(mfilename('fullpath')), ...
 11 |   '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
 12 | 
 13 | addpath Layers Datasets
 14 | 
 15 | opts.dataDir = fullfile('data','UCF101') ;
 16 | opts.expDir  = fullfile('exp', 'UCF101') ;
 17 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat');
 18 | opts.datasetFn = @cnn_ucf101_setup_data ;
 19 | opts.networkFn = @cnn_init_resnext ;
 20 | opts.pool1Type = 'none' ;
 21 | opts.pool1Layer = 'conv1' ;
 22 | opts.pool2Layer = '' ;
 23 | [opts, varargin] = vl_argparse(opts, varargin) ;
 24 | 
 25 | opts.numFetchThreads = 8 ;
 26 | 
 27 | opts.lite = false ;
 28 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat');
 29 | opts.ARPoolLayer = 'conv0'; % before conv1
 30 | opts.DropOutRate = 0.5 ;
 31 | opts.epochFactor = 5 ;
 32 | 
 33 | opts.split = 1; % data split
 34 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
 35 | opts.train = struct() ;
 36 | opts.train.gpus = [];
 37 | opts.train.batchSize = 128 ;
 38 | opts.train.numSubBatches = 16 ;
 39 | opts.train.solver = [] ;
 40 | opts.train.prefetch = true ;
 41 | opts.train.numEpochs = 30 ;
 42 | % resnet50 (alternative schedule, kept for reference)
 43 | % opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
 44 | % caffe-ref (the schedule that is actually in effect)
 45 | opts.train.learningRate = 1e-4 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
 46 | 
 47 | opts = vl_argparse(opts, varargin) ;
 48 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;
 49 | % opts.train.numEpochs = numel(opts.train.learningRate);
 50 | 
 51 | % -------------------------------------------------------------------------
 52 | %                                                              Prepare data
 53 | % -------------------------------------------------------------------------
 54 | 
 55 | % reuse the cached imdb when present, otherwise build it and cache it
 56 | if exist(opts.imdbPath,'file')
 57 |   imdb = load(opts.imdbPath) ;
 58 | else
 59 |   imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
 60 |   mkdir(opts.expDir) ;
 61 |   save(opts.imdbPath, '-struct', 'imdb') ;
 62 | end
 63 | 
 64 | % UCF101 has 3 data splits
 65 | if opts.split>3
 66 |   error('split should be <=3');
 67 | end
 68 | imdb.images.set = imdb.images.sets(opts.split,:);
 69 | 
 70 | % reverse frame order
 71 | if opts.reverseDyn
 72 |   for i=1:numel(imdb.images.names)
 73 |     imdb.images.names{i} = imdb.images.names{i}(end:-1:1);
 74 |   end
 75 | end
 76 | 
 77 | % -------------------------------------------------------------------------
 78 | %                                                             Prepare model
 79 | % -------------------------------------------------------------------------
 80 | net = load(opts.modelPath);
 81 | % the model file may wrap the network in a 'net' field
 82 | if isfield(net,'net')
 83 |   net = net.net;
 84 | end
 85 | opts.nCls = max(imdb.images.label) ;
 86 | net = opts.networkFn(net,opts);
 87 | 
 88 | % collapse a full-size mean image to one mean value per channel
 89 | if numel(net.meta.normalization.averageImage)>3
 90 |   sz = size(net.meta.normalization.averageImage) ;
 91 |   net.meta.normalization.averageImage = ...
 92 |     mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ;
 93 | end
 94 | 
 95 | % Set the class names in the network
 96 | net.meta.classes.name = imdb.classes.name ;
 97 | net.meta.classes.description = imdb.classes.name ;
 98 | % -------------------------------------------------------------------------
 99 | %                                                                     Learn
100 | % -------------------------------------------------------------------------
101 | % the training set is repeated epochFactor times per epoch
102 | if opts.epochFactor>0
103 |   opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
104 | else
105 |   opts.train.train = NaN ;
106 | end
107 | opts.train.val = find(imdb.images.set==3) ;
108 | 
109 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
110 |                       'expDir', opts.expDir, ...
111 |                       opts.train) ;
112 | 
113 | % -------------------------------------------------------------------------
114 | %                                                          Report accuracy
115 | % -------------------------------------------------------------------------
116 | errlayer = net.getLayerIndex('errMC') ;
117 | 
118 | if ~isnan(errlayer)
119 |   cats = imdb.classes.name ;
120 |   accs = net.layers(errlayer).block.accuracy ; 
121 |   
122 |   if numel(cats)~=numel(accs)
123 |     % no escape sequence: error() with one argument prints it literally
124 |     error('wrong number of classes') ;
125 |   end
126 |   
127 |   for i=1:numel(cats)
128 |     fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
129 |   end
130 |   fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
131 | end
123 | 
124 | % -------------------------------------------------------------------------
125 | function fn = getBatchFn(opts, meta)
126 | % -------------------------------------------------------------------------
127 | %GETBATCHFN Create the batch-loading closure handed to the trainer
128 | %   FN = GETBATCHFN(OPTS, META) returns a handle @(imdb,batch) that calls
129 | %   getDagNNBatch with loader options copied from META.normalization.
130 | useGpu = ~isempty(opts.train.gpus) ;
131 | nrm = meta.normalization ;
132 | 
133 | loaderOpts.numThreads = opts.numFetchThreads ;
134 | loaderOpts.imageSize = nrm.imageSize ;
135 | 
136 | % fall back to a border derived from the crop ratio when none is stored
137 | if ~isfield(nrm,'border')
138 |   loaderOpts.border = nrm.imageSize(1:2) ./ nrm.cropSize - nrm.imageSize(1:2) ;
139 | else
140 |   loaderOpts.border = nrm.border ;
141 | end
142 | 
143 | loaderOpts.averageImage = nrm.averageImage ;
144 | loaderOpts.interpolation = nrm.interpolation ;
145 | loaderOpts.keepAspect = nrm.keepAspect ;
146 | 
147 | fn = @(x,y) getDagNNBatch(loaderOpts,useGpu,x,y) ;
148 | 
149 | 
150 | 
151 | % -------------------------------------------------------------------------
152 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
153 | % -------------------------------------------------------------------------
154 | %GETDAGNNBATCH Build the DagNN inputs for a batch of videos
155 | %   BATCH indexes videos (not frames). Every STEPSIZE-th frame of a video
156 | %   may contribute one single-frame sample; VideoId2 records, per sample,
157 | %   the position of its video inside BATCH.
158 | %   INPUTS is only assigned when an output is requested; otherwise the
159 | %   loader is just called with 'prefetch' enabled.
160 | 
161 | % empty batch: return the same input names as a regular batch
162 | if isempty(batch)
163 |   inputs = {'input', [], 'label', [], 'VideoId2', []};
164 |   return;
165 | end
166 | 
167 | % batch is non-empty here; its first video decides train vs. validation
168 | isVal = imdb.images.set(batch(1)) ~= 1 ;
169 | 
170 | % if ~isVal, transformation='stretch'; else transformation='none';end
171 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
172 | 
173 | names = imdb.images.names(batch);
174 | 
175 | namesM = {};
176 | nVids = numel(batch);
177 | 
178 | VideoId1 = [];
179 | VideoId2 = [];
180 | 
181 | % step-size
182 | stepSize = 6;
183 | % frames per sample (a single RGB frame here)
184 | nFrames = 1;
185 | % maximum number of samples kept per training video
186 | nDynImgs = 10;
187 | 
188 | 
189 | c1 = 1; % running sample id across the whole batch
190 | for v=1:nVids
191 |   
192 |   name = names{v};
193 |   nFrms = numel(name);
194 | 
195 |   nSample = nFrames;
196 |   nr = numel(1:stepSize:nFrms);
197 |   
198 |   % jitter by removing 50 % and limit a batch to nMaxs * nSamples images
199 |   if nr > 1 && (~isVal && nr>nDynImgs)
200 |     rat = min(nDynImgs,ceil(0.50*nr));
201 |     ri = randperm(nr);
202 |     ri = ri(1:rat);
203 |     r = zeros(1,nr);
204 |     r(ri) = 1;
205 |   else
206 |     r = ones(1,nr);
207 |   end
208 |   
209 |   c3 = 1; % index of the current candidate start frame in r
210 |   c2 = 0; % number of samples kept for this video
211 |   
212 |   for f=1:stepSize:nFrms
213 |     if r(c3)
214 |       idx = f:min(f+nSample-1,nFrms) ;
215 |       % pad with the last frame when the video ends early
216 |       if numel(idx)<nFrames
217 |         idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
218 |       end
219 |       namesM{end+1} = name(idx);
220 |       VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
221 |       c1 = c1 + 1;
222 |       c2 = c2 + 1;
223 |     end
224 |     c3 = c3 + 1;
225 |   end
226 |   VideoId2 = [VideoId2 v * ones(1,c2) ] ;
227 | end
228 | 
229 | images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
230 | 
231 | im = cnn_video_rgb_get_batch(images, VideoId1, opts, ...
232 |   'transformation', transformation, 'prefetch', nargout == 0, ...
233 |   'subMean', false) ;
234 | 
235 | if nargout > 0
236 |   if useGpu
237 |     im = gpuArray(im) ;
238 |   end
239 |   inputs = {'input', im, 'label', imdb.images.label(batch), ...
240 |     'VideoId2', VideoId2};
241 | end
237 | 


--------------------------------------------------------------------------------
/dicnn/cnn_dicnn_of.m:
--------------------------------------------------------------------------------
  1 | function [net, info] = cnn_dicnn_of(varargin)
  2 | %CNN_DICNN_OF Fine-tunes a pre-trained CNN with dynamic images on optical
  3 | % (DOF in pami journal) flow frames on UCF101 dataset
  4 | %
  5 | %   [NET, INFO] = CNN_DICNN_OF('option', value, ...) trains with
  6 | %   CNN_TRAIN_DICNN_DAG and prints the per-class and mean accuracy read
  7 | %   from the 'errMC' layer. NET and INFO are the trainer's outputs.
  8 | %
  9 | %   When the 'network' option is empty the model at 'modelPath' is loaded
 10 | %   and adapted with 'networkFn'; otherwise 'network' must be a
 11 | %   dagnn.DagNN object and is used as is.
 12 | 
 13 | 
 14 | run(fullfile(fileparts(mfilename('fullpath')), ...
 15 |   '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
 16 | 
 17 | run(fullfile(fileparts(mfilename('fullpath')), ...
 18 |   '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ;
 19 | 
 20 | run(fullfile(fileparts(mfilename('fullpath')), ...
 21 |   '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ;
 22 | 
 23 | addpath Layers Datasets
 24 | 
 25 | opts.dataDir = fullfile('data','UCF101') ;
 26 | opts.expDir  = fullfile('exp', 'UCF101') ;
 27 | % single '.mat' extension, as used by the RGB training scripts
 28 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
 29 | [opts, varargin] = vl_argparse(opts, varargin) ;
 30 | 
 31 | opts.numFetchThreads = 8 ;
 32 | 
 33 | opts.lite = false ;
 34 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat');
 35 | opts.pool1Layer = 'conv0'; % before conv1
 36 | opts.pool1Type = 'arpool'; % before conv1
 37 | opts.pool2Layer = 'fc6'; % before conv1
 38 | opts.DropOutRate = 0.85 ;
 39 | opts.datasetFn = @cnn_ucf101_of_setup_data ;
 40 | opts.networkFn = @cnn_init_resnext ;
 41 | opts.network = [] ;
 42 | 
 43 | opts.split = 1; % data split
 44 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
 45 | opts.numDynImgs = 10 ;
 46 | opts.epochFactor = 5 ;
 47 | 
 48 | opts.train = struct() ;
 49 | opts.train.gpus = [];
 50 | opts.train.batchSize = 128 ;
 51 | opts.train.numSubBatches = 32 ;
 52 | opts.train.solver = [] ;
 53 | opts.train.prefetch = true ;
 54 | opts.train.learningRate = 1e-2 ;
 55 | opts.train.numEpochs = 30 ;
 56 | % opts.train.savePreds = true ;
 57 | opts.train.randomSeed = 0 ;
 58 | 
 59 | opts = vl_argparse(opts, varargin) ;
 60 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;
 61 | 
 62 | 
 63 | % -------------------------------------------------------------------------
 64 | %                                                              Prepare data
 65 | % -------------------------------------------------------------------------
 66 | 
 67 | % reuse the cached imdb when present, otherwise build it and cache it
 68 | if exist(opts.imdbPath,'file')
 69 |   imdb = load(opts.imdbPath) ;
 70 | else
 71 |   imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
 72 |   mkdir(opts.expDir) ;
 73 |   save(opts.imdbPath, '-struct', 'imdb') ;
 74 | end
 75 | 
 76 | % UCF101 has 3 data splits
 77 | if opts.split>3
 78 |   error('split should be <=3');
 79 | end
 80 | imdb.images.set = imdb.images.sets(opts.split,:);
 81 | 
 82 | % reverse frame order. The names of a video are interleaved (u,v) flow
 83 | % files (odd entries u, even entries v, see getDagNNBatch), so whole pairs
 84 | % are reversed; flipping the list element-wise would swap the channels.
 85 | if opts.reverseDyn
 86 |   for i=1:numel(imdb.images.names)
 87 |     frames = imdb.images.names{i} ;
 88 |     nPairs = floor(numel(frames)/2) ;
 89 |     order = [2*(nPairs:-1:1)-1 ; 2*(nPairs:-1:1)] ;
 90 |     imdb.images.names{i} = frames(order(:)') ;
 91 |   end
 92 | end
 93 | % -------------------------------------------------------------------------
 94 | %                                                             Prepare model
 95 | % -------------------------------------------------------------------------
 96 | if isempty(opts.network)
 97 |   net = load(opts.modelPath);
 98 |   % the model file may wrap the network in a 'net' field
 99 |   if isfield(net,'net')
100 |     net = net.net;
101 |   end
102 |   opts.nCls = max(imdb.images.label) ;
103 |   % net = dagnn.DagNN.loadobj(net) ;
104 |   net = opts.networkFn(net,opts) ;
105 |   
106 |   % two channels instead of 3 RGB
107 |   net.params(1).value = net.params(1).value(:,:,1:2,:) ;
108 |   
109 |   % Set the class names in the network
110 |   net.meta.classes.name = imdb.classes.name ;
111 |   net.meta.classes.description = imdb.classes.name ;
112 | else
113 |   assert(isa(opts.network,'dagnn.DagNN')) ;
114 |   net = opts.network ;
115 | end
116 | 
117 | % -------------------------------------------------------------------------
118 | %                                                                     Learn
119 | % -------------------------------------------------------------------------
120 | % the training set is repeated epochFactor times per epoch; a
121 | % non-positive epochFactor disables the training set and runs one epoch
122 | if opts.epochFactor>0
123 |   opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
124 | else
125 |   opts.train.train = NaN ;
126 |   opts.train.numEpochs = 1 ;
127 | end
128 | opts.train.val = find(imdb.images.set==3) ;
129 | 
130 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
131 |                       'expDir', opts.expDir, ...
132 |                       opts.train) ;
133 | 
134 | 
135 | % -------------------------------------------------------------------------
136 | %                                                          Report accuracy
137 | % -------------------------------------------------------------------------
138 | errlayer = net.getLayerIndex('errMC') ;
139 | 
140 | if ~isnan(errlayer)
141 |   cats = imdb.classes.name ;
142 |   accs = net.layers(errlayer).block.accuracy ; 
143 |   
144 |   if numel(cats)~=numel(accs)
145 |     % no escape sequence: error() with one argument prints it literally
146 |     error('wrong number of classes') ;
147 |   end
148 |   
149 |   for i=1:numel(cats)
150 |     fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
151 |   end
152 |   fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
153 | end
135 | % -------------------------------------------------------------------------
136 | function fn = getBatchFn(opts, meta)
137 | % -------------------------------------------------------------------------
138 | %GETBATCHFN Create the batch-loading closure handed to the trainer
139 | %   FN = GETBATCHFN(OPTS, META) returns a handle @(imdb,batch) that calls
140 | %   getDagNNBatch with the loader options assembled below.
141 | useGpu = ~isempty(opts.train.gpus) ;
142 | nrm = meta.normalization ;
143 | 
144 | loaderOpts.numThreads = opts.numFetchThreads ;
145 | loaderOpts.imageSize = nrm.imageSize ;
146 | 
147 | % fall back to a border derived from the crop ratio when none is stored
148 | if ~isfield(nrm,'border')
149 |   loaderOpts.border = nrm.imageSize(1:2) ./ nrm.cropSize - nrm.imageSize(1:2) ;
150 | else
151 |   loaderOpts.border = nrm.border ;
152 | end
153 | 
154 | % constant mean of 128 for each of the two flow channels
155 | loaderOpts.averageImage = repmat(single(128), [1 1 2]) ;
156 | loaderOpts.numDynImgs = opts.numDynImgs ;
157 | 
158 | fn = @(x,y) getDagNNBatch(loaderOpts,useGpu,x,y) ;
153 | 
154 | 
155 | 
156 | % -------------------------------------------------------------------------
157 | function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
158 | % -------------------------------------------------------------------------
159 | %GETDAGNNBATCH Build the DagNN inputs for a batch of videos
160 | %   BATCH indexes videos (not frames). Each video is cut into clips of
161 | %   NFRAMES flow frames starting every STEPSIZE frames. VideoId1 assigns
162 | %   each frame to its clip and VideoId2 assigns each clip to the position
163 | %   of its video inside BATCH.
164 | %   INPUTS is only assigned when an output is requested; otherwise the
165 | %   loader is just called with 'prefetch' enabled.
166 | 
167 | % batch refers to videos (not for frames)
168 | if isempty(batch)
169 |   inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};
170 |   return;
171 | end
172 | 
173 | % batch is non-empty here; its first video decides train vs. validation
174 | isVal = imdb.images.set(batch(1)) ~= 1 ;
175 | 
176 | if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
177 | 
178 | names = imdb.images.names(batch);
179 | 
180 | namesM = {};
181 | nVids = numel(batch);
182 | 
183 | VideoId1 = [];
184 | VideoId2 = [];
185 | 
186 | % step-size
187 | stepSize = 6;
188 | 
189 | % pool nFrames into a dynamic image
190 | nFrames = 10;
191 | % number of dynamic images to be max pooled later
192 | nDynImgs = opts.numDynImgs ;
193 | % numDynImgs is consumed here and removed before opts reaches the loader
194 | opts = rmfield(opts,'numDynImgs') ;
195 | 
196 | 
197 | c1 = 1; % running clip id across the whole batch
198 | for v=1:nVids
199 |   
200 |   name = names{v};
201 |   % two entries (u and v) per frame; an unpaired trailing file is ignored
202 |   nFrms = floor(numel(name)/2);
203 | 
204 |   nSample = nFrames;
205 |   
206 |   if isVal
207 |     startF = 1 ;
208 |   else
209 |     startF = ceil(stepSize/2) ;
210 |   end
211 |   % clamp the offset so a video shorter than it still yields one clip;
212 |   % otherwise the video would be missing from VideoId2 while its label
213 |   % is still returned
214 |   startF = max(1, min(startF, nFrms)) ;
215 |   nr = numel(startF:stepSize:nFrms);
216 |   
217 |   % jitter by removing 50 % and limit a batch to nMaxs * nSamples images
218 |   if nr > 1 && (~isVal && nr>nDynImgs)
219 |     rat = min(nDynImgs,ceil(0.50*nr));
220 |     ri = randperm(nr);
221 |     ri = ri(1:rat);
222 |     r = zeros(1,nr);
223 |     r(ri) = 1;
224 |   else
225 |     r = ones(1,nr);
226 |   end
227 |   
228 |   c3 = 1; % index of the current candidate start frame in r
229 |   c2 = 0; % number of clips kept for this video
230 |   
231 |   for f=startF:stepSize:nFrms
232 |     if r(c3)
233 |       idx = f:min(f+nSample-1,nFrms) ;
234 |       % pad with the last frame when the video ends before the clip does
235 |       if numel(idx)<nFrames
236 |         idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
237 |       end
238 |       % frame k maps to entries 2k-1 (u) and 2k (v) of the name list
239 |       idxu = 2*idx - 1;
240 |       idxv = 2*idx;
241 |       idxuv = zeros(1,2 * numel(idxu)) ;
242 |       idxuv(1:2:end) = idxu ;
243 |       idxuv(2:2:end) = idxv ;
244 |             
245 |       namesM{end+1} = name(idxuv);
246 |       VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
247 |       c1 = c1 + 1;
248 |       c2 = c2 + 1;
249 |     end
250 |     c3 = c3 + 1;
251 |   end
252 |   VideoId2 = [VideoId2 v * ones(1,c2) ] ;
253 | end
254 | 
255 | images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
256 | 
257 | im = cnn_video_of_get_batch(images, VideoId1, opts, ...
258 |   'transformation', transformation, 'prefetch', nargout == 0) ;
259 | 
260 | if nargout > 0
261 |   if useGpu
262 |     im = gpuArray(im) ;
263 |   end
264 |   inputs = {'input', im, 'label', imdb.images.label(batch), ...
265 |     'VideoId1', VideoId1, 'VideoId2', VideoId2};
266 | 
267 | end
255 | 


--------------------------------------------------------------------------------
/dicnn/cnn_dicnn_rgb.m:
--------------------------------------------------------------------------------
  1 | function [net, info] = cnn_dicnn_rgb(varargin)
  2 | %CNN_DICNN_RGB Fine-tunes a pre-trained CNN with dynamic images on RGB frames
  3 | % (DI in pami journal) on UCF101 dataset
  4 | %
  5 | %   [NET, INFO] = CNN_DICNN_RGB('option', value, ...) trains with
  6 | %   CNN_TRAIN_DICNN_DAG and prints the per-class and mean accuracy read
  7 | %   from the 'errMC' layer. NET and INFO are the trainer's outputs.
  8 | %
  9 | %   When the 'network' option is empty the model at 'modelPath' is loaded
 10 | %   and adapted with 'networkFn'; otherwise 'network' must be a
 11 | %   dagnn.DagNN object and is used as is.
 12 | 
 13 | 
 14 | run(fullfile(fileparts(mfilename('fullpath')), ...
 15 |   '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
 16 | 
 17 | run(fullfile(fileparts(mfilename('fullpath')), ...
 18 |   '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ;
 19 | 
 20 | run(fullfile(fileparts(mfilename('fullpath')), ...
 21 |   '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ;
 22 | 
 23 | addpath Layers Datasets
 24 | 
 25 | opts.dataDir = fullfile('data','UCF101') ;
 26 | opts.expDir  = fullfile('exp', 'UCF101') ;
 27 | opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat');
 28 | opts.datasetFn = @cnn_ucf101_setup_data ;
 29 | opts.networkFn = @cnn_init_resnext ;
 30 | opts.network = [] ;
 31 | 
 32 | [opts, varargin] = vl_argparse(opts, varargin) ;
 33 | 
 34 | opts.numFetchThreads = 8 ;
 35 | 
 36 | opts.lite = false ;
 37 | opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat');
 38 | opts.pool1Layer = 'conv0'; % before conv1
 39 | opts.pool1Type = 'arpool'; 
 40 | opts.pool2Layer = 'pool5'; 
 41 | opts.pool2Type = 'maxpool'; 
 42 | opts.DropOutRate = 0.5 ;
 43 | opts.epochFactor = 5 ;
 44 | 
 45 | opts.split = 1; % data split
 46 | opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
 47 | opts.train = struct() ;
 48 | opts.train.gpus = [];
 49 | opts.train.batchSize = 128 ;
 50 | opts.train.numSubBatches = 16 ;
 51 | opts.train.solver = [] ;
 52 | opts.train.prefetch = true ;
 53 | opts.train.numEpochs = 30 ;
 54 | opts.train.randomSeed = 0 ;
 55 | % resnet50
 56 | % opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
 57 | % caffe-ref
 58 | opts.train.learningRate = 1e-3 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
 59 | 
 60 | opts = vl_argparse(opts, varargin) ;
 61 | if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end
 62 | % opts.train.numEpochs = numel(opts.train.learningRate);
 63 | 
 64 | % -------------------------------------------------------------------------
 65 | %                                                              Prepare data
 66 | % -------------------------------------------------------------------------
 67 | 
 68 | % reuse the cached imdb when present, otherwise build it and cache it
 69 | if exist(opts.imdbPath,'file')
 70 |   imdb = load(opts.imdbPath) ;
 71 | else
 72 |   imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
 73 |   mkdir(opts.expDir) ;
 74 |   save(opts.imdbPath, '-struct', 'imdb') ;
 75 | end
 76 | 
 77 | % UCF101 has 3 data splits
 78 | if opts.split>3
 79 |   error('split should be <=3');
 80 | end
 81 | imdb.images.set = imdb.images.sets(opts.split,:);
 82 | 
 83 | % reverse frame order
 84 | if opts.reverseDyn
 85 |   for i=1:numel(imdb.images.names)
 86 |     imdb.images.names{i} = imdb.images.names{i}(end:-1:1);
 87 |   end
 88 | end
 89 | 
 90 | % -------------------------------------------------------------------------
 91 | %                                                             Prepare model
 92 | % -------------------------------------------------------------------------
 93 | if isempty(opts.network)
 94 |   net = load(opts.modelPath);
 95 |   % the model file may wrap the network in a 'net' field
 96 |   if isfield(net,'net')
 97 |     net = net.net;
 98 |   end
 99 |   opts.nCls = max(imdb.images.label) ;
100 |   net = opts.networkFn(net,opts);
101 | 
102 |   % collapse a full-size mean image to one mean value per channel
103 |   if numel(net.meta.normalization.averageImage)>3
104 |     sz = size(net.meta.normalization.averageImage) ;
105 |     net.meta.normalization.averageImage = ...
106 |       mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ;
107 |   end
108 | 
109 |   % Set the class names in the network
110 |   net.meta.classes.name = imdb.classes.name ;
111 |   net.meta.classes.description = imdb.classes.name ;
112 | else
113 |   assert(isa(opts.network,'dagnn.DagNN')) ;
114 |   net = opts.network ;
115 | end
116 | % -------------------------------------------------------------------------
117 | %                                                                     Learn
118 | % -------------------------------------------------------------------------
119 | % the training set is repeated epochFactor times per epoch; a
120 | % non-positive epochFactor disables the training set and runs one epoch
121 | if opts.epochFactor>0
122 |   opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
123 | else
124 |   opts.train.train = NaN ;
125 |   opts.train.numEpochs = 1 ;
126 | end
127 | opts.train.val = find(imdb.images.set==3) ;
128 | 
129 | [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
130 |   'expDir', opts.expDir, ...
131 |   opts.train) ;
132 | 
133 | % -------------------------------------------------------------------------
134 | %                                                          Report accuracy
135 | % -------------------------------------------------------------------------
136 | errlayer = net.getLayerIndex('errMC') ;
137 | 
138 | if ~isnan(errlayer)
139 |   cats = imdb.classes.name ;
140 |   accs = net.layers(errlayer).block.accuracy ; 
141 |   
142 |   if numel(cats)~=numel(accs)
143 |     % no escape sequence: error() with one argument prints it literally
144 |     error('wrong number of classes') ;
145 |   end
146 |   
147 |   for i=1:numel(cats)
148 |     fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
149 |   end
150 |   fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
151 | end
138 | 
139 | % -------------------------------------------------------------------------
function fn = getBatchFn(opts, meta)
% -------------------------------------------------------------------------
%GETBATCHFN  Build the batch-loading callback handed to the trainer.
%   FN = GETBATCHFN(OPTS, META) collects the image pre-processing settings
%   stored in META.normalization into a structure and returns a handle
%   FN(IMDB, BATCH) that forwards to getDagNNBatch with those settings.
%   The GPU flag is set when OPTS.train.gpus is non-empty.

nrm = meta.normalization ;
useGpu = ~isempty(opts.train.gpus) ;

bopts.numThreads = opts.numFetchThreads ;
bopts.imageSize = nrm.imageSize ;

% Use the stored border when there is one; otherwise derive it from the
% image size and the crop ratio.
if ~isfield(nrm, 'border')
  bopts.border = nrm.imageSize(1:2) ./ nrm.cropSize - nrm.imageSize(1:2) ;
else
  bopts.border = nrm.border ;
end

bopts.averageImage = nrm.averageImage ;
bopts.interpolation = nrm.interpolation ;
bopts.keepAspect = nrm.keepAspect ;
% Augmentation settings are currently not forwarded:
% bopts.rgbVariance = meta.augmentation.rgbVariance ;
% bopts.transformation = meta.augmentation.transformation ;

fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;
163 | 
164 | 
165 | 
166 | % -------------------------------------------------------------------------
function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
% -------------------------------------------------------------------------
%GETDAGNNBATCH  Assemble the DagNN inputs for a batch of videos.
%   INPUTS = GETDAGNNBATCH(OPTS, USEGPU, IMDB, BATCH) returns the cell
%   array {'input', IM, 'label', L, 'VideoId1', V1, 'VideoId2', V2}.
%
%   BATCH indexes videos (not frames). IMDB.images.names{k} lists the
%   frame files of video k. Each video is cut into clips of NFRAMES
%   consecutive frames, with clip starts STEPSIZE frames apart; clips
%   reaching past the end of the video are padded by repeating the last
%   frame.
%
%   V1 has one entry per frame: the batch-wide index of the clip the frame
%   belongs to. V2 has one entry per clip: the position within BATCH of
%   the video the clip comes from.
%
%   Whether the batch is a training batch is decided from the set of its
%   first video (set == 1 means training). Training batches use the
%   'multiScaleRegular' transformation and randomly sub-sample the clips;
%   other batches use no transformation and keep up to 2*NDYNIMGS clips.
%
%   When called without an output argument the frames are only
%   prefetched.

% batch refers to videos (not to frames)
if isempty(batch)
  inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};
  return;
end

isVal = imdb.images.set(batch(1)) ~= 1 ;

if isVal
  transformation = 'none' ;
else
  transformation = 'multiScaleRegular' ;
end

names = imdb.images.names(batch);
nVids = numel(batch);

% distance (in frames) between the starts of consecutive clips
stepSize = 6;
% number of frames pooled into one dynamic image
nFrames = 10;
% number of dynamic images to be max pooled later
nDynImgs = 10;

namesM = {};
VideoId1 = [];
VideoId2 = [];

c1 = 1;  % batch-wide clip counter
for v=1:nVids

  name = names{v};
  nFrms = numel(name);

  if nFrms == 0
    % Without frames no clip can be built, and the label returned for
    % this video would have no matching entry in VideoId2.
    error('getDagNNBatch:emptyVideo', ...
      'Video %d of the batch (imdb index %d) has no frames.', v, batch(v)) ;
  end

  if isVal
    startF = 1 ;
  else
    % Clamp the start so that videos shorter than the training offset
    % still contribute one clip instead of silently vanishing.
    startF = min(ceil(stepSize/2), nFrms) ;
  end

  starts = startF:stepSize:nFrms ;
  nr = numel(starts);

  % Training: keep half of the clips, at most nDynImgs.
  % Otherwise: keep everything, capped at 2*nDynImgs clips.
  if ~isVal && nr > nDynImgs
    nKeep = min(nDynImgs, ceil(0.50*nr));
  elseif nr > 2*nDynImgs
    nKeep = 2*nDynImgs;
  else
    nKeep = nr;
  end

  if nKeep < nr
    % random subset, kept in temporal order
    ri = randperm(nr);
    keep = false(1,nr);
    keep(ri(1:nKeep)) = true;
    starts = starts(keep);
  end

  for f = starts
    idx = f:min(f+nFrames-1,nFrms) ;
    if numel(idx)<nFrames
      % pad short clips by repeating the last frame
      idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
    end
    namesM{end+1} = name(idx);
    VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
    c1 = c1 + 1;
  end
  VideoId2 = [VideoId2 v * ones(1,numel(starts)) ] ;
end

images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;

im = cnn_video_rgb_get_batch(images, VideoId1, opts, ...
  'transformation', transformation, 'prefetch', nargout == 0) ;

if nargout > 0
  if useGpu
    im = gpuArray(im) ;
  end
  inputs = {'input', im, 'label', imdb.images.label(batch), ...
    'VideoId1', VideoId1, 'VideoId2', VideoId2};
end
267 | 


--------------------------------------------------------------------------------
/dicnn/cnn_train_dicnn_dag.m:
--------------------------------------------------------------------------------
  1 | function [net,stats] = cnn_train_dicnn_dag(net, imdb, getBatch, varargin)
  2 | %CNN_DICNN_TRAIN_DAG Demonstrates training a CNN using the DagNN wrapper
  3 | %    CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with
  4 | %    the DagNN wrapper instead of the SimpleNN wrapper.
  5 | 
  6 | % Copyright (C) 2014-16 Andrea Vedaldi.
  7 | % All rights reserved.
  8 | %
  9 | % This file is part of the VLFeat library and is made available under
 10 | % the terms of the BSD license (see the COPYING file).
 11 | addpath(fullfile(vl_rootnn, 'examples'));
 12 | 
 13 | opts.expDir = fullfile('data','exp') ;
 14 | opts.continue = true ;
 15 | opts.batchSize = 256 ;
 16 | opts.numSubBatches = 1 ;
 17 | opts.train = [] ;
 18 | opts.val = [] ;
 19 | opts.gpus = [] ;
 20 | opts.prefetch = false ;
 21 | opts.epochSize = inf;
 22 | opts.numEpochs = 300 ;
 23 | opts.learningRate = 0.001 ;
 24 | opts.weightDecay = 0.0005 ;
 25 | 
 26 | opts.solver = [] ;  % Empty array means use the default SGD solver
 27 | [opts, varargin] = vl_argparse(opts, varargin) ;
 28 | if ~isempty(opts.solver)
 29 |   assert(isa(opts.solver, 'function_handle') && nargout(opts.solver) == 2,...
 30 |     'Invalid solver; expected a function handle with two outputs.') ;
 31 |   % Call without input arguments, to get default options
 32 |   opts.solverOpts = opts.solver() ;
 33 | end
 34 | 
 35 | opts.momentum = 0.9 ;
 36 | opts.saveSolverState = true ;
 37 | opts.nesterovUpdate = false ;
 38 | opts.randomSeed = 0 ;
 39 | opts.profile = false ;
 40 | opts.parameterServer.method = 'mmap' ;
 41 | opts.parameterServer.prefix = 'mcn' ;
 42 | 
 43 | opts.derOutputs = {'objective', 1} ;
 44 | opts.extractStatsFn = @extractStats ;
 45 | opts.plotStatistics = true;
 46 | opts.postEpochFn = [] ;  % postEpochFn(net,params,state) called after each epoch; can return a new learning rate, 0 to stop, [] for no change
 47 | opts = vl_argparse(opts, varargin) ;
 48 | 
 49 | if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end
 50 | if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end
 51 | if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end
 52 | if isscalar(opts.train) && isnumeric(opts.train) && isnan(opts.train)
 53 |   opts.train = [] ;
 54 | end
 55 | if isscalar(opts.val) && isnumeric(opts.val) && isnan(opts.val)
 56 |   opts.val = [] ;
 57 | end
 58 | 
 59 | % -------------------------------------------------------------------------
 60 | %                                                            Initialization
 61 | % -------------------------------------------------------------------------
 62 | 
 63 | evaluateMode = isempty(opts.train) ;
 64 | if ~evaluateMode
 65 |   if isempty(opts.derOutputs)
 66 |     error('DEROUTPUTS must be specified when training.\n') ;
 67 |   end
 68 | end
 69 | 
 70 | % -------------------------------------------------------------------------
 71 | %                                                        Train and validate
 72 | % -------------------------------------------------------------------------
 73 | 
 74 | modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep));
 75 | modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ;
 76 | 
 77 | start = opts.continue * findLastCheckpoint(opts.expDir) ;
 78 | if start >= 1
 79 |   fprintf('%s: resuming by loading epoch %d\n', mfilename, start) ;
 80 |   [net, state, stats] = loadState(modelPath(start)) ;
 81 | else
 82 |   state = [] ;
 83 | end
 84 | 
 85 | for epoch=start+1:opts.numEpochs
 86 | 
 87 |   % Set the random seed based on the epoch and opts.randomSeed.
 88 |   % This is important for reproducibility, including when training
 89 |   % is restarted from a checkpoint.
 90 | 
 91 |   rng(epoch + opts.randomSeed) ;
 92 |   prepareGPUs(opts, epoch == start+1) ;
 93 | 
 94 |   % Train for one epoch.
 95 |   params = opts ;
 96 |   params.epoch = epoch ;
 97 |   params.learningRate = opts.learningRate(min(epoch, numel(opts.learningRate))) ;
 98 |   params.train = opts.train(randperm(numel(opts.train))) ; % shuffle
 99 |   params.train = params.train(1:min(opts.epochSize, numel(opts.train)));
100 |   params.val = opts.val(randperm(numel(opts.val))) ;
101 |   params.imdb = imdb ;
102 |   params.getBatch = getBatch ;
103 | 
104 |   if numel(opts.gpus) <= 1
105 |     [net, state] = processEpoch(net, state, params, 'train') ;
106 |     [net, state] = processEpoch(net, state, params, 'val') ;
107 |     if ~evaluateMode
108 |       saveState(modelPath(epoch), net, state) ;
109 |     end
110 |     lastStats = state.stats ;
111 |   else
112 |     spmd
113 |       [net, state] = processEpoch(net, state, params, 'train') ;
114 |       [net, state] = processEpoch(net, state, params, 'val') ;
115 |       if labindex == 1 && ~evaluateMode
116 |         saveState(modelPath(epoch), net, state) ;
117 |       end
118 |       lastStats = state.stats ;
119 |     end
120 |     lastStats = accumulateStats(lastStats) ;
121 |   end
122 | 
123 |   stats.train(epoch) = lastStats.train ;
124 |   stats.val(epoch) = lastStats.val ;
125 |   clear lastStats ;
126 |   saveStats(modelPath(epoch), stats) ;
127 | 
128 |   if opts.plotStatistics
129 |     switchFigure(1) ; clf ;
130 |     plots = setdiff(...
131 |       cat(2,...
132 |       fieldnames(stats.train)', ...
133 |       fieldnames(stats.val)'), {'num', 'time'}) ;
134 |     for p = plots
135 |       p = char(p) ;
136 |       values = zeros(0, epoch) ;
137 |       leg = {} ;
138 |       for f = {'train', 'val'}
139 |         f = char(f) ;
140 |         if isfield(stats.(f), p)
141 |           tmp = [stats.(f).(p)] ;
142 |           values(end+1,:) = tmp(1,:)' ;
143 |           leg{end+1} = f ;
144 |         end
145 |       end
146 |       subplot(1,numel(plots),find(strcmp(p,plots))) ;
147 |       plot(1:epoch, values','o-') ;
148 |       xlabel('epoch') ;
149 |       title(p) ;
150 |       legend(leg{:}) ;
151 |       grid on ;
152 |     end
153 |     drawnow ;
154 |     print(1, modelFigPath, '-dpdf') ;
155 |   end
156 |   
157 |   if ~isempty(opts.postEpochFn)
158 |     if nargout(opts.postEpochFn) == 0
159 |       opts.postEpochFn(net, params, state) ;
160 |     else
161 |       lr = opts.postEpochFn(net, params, state) ;
162 |       if ~isempty(lr), opts.learningRate = lr; end
163 |       if opts.learningRate == 0, break; end
164 |     end
165 |   end
166 | end
167 | 
168 | % With multiple GPUs, return one copy
169 | if isa(net, 'Composite'), net = net{1} ; end
170 | 
171 | % -------------------------------------------------------------------------
function [net, state] = processEpoch(net, state, params, mode)
% -------------------------------------------------------------------------
%PROCESSEPOCH  Run one pass over the 'train' or 'val' subset.
%   [NET, STATE] = PROCESSEPOCH(NET, STATE, PARAMS, MODE) iterates over
%   PARAMS.(MODE) in chunks of PARAMS.batchSize. In 'train' mode the
%   network is evaluated with derivatives and the parameters are updated
%   after every batch; in any other mode the network is evaluated in
%   'test' mode only. The statistics of the pass are stored in
%   STATE.stats.(MODE), and profiler output in STATE.prof.(MODE) when
%   PARAMS.profile is set.
%
% Note that net is not strictly needed as an output argument as net
% is a handle class. However, this fixes some aliasing issue in the
% spmd caller.

% initialize with momentum 0
if isempty(state) || isempty(state.solverState)
  state.solverState = cell(1, numel(net.params)) ;
  state.solverState(:) = {0} ;
end

% move CNN and solver state to GPU as needed
numGpus = numel(params.gpus) ;
if numGpus >= 1
  net.move('gpu') ;
  for i = 1:numel(state.solverState)
    s = state.solverState{i} ;
    if isnumeric(s)
      state.solverState{i} = gpuArray(s) ;
    elseif isstruct(s)
      state.solverState{i} = structfun(@gpuArray, s, 'UniformOutput', false) ;
    end
  end
end
% a parameter server is only used with more than one GPU
if numGpus > 1
  parserv = ParameterServer(params.parameterServer) ;
  net.setParameterServer(parserv) ;
else
  parserv = [] ;
end

% profile
if params.profile
  if numGpus <= 1
    profile clear ;
    profile on ;
  else
    mpiprofile reset ;
    mpiprofile on ;
  end
end

num = 0 ;
epoch = params.epoch ;
subset = params.(mode) ;
adjustTime = 0 ;

stats.num = 0 ; % return something even if subset = []
stats.time = 0 ;

start = tic ;
for t=1:params.batchSize:numel(subset)
  fprintf('%s: epoch %02d: %3d/%3d:', mode, epoch, ...
          fix((t-1)/params.batchSize)+1, ceil(numel(subset)/params.batchSize)) ;
  batchSize = min(params.batchSize, numel(subset) - t + 1) ;

  for s=1:params.numSubBatches
    % get this image batch and prefetch the next
    % (the samples of a batch are interleaved across workers and
    % sub-batches: each takes every (numSubBatches*numlabs)-th sample)
    batchStart = t + (labindex-1) + (s-1) * numlabs ;
    batchEnd = min(t+params.batchSize-1, numel(subset)) ;
    batch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ;
    num = num + numel(batch) ;
    if numel(batch) == 0, continue ; end

    inputs = params.getBatch(params.imdb, batch) ;

    if params.prefetch
      if s == params.numSubBatches
        % last sub-batch: prefetch the first sub-batch of the next batch
        batchStart = t + (labindex-1) + params.batchSize ;
        batchEnd = min(t+2*params.batchSize-1, numel(subset)) ;
      else
        batchStart = batchStart + numlabs ;
      end
      nextBatch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ;
      % called without output arguments, i.e. only to trigger prefetching
      params.getBatch(params.imdb, nextBatch) ;
    end

    if strcmp(mode, 'train')
      net.mode = 'normal' ;
      % derivatives are accumulated over all sub-batches but the first
      net.accumulateParamDers = (s ~= 1) ;
      net.eval(inputs, params.derOutputs, 'holdOn', s < params.numSubBatches) ;
    else
      net.mode = 'test' ;
      net.eval(inputs) ;
    end
  end

  % Accumulate gradient.
  if strcmp(mode, 'train')
    if ~isempty(parserv), parserv.sync() ; end
    state = accumulateGradients(net, state, params, parserv) ;
  end

  % Get statistics.
  time = toc(start) + adjustTime ;
  batchTime = time - stats.time ;
  stats.num = num ;
  stats.time = time ;
  stats = params.extractStatsFn(stats,net) ;
  currentSpeed = batchSize / batchTime ;
  averageSpeed = (t + batchSize - 1) / time ;
  if t == 3*params.batchSize + 1
    % compensate for the first three iterations, which are outliers
    % (on the fourth batch the elapsed time is reset to four times the
    % duration of that batch)
    adjustTime = 4*batchTime - time ;
    stats.time = time + adjustTime ;
  end

  fprintf(' %.1f (%.1f) Hz', averageSpeed, currentSpeed) ;
  for f = setdiff(fieldnames(stats)', {'num', 'time'})
    f = char(f) ;
    fprintf(' %s: %.3f', f, stats.(f)) ;
  end
  fprintf('\n') ;
end

% Save back to state.
state.stats.(mode) = stats ;
if params.profile
  if numGpus <= 1
    state.prof.(mode) = profile('info') ;
    profile off ;
  else
    state.prof.(mode) = mpiprofile('info');
    mpiprofile off ;
  end
end
if ~params.saveSolverState
  state.solverState = [] ;
else
  % bring the solver state back from the GPU so that it can be saved
  for i = 1:numel(state.solverState)
    s = state.solverState{i} ;
    if isnumeric(s)
      state.solverState{i} = gather(s) ;
    elseif isstruct(s)
      state.solverState{i} = structfun(@gather, s, 'UniformOutput', false) ;
    end
  end
end

net.reset() ;
net.move('cpu') ;
314 | 
315 | % -------------------------------------------------------------------------
function state = accumulateGradients(net, state, params, parserv)
% -------------------------------------------------------------------------
%ACCUMULATEGRADIENTS  Apply one parameter update from the current derivatives.
%   STATE = ACCUMULATEGRADIENTS(NET, STATE, PARAMS, PARSERV) updates the
%   value of every parameter of NET according to its training method:
%
%   'average'   the value is blended with the derivative using the
%               parameter's own learning rate (mainly for batch
%               normalization).
%   'gradient'  weight decay is added to the normalized derivative, then
%               either the built-in SGD with momentum (optionally
%               Nesterov) or the custom PARAMS.solver is applied.
%
%   The derivative is pulled from PARSERV when one is given and read from
%   the network otherwise. STATE.solverState{p} holds the solver state of
%   parameter p. An unknown training method raises an error.
%
%   NOTE(review): the comments below assume VL_TACCUM(a, A, b, B) returns
%   a*A + b*B - confirm against the MatConvNet documentation.
numGpus = numel(params.gpus) ;

% Derivatives are summed over sub-batches and GPUs; DEN normalizes them.
den = params.numSubBatches * max(numGpus,1) ;

for p=1:numel(net.params)

  if ~isempty(parserv)
    parDer = parserv.pullWithIndex(p) ;
  else
    parDer = net.params(p).der ;
  end

  switch net.params(p).trainMethod

    case 'average' % mainly for batch normalization
      thisLR = net.params(p).learningRate ;
      net.params(p).value = vl_taccum(...
          1 - thisLR, net.params(p).value, ...
          (thisLR/den/net.params(p).fanout),  parDer) ;

    case 'gradient'
      thisDecay = params.weightDecay * net.params(p).weightDecay ;
      thisLR = params.learningRate * net.params(p).learningRate ;

      % Parameters with neither learning rate nor decay are left untouched.
      if thisLR>0 || thisDecay>0
        % Normalize gradient and incorporate weight decay.
        parDer = vl_taccum(1/den, parDer, ...
                           thisDecay, net.params(p).value) ;

        if isempty(params.solver)
          % Default solver is the optimised SGD.
          % Update momentum.
          state.solverState{p} = vl_taccum(...
            params.momentum, state.solverState{p}, ...
            -1, parDer) ;

          % Nesterov update (aka one step ahead).
          if params.nesterovUpdate
            delta = params.momentum * state.solverState{p} - parDer ;
          else
            delta = state.solverState{p} ;
          end

          % Update parameters.
          net.params(p).value = vl_taccum(...
            1,  net.params(p).value, thisLR, delta) ;

        else
          % call solver function to update weights
          [net.params(p).value, state.solverState{p}] = ...
            params.solver(net.params(p).value, state.solverState{p}, ...
            parDer, params.solverOpts, thisLR) ;
        end
      end
    otherwise
      error('Unknown training method ''%s'' for parameter ''%s''.', ...
        net.params(p).trainMethod, ...
        net.params(p).name) ;
  end
end
379 | 
380 | % -------------------------------------------------------------------------
function stats = accumulateStats(stats_)
% -------------------------------------------------------------------------
%ACCUMULATESTATS  Merge per-worker statistics into a single structure.
%   STATS_{g} holds the statistics of worker g, with one sub-structure
%   for 'train' and one for 'val'. For each of them the 'num' fields are
%   summed and every other field is averaged, weighted by the 'num' of
%   the worker it comes from.

nWorkers = numel(stats_) ;

for phase = {'train', 'val'}
  key = char(phase) ;

  % Start from a zeroed structure with the same fields, in the same
  % order, as the first worker.
  first = stats_{1} ;
  fieldList = fieldnames(first.(key))' ;
  zeroed = cat(1, fieldList, num2cell(zeros(1, numel(fieldList)))) ;
  stats.(key) = struct(zeroed{:}) ;

  total = 0 ;
  for w = 1:nWorkers
    worker = stats_{w} ;
    count = worker.(key).num ;
    total = total + count ;
    isLast = (w == nWorkers) ;

    for name = setdiff(fieldnames(worker.(key))', 'num')
      fld = char(name) ;
      acc = stats.(key).(fld) + worker.(key).(fld) * count ;
      if isLast
        % all workers are in: turn the weighted sum into an average
        acc = acc / total ;
      end
      stats.(key).(fld) = acc ;
    end
  end
  stats.(key).num = total ;
end
412 | 
413 | % -------------------------------------------------------------------------
function stats = extractStats(stats, net)
% -------------------------------------------------------------------------
%EXTRACTSTATS  Copy the running averages of the loss layers into STATS.
%   For every layer of NET whose block is a dagnn.Loss and does not set
%   ignoreAverage, the block's average is stored in STATS under the name
%   of the layer's first output.
for k = 1:numel(net.layers)
  blk = net.layers(k).block ;
  if ~isa(blk, 'dagnn.Loss'), continue ; end
  if blk.ignoreAverage, continue ; end
  stats.(net.layers(k).outputs{1}) = blk.average ;
end
421 | 
422 | % -------------------------------------------------------------------------
function saveState(fileName, net_, state)
% -------------------------------------------------------------------------
%SAVESTATE  Write a checkpoint holding the network and the training state.
%   The network is converted with its saveobj method first and stored in
%   FILENAME as variable 'net', next to 'state'.
net = saveobj(net_) ;
save(fileName, 'net', 'state') ;
427 | 
428 | % -------------------------------------------------------------------------
function saveStats(fileName, stats)
% -------------------------------------------------------------------------
%SAVESTATS  Store the training statistics in a checkpoint file.
%   STATS is appended to FILENAME as variable 'stats' when the file
%   already exists, so that its other variables are kept; otherwise a new
%   file is created.

% Restrict the lookup to files: a bare exist(name) also searches
% variables, functions and folders.
if exist(fileName, 'file') == 2
  save(fileName, 'stats', '-append') ;
else
  save(fileName, 'stats') ;
end
436 | 
437 | % -------------------------------------------------------------------------
function [net, state, stats] = loadState(fileName)
% -------------------------------------------------------------------------
%LOADSTATE  Read a checkpoint written by saveState and saveStats.
%   [NET, STATE, STATS] = LOADSTATE(FILENAME) returns the network rebuilt
%   with dagnn.DagNN.loadobj, the training state and the statistics.
%   An error is raised when the file lacks any of them; a missing 'stats'
%   means that the epoch was only partially saved.

% Load into a structure rather than into the workspace, so that missing
% variables can be detected before the network is rebuilt.
data = load(fileName, 'net', 'state', 'stats') ;
if ~isfield(data, 'stats')
  error('Epoch ''%s'' was only partially saved. Delete this file and try again.', ...
        fileName) ;
end
if ~isfield(data, 'net') || ~isfield(data, 'state')
  error('Checkpoint ''%s'' does not contain a network and its state.', ...
        fileName) ;
end
net = dagnn.DagNN.loadobj(data.net) ;
state = data.state ;
stats = data.stats ;
446 | 
447 | % -------------------------------------------------------------------------
function epoch = findLastCheckpoint(modelDir)
% -------------------------------------------------------------------------
%FINDLASTCHECKPOINT  Number of the latest epoch checkpoint in a folder.
%   EPOCH = FINDLASTCHECKPOINT(MODELDIR) returns the largest N for which
%   a file named 'net-epoch-N.mat' exists in MODELDIR, or 0 when there is
%   none. Files that merely resemble a checkpoint name (for example
%   'net-epoch-5-backup.mat') are ignored.
list = dir(fullfile(modelDir, 'net-epoch-*.mat')) ;
% The pattern is anchored and the dot escaped so that only exact
% checkpoint names match; non-matching names give an empty token list.
tokens = regexp({list.name}, '^net-epoch-(\d+)\.mat$', 'tokens', 'once') ;
tokens = tokens(~cellfun(@isempty, tokens)) ;
epoch = cellfun(@(x) sscanf(x{1}, '%d'), tokens) ;
epoch = max([epoch(:)' 0]) ;
454 | 
455 | % -------------------------------------------------------------------------
function switchFigure(n)
% -------------------------------------------------------------------------
%SWITCHFIGURE  Make figure N the current figure.
%   The current figure is changed by setting the root 'CurrentFigure'
%   property, and figure(N) is used as a fallback when that fails (for
%   example because figure N does not exist yet). Nothing is done when
%   figure N is already current.
current = get(0,'CurrentFigure') ;
% With no figure open CURRENT is empty and the comparison below would be
% empty as well, i.e. false; handle that case explicitly.
if isempty(current) || current ~= n
  try
    set(0,'CurrentFigure',n) ;
  catch
    figure(n) ;
  end
end
465 | 
466 | % -------------------------------------------------------------------------
function clearMex()
% -------------------------------------------------------------------------
%CLEARMEX  Clear the vl_tmove and vl_imreadjpeg functions from memory.
clear('vl_tmove', 'vl_imreadjpeg') ;
470 | 
471 | % -------------------------------------------------------------------------
function prepareGPUs(opts, cold)
% -------------------------------------------------------------------------
%PREPAREGPUS  Set up the parallel pool and select the GPU devices.
%   With more than one GPU in OPTS.gpus, a parallel pool with one worker
%   per GPU is ensured. The devices are (re)selected when COLD is true or
%   when a new pool had to be started.
nGpu = numel(opts.gpus) ;

if nGpu > 1
  % check parallel pool integrity as it could have timed out
  existing = gcp('nocreate') ;
  if ~isempty(existing) && existing.NumWorkers ~= nGpu
    delete(existing) ;
  end
  if isempty(gcp('nocreate'))
    parpool('local', nGpu) ;
    cold = true ;
  end
end

% Nothing to reset without a GPU or on a warm start.
if nGpu < 1 || ~cold
  return ;
end

fprintf('%s: resetting GPU\n', mfilename)
clearMex() ;
if nGpu == 1
  gpuDevice(opts.gpus)
else
  spmd
    clearMex() ;
    gpuDevice(opts.gpus(labindex))
  end
end
500 | 


--------------------------------------------------------------------------------